CactusGraph API Reference

Overview

CactusGraph is a low-level computation graph abstraction for building neural network operations. It supports operator fusion, mixed precision, and memory pooling.

CactusGraph Class

Constructor

CactusGraph();

Input Operations

input

Create an input node.
size_t input(
    const std::vector<size_t>& shape,
    Precision precision = Precision::INT8
);
shape
vector<size_t>
required
Tensor dimensions (e.g., {128, 512})
precision
Precision
default:"INT8"
Data type: INT8, FP16, or FP32 (the Precision enum also defines INT4; see set_grouped_scales for grouped INT4/INT8 usage)
return
size_t
Node ID for connecting to other operations

set_input

Populate input node with data.
void set_input(size_t node_id, const void* data, Precision precision);

set_external_input

Use external memory buffer (zero-copy).
void set_external_input(size_t node_id, void* data, Precision precision);

Arithmetic Operations

Element-wise Binary

size_t add(size_t input1, size_t input2);
size_t subtract(size_t input1, size_t input2);
size_t multiply(size_t input1, size_t input2);
size_t divide(size_t input1, size_t input2);

Element-wise Scalar

size_t scalar_add(size_t input, float value);
size_t scalar_multiply(size_t input, float value);
size_t scalar_exp(size_t input);
size_t scalar_sqrt(size_t input);
size_t scalar_log(size_t input);

Linear Algebra

matmul

Matrix multiplication.
size_t matmul(
    size_t input1,
    size_t input2,
    bool pretransposed_rhs = false,
    ComputeBackend backend = ComputeBackend::CPU
);
input1
size_t
required
Left matrix node ID
input2
size_t
required
Right matrix node ID
pretransposed_rhs
bool
default:"false"
Whether right matrix is already transposed
backend
ComputeBackend
default:"CPU"
CPU or NPU execution

transpose

size_t transpose(
    size_t input,
    ComputeBackend backend = ComputeBackend::CPU
);

Transformer Operations

rms_norm

Root mean square normalization.
size_t rms_norm(
    size_t input,
    size_t weight,
    float epsilon = 1e-5f
);

rope

Rotary position embedding.
size_t rope(
    size_t input,
    float theta,
    size_t position_offset = 0,
    ComputeBackend backend = ComputeBackend::CPU
);
theta
float
required
Rotation frequency base (typically 10000.0 or 1000000.0)
position_offset
size_t
default:"0"
Starting position for incremental decoding

attention

Scaled dot-product attention.
size_t attention(
    size_t query,
    size_t key,
    size_t value,
    float scale,
    bool is_causal = true,
    ComputeBackend backend = ComputeBackend::CPU
);
query
size_t
required
Query tensor node ID
key
size_t
required
Key tensor node ID
value
size_t
required
Value tensor node ID
scale
float
required
Attention scale factor (typically 1/sqrt(head_dim))
is_causal
bool
default:"true"
Apply causal mask for autoregressive generation

attention_int8_hybrid

Hybrid attention with INT8 cached KV.
size_t attention_int8_hybrid(
    size_t query,
    size_t key_new,
    size_t value_new,
    float scale,
    size_t position_offset,
    const int8_t* cached_keys,
    const int8_t* cached_values,
    const float* k_scales,
    const float* v_scales,
    size_t cache_len,
    size_t num_kv_heads,
    size_t head_dim,
    size_t window_size = 0
);

Activation Functions

size_t relu(size_t input);
size_t silu(size_t input);
size_t gelu(size_t input);
size_t sigmoid(size_t input);
size_t tanh(size_t input);
size_t glu(size_t input, int axis = -1);

Shape Operations

reshape

size_t reshape(size_t input, const std::vector<size_t>& new_shape);

slice

size_t slice(size_t input, int axis, size_t start, size_t length);

concat

size_t concat(size_t input1, size_t input2, int axis = 0);

Reduction Operations

size_t sum(size_t input, int axis);
size_t mean(size_t input, int axis);
size_t max(size_t input, int axis);

Weight Loading

mmap_weights

Memory-map weight file.
size_t mmap_weights(const std::string& filename);

set_grouped_scales

Attach quantization scales for grouped INT8/INT4.
void set_grouped_scales(
    size_t node_id,
    size_t group_size,
    size_t num_groups,
    void* scales_ptr
);

Execution

execute

Run computation graph.
void execute(const std::string& profile_file = "");

get_output

Retrieve node output.
void* get_output(size_t node_id);

soft_reset

Clear activations, keep weights.
void soft_reset();

hard_reset

Clear all memory.
void hard_reset();

Enums

enum class Precision {
    INT8,
    FP16,
    FP32,
    INT4
};

enum class ComputeBackend {
    CPU,
    NPU
};

Example: Matrix Multiplication

#include "cactus/graph/graph.h"

CactusGraph graph;

// Create inputs
auto a = graph.input({128, 512}, Precision::FP16);
auto b = graph.input({512, 256}, Precision::FP16);

// Matmul operation
auto c = graph.matmul(a, b, false, ComputeBackend::CPU);

// Set input data
std::vector<__fp16> a_data(128 * 512);
std::vector<__fp16> b_data(512 * 256);
graph.set_input(a, a_data.data(), Precision::FP16);
graph.set_input(b, b_data.data(), Precision::FP16);

// Execute
graph.execute();

// Get result
auto* result = static_cast<__fp16*>(graph.get_output(c));

See Also

Model API

High-level model interface

Advanced Guide

Graph optimization techniques