CactusGraph API Reference

Overview

CactusGraph is a low-level computation graph abstraction for building neural network operations. It supports operator fusion, mixed precision, and memory pooling.

CactusGraph Class

Constructor

CactusGraph();

Input Operations

input

Create an input node.
size_t input(
    const std::vector<size_t>& shape,
    Precision precision = Precision::INT8
);
shape
vector<size_t>
required
Tensor dimensions (e.g., {128, 512})
precision
Precision
default:"INT8"
Data type: INT8, FP16, or FP32 (the Precision enum also defines INT4; see set_grouped_scales for grouped INT4/INT8 usage)
return
size_t
Node ID for connecting to other operations

set_input

Populate input node with data.
void set_input(size_t node_id, const void* data, Precision precision);

set_external_input

Use external memory buffer (zero-copy).
void set_external_input(size_t node_id, void* data, Precision precision);

Arithmetic Operations

Element-wise Binary

size_t add(size_t input1, size_t input2);
size_t subtract(size_t input1, size_t input2);
size_t multiply(size_t input1, size_t input2);
size_t divide(size_t input1, size_t input2);

Element-wise Scalar

size_t scalar_add(size_t input, float value);
size_t scalar_multiply(size_t input, float value);
size_t scalar_exp(size_t input);
size_t scalar_sqrt(size_t input);
size_t scalar_log(size_t input);

Linear Algebra

matmul

Matrix multiplication.
size_t matmul(
    size_t input1,
    size_t input2,
    bool pretransposed_rhs = false,
    ComputeBackend backend = ComputeBackend::CPU
);
input1
size_t
required
Left matrix node ID
input2
size_t
required
Right matrix node ID
pretransposed_rhs
bool
default:"false"
Whether right matrix is already transposed
backend
ComputeBackend
default:"CPU"
CPU or NPU execution

transpose

size_t transpose(
    size_t input,
    ComputeBackend backend = ComputeBackend::CPU
);

Transformer Operations

rms_norm

Root mean square normalization.
size_t rms_norm(
    size_t input,
    size_t weight,
    float epsilon = 1e-5f
);

rope

Rotary position embedding.
size_t rope(
    size_t input,
    float theta,
    size_t position_offset = 0,
    ComputeBackend backend = ComputeBackend::CPU
);
theta
float
required
Rotation frequency base (typically 10000.0 or 1000000.0)
position_offset
size_t
default:"0"
Starting position for incremental decoding

attention

Scaled dot-product attention.
size_t attention(
    size_t query,
    size_t key,
    size_t value,
    float scale,
    bool is_causal = true,
    ComputeBackend backend = ComputeBackend::CPU
);
query
size_t
required
Query tensor node ID
key
size_t
required
Key tensor node ID
value
size_t
required
Value tensor node ID
scale
float
required
Attention scale factor (typically 1/sqrt(head_dim))
is_causal
bool
default:"true"
Apply causal mask for autoregressive generation

attention_int8_hybrid

Hybrid attention with INT8 cached KV.
size_t attention_int8_hybrid(
    size_t query,
    size_t key_new,
    size_t value_new,
    float scale,
    size_t position_offset,
    const int8_t* cached_keys,
    const int8_t* cached_values,
    const float* k_scales,
    const float* v_scales,
    size_t cache_len,
    size_t num_kv_heads,
    size_t head_dim,
    size_t window_size = 0
);

Activation Functions

size_t relu(size_t input);
size_t silu(size_t input);
size_t gelu(size_t input);
size_t sigmoid(size_t input);
size_t tanh(size_t input);
size_t glu(size_t input, int axis = -1);

Shape Operations

reshape

size_t reshape(size_t input, const std::vector<size_t>& new_shape);

slice

size_t slice(size_t input, int axis, size_t start, size_t length);

concat

size_t concat(size_t input1, size_t input2, int axis = 0);

Reduction Operations

size_t sum(size_t input, int axis);
size_t mean(size_t input, int axis);
size_t max(size_t input, int axis);

Weight Loading

mmap_weights

Memory-map weight file.
size_t mmap_weights(const std::string& filename);

set_grouped_scales

Attach quantization scales for grouped INT8/INT4.
void set_grouped_scales(
    size_t node_id,
    size_t group_size,
    size_t num_groups,
    void* scales_ptr
);

Execution

execute

Run computation graph.
void execute(const std::string& profile_file = "");

get_output

Retrieve node output.
void* get_output(size_t node_id);

soft_reset

Clear activations, keep weights.
void soft_reset();

hard_reset

Clear all memory.
void hard_reset();

Enums

enum class Precision {
    INT8,
    FP16,
    FP32,
    INT4
};

enum class ComputeBackend {
    CPU,
    NPU
};

Example: Matrix Multiplication

#include "cactus/graph/graph.h"

CactusGraph graph;

// Create inputs
auto a = graph.input({128, 512}, Precision::FP16);
auto b = graph.input({512, 256}, Precision::FP16);

// Matmul operation
auto c = graph.matmul(a, b, false, ComputeBackend::CPU);

// Set input data
std::vector<__fp16> a_data(128 * 512);
std::vector<__fp16> b_data(512 * 256);
graph.set_input(a, a_data.data(), Precision::FP16);
graph.set_input(b, b_data.data(), Precision::FP16);

// Execute
graph.execute();

// Get result
auto* result = static_cast<__fp16*>(graph.get_output(c));

See Also

Model API

High-level model interface

Advanced Guide

Graph optimization techniques