Overview

The Model class is the central abstraction for inference. All model types (LLM, VLM, ASR, VAD) inherit from the base Model class.

Model Class

Constructor

Model();
explicit Model(const Config& config);
Create a new model instance with optional configuration.

init

Load model weights and configuration from a folder on disk and prepare the model for inference.
bool init(
    const std::string& model_folder,
    size_t context_size,
    const std::string& system_prompt = "",
    bool do_warmup = true
);
model_folder (string, required): Path to the directory containing model weights and config
context_size (size_t, required): Maximum context window in tokens (e.g., 2048)
system_prompt (string, default ""): System message to prepend to conversations
do_warmup (bool, default true): Run a warmup inference to initialize caches
Returns bool: true on success, false on error
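
A minimal loading sketch, assuming the create_model factory shown in the Example Usage section below; the model path is a placeholder:

#include <cstdio>
#include <string>
#include "cactus/engine/engine.h"

using namespace cactus::engine;

int main() {
    // Placeholder path; point this at a folder containing weights and config.json.
    const std::string model_dir = "/models/qwen3-0.6b";

    auto model = create_model(model_dir);
    if (!model->init(model_dir, /*context_size=*/4096,
                     "You are a helpful assistant.",
                     /*do_warmup=*/true)) {
        // init returns false on error (missing files, bad config, ...)
        std::fprintf(stderr, "failed to initialize model from %s\n", model_dir.c_str());
        return 1;
    }
    return 0;
}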

decode

Generate the next token from an input token sequence.
uint32_t decode(
    const std::vector<uint32_t>& tokens,
    float temperature = -1.0f,
    float top_p = -1.0f,
    size_t top_k = 0,
    const std::string& profile_file = "",
    float* out_entropy = nullptr
);
tokens (vector<uint32_t>, required): Input token sequence
temperature (float, default -1.0): Sampling temperature; -1 uses the model default
top_p (float, default -1.0): Nucleus sampling threshold; -1 uses the model default
top_k (size_t, default 0): Top-k sampling limit; 0 uses the model default
profile_file (string, default ""): Path to save performance profiling data
out_entropy (float*, default nullptr): Output parameter for token entropy (confidence)
Returns uint32_t: Next token ID
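
A hedged sketch of a single decode step that overrides the sampling defaults and reads back the entropy; the model and tokens are assumed to come from a prior init and encode:

#include <cstdint>
#include <vector>
#include "cactus/engine/engine.h"

// Sketch: one decode step with explicit sampling settings and an entropy readout.
uint32_t sample_next(cactus::engine::Model& model,
                     const std::vector<uint32_t>& tokens) {
    float entropy = 0.0f;
    uint32_t next = model.decode(tokens,
                                 /*temperature=*/0.7f,
                                 /*top_p=*/0.95f,
                                 /*top_k=*/20,
                                 /*profile_file=*/"",
                                 &entropy);
    // Lower entropy means a more peaked distribution, i.e. the model was
    // more confident in the sampled token.
    return next;
}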

prefill

Process prompt tokens into the KV cache without generating output.
void prefill(
    const std::vector<uint32_t>& tokens,
    size_t chunk_size = 256,
    const std::string& profile_file = ""
);
tokens (vector<uint32_t>, required): Tokens to process into the KV cache
chunk_size (size_t, default 256): Batch size for chunked prefill
profile_file (string, default ""): Path to save performance profiling data
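
A rough sketch of prefilling a long prompt before generation; the model is assumed to be initialized and the prompt already encoded:

#include <cstdint>
#include <vector>
#include "cactus/engine/engine.h"

// Sketch: push a long prompt into the KV cache in batches before decoding.
void warm_cache(cactus::engine::Model& model,
                const std::vector<uint32_t>& prompt_tokens) {
    // 256 is the default chunk size; larger chunks mean fewer forward passes
    // at the cost of higher peak memory per pass.
    model.prefill(prompt_tokens, /*chunk_size=*/256);
}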

get_embeddings

Extract text embeddings for a token sequence.
std::vector<float> get_embeddings(
    const std::vector<uint32_t>& tokens,
    bool pooled = true,
    bool normalize = false,
    const std::string& profile_file = ""
);
tokens (vector<uint32_t>, required): Input token sequence
pooled (bool, default true): Return a mean-pooled embedding (a single vector)
normalize (bool, default false): L2-normalize the output vectors
profile_file (string, default ""): Path to save performance profiling data
Returns vector<float>: Embedding vector(s)
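
A sketch of using pooled, normalized embeddings for similarity; with pooled and normalize both true, each call returns a single unit-length vector, so a dot product equals cosine similarity. An embedding-capable model (e.g., the NOMIC type listed under Key Enums below) is assumed:

#include <numeric>
#include <string>
#include <vector>
#include "cactus/engine/engine.h"

using namespace cactus::engine;

// Sketch: embed two texts and compare them with a dot product.
float similarity(Model& model, const std::string& a, const std::string& b) {
    auto tokenizer = model.get_tokenizer();
    auto ea = model.get_embeddings(tokenizer->encode(a), /*pooled=*/true, /*normalize=*/true);
    auto eb = model.get_embeddings(tokenizer->encode(b), /*pooled=*/true, /*normalize=*/true);
    // Both vectors are unit length, so this dot product is the cosine similarity.
    return std::inner_product(ea.begin(), ea.end(), eb.begin(), 0.0f);
}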

reset_cache

Clear the KV cache to start a new conversation.
void reset_cache();

set_cache_window

Configure sliding-window attention for the KV cache.
void set_cache_window(size_t window_size, size_t sink_size = 4);
window_size (size_t, required): Maximum number of tokens to keep in the cache
sink_size (size_t, default 4): Number of initial tokens to always preserve
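
A sketch of bounding cache growth for a long-running session, together with reset_cache for starting over; the window and sink sizes here are illustrative:

#include "cactus/engine/engine.h"

// Sketch: keep at most 1024 recent tokens plus the first 4 "sink" tokens
// (typically the start of the system prompt), evicting older entries.
void configure_long_session(cactus::engine::Model& model) {
    model.set_cache_window(/*window_size=*/1024, /*sink_size=*/4);
}

// Sketch: drop all cached state before starting a new conversation.
void new_conversation(cactus::engine::Model& model) {
    model.reset_cache();
}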

Config Struct

Model configuration loaded from config.json:
struct Config {
    uint32_t vocab_size = 151936;
    uint32_t num_layers = 28;
    uint32_t hidden_dim = 1024;
    uint32_t attention_heads = 16;
    uint32_t attention_kv_heads = 8;
    float layer_norm_eps = 1e-6f;
    float rope_theta = 1000000.0f;
    
    // Defaults for generation
    float default_temperature = 0.6f;
    float default_top_p = 0.95f;
    size_t default_top_k = 20;
    
    ModelType model_type = ModelType::QWEN;
    Backend default_backend = Backend::CPU;
    Precision precision = Precision::FP32;
};
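
If a Config is built by hand and passed to the explicit constructor rather than loaded from config.json, a sketch might look like the following; it is illustrative only, since these values normally come from the model folder:

#include "cactus/engine/engine.h"

using namespace cactus::engine;

// Sketch: override only the generation defaults and runtime settings,
// keeping the architectural fields at their declared defaults.
Config make_config() {
    Config cfg;
    cfg.default_temperature = 0.8f;
    cfg.default_top_p = 0.9f;
    cfg.default_top_k = 40;
    cfg.precision = Precision::FP16;
    cfg.default_backend = Backend::CPU;
    return cfg;
}

// Usage: Model model(make_config());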

Key Enums

enum class ModelType {
    QWEN = 0,
    GEMMA = 1,
    NOMIC = 3,
    LFM2 = 5,
    SIGLIP2 = 6,
    WHISPER = 7,
    MOONSHINE = 8,
    SILERO_VAD = 9,
    PARAKEET = 10,
    PARAKEET_TDT = 11
};

enum class Backend {
    CPU = 0,
    NPU = 1
};

enum class Precision {
    INT8 = 0,
    FP16 = 1,
    FP32 = 2
};

Tokenizer

encode

std::vector<uint32_t> encode(const std::string& text) const;

decode

std::string decode(const std::vector<uint32_t>& tokens) const;

apply_chat_template

std::vector<uint32_t> apply_chat_template(
    const std::vector<ChatMessage>& messages,
    bool add_generation_prompt = true
) const;
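
A sketch of preparing a conversation for generation. The ChatMessage fields (a role and a content string) are assumed here; check the engine headers for the exact layout:

#include <cstdint>
#include <vector>
#include "cactus/engine/engine.h"

using namespace cactus::engine;

// Sketch: turn a chat history into model-ready tokens.
std::vector<uint32_t> format_chat(Model& model) {
    auto tokenizer = model.get_tokenizer();
    // ChatMessage is assumed to aggregate {role, content}.
    std::vector<ChatMessage> messages = {
        {"system", "You are a helpful assistant."},
        {"user",   "What is the capital of France?"},
    };
    // add_generation_prompt=true appends the assistant turn marker so the
    // model continues as the assistant.
    return tokenizer->apply_chat_template(messages, /*add_generation_prompt=*/true);
}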

Example Usage

#include "cactus/engine/engine.h"

using namespace cactus::engine;

// Load model
auto model = create_model("/path/to/model");
model->init("/path/to/model", 2048, "You are a helpful assistant.");

// Encode input
auto tokenizer = model->get_tokenizer();
auto tokens = tokenizer->encode("Hello, world!");

// Generate up to 50 tokens, stopping at the end-of-sequence token
for (int i = 0; i < 50; i++) {
    uint32_t next_token = model->decode(tokens, 0.7f, 0.95f, 20);
    tokens.push_back(next_token);
    
    if (next_token == tokenizer->get_eos_token()) break;
}

// Decode the full sequence (prompt + generated tokens) back to text
std::string response = tokenizer->decode(tokens);

See Also

Completion API: high-level completion interface
Graph API: computation graph builder
C FFI: foreign function interface
Advanced Guide: model optimization techniques