Overview

The Model class is the central abstraction for inference. All model types (LLM, VLM, ASR, VAD) inherit from the base Model class.

Model Class

Constructor

Model();
explicit Model(const Config& config);
Create a new model instance with optional configuration.

init

Load model weights and configuration from a folder on disk and prepare the model for inference.
bool init(
    const std::string& model_folder,
    size_t context_size,
    const std::string& system_prompt = "",
    bool do_warmup = true
);
model_folder (string, required): Path to the directory containing model weights and config
context_size (size_t, required): Maximum context window in tokens (e.g., 2048)
system_prompt (string, default ""): System message to prepend to conversations
do_warmup (bool, default true): Run a warmup inference to initialize caches
Returns bool: true on success, false on error
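
A minimal loading sketch, assuming the create_model factory shown in the Example Usage section below; the model path is a placeholder:

#include <cstdio>
#include <string>
#include "cactus/engine/engine.h"

using namespace cactus::engine;

int main() {
    // Placeholder path; point this at a folder containing weights and config.json.
    const std::string model_dir = "/models/qwen3-0.6b";

    auto model = create_model(model_dir);
    if (!model->init(model_dir, /*context_size=*/4096,
                     "You are a helpful assistant.",
                     /*do_warmup=*/true)) {
        // init returns false on error (missing files, bad config, ...)
        std::fprintf(stderr, "failed to initialize model from %s\n", model_dir.c_str());
        return 1;
    }
    return 0;
}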

decode

Generate the next token from an input token sequence.
uint32_t decode(
    const std::vector<uint32_t>& tokens,
    float temperature = -1.0f,
    float top_p = -1.0f,
    size_t top_k = 0,
    const std::string& profile_file = "",
    float* out_entropy = nullptr
);
tokens (vector<uint32_t>, required): Input token sequence
temperature (float, default -1.0): Sampling temperature; -1 uses the model default
top_p (float, default -1.0): Nucleus sampling threshold; -1 uses the model default
top_k (size_t, default 0): Top-k sampling limit; 0 uses the model default
profile_file (string, default ""): Path to save performance profiling data
out_entropy (float*, default nullptr): Output parameter for token entropy (confidence)
Returns uint32_t: Next token ID
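
A hedged sketch of a single decode step that overrides the sampling defaults and reads back the entropy; the model and tokens are assumed to come from a prior init and encode:

#include <cstdint>
#include <vector>
#include "cactus/engine/engine.h"

// Sketch: one decode step with explicit sampling settings and an entropy readout.
uint32_t sample_next(cactus::engine::Model& model,
                     const std::vector<uint32_t>& tokens) {
    float entropy = 0.0f;
    uint32_t next = model.decode(tokens,
                                 /*temperature=*/0.7f,
                                 /*top_p=*/0.95f,
                                 /*top_k=*/20,
                                 /*profile_file=*/"",
                                 &entropy);
    // Lower entropy means a more peaked distribution, i.e. the model was
    // more confident in the sampled token.
    return next;
}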

prefill

Process prompt tokens into the KV cache without generating output.
void prefill(
    const std::vector<uint32_t>& tokens,
    size_t chunk_size = 256,
    const std::string& profile_file = ""
);
tokens (vector<uint32_t>, required): Tokens to process into the KV cache
chunk_size (size_t, default 256): Batch size for chunked prefill
profile_file (string, default ""): Path to save performance profiling data
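
A rough sketch of prefilling a long prompt before generation; the model is assumed to be initialized and the prompt already encoded:

#include <cstdint>
#include <vector>
#include "cactus/engine/engine.h"

// Sketch: push a long prompt into the KV cache in batches before decoding.
void warm_cache(cactus::engine::Model& model,
                const std::vector<uint32_t>& prompt_tokens) {
    // 256 is the default chunk size; larger chunks mean fewer forward passes
    // at the cost of higher peak memory per pass.
    model.prefill(prompt_tokens, /*chunk_size=*/256);
}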

get_embeddings

Extract text embeddings for a token sequence.
std::vector<float> get_embeddings(
    const std::vector<uint32_t>& tokens,
    bool pooled = true,
    bool normalize = false,
    const std::string& profile_file = ""
);
tokens (vector<uint32_t>, required): Input token sequence
pooled (bool, default true): Return a mean-pooled embedding (a single vector)
normalize (bool, default false): L2-normalize the output vectors
profile_file (string, default ""): Path to save performance profiling data
Returns vector<float>: Embedding vector(s)
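
A sketch of using pooled, normalized embeddings for similarity; with pooled and normalize both true, each call returns a single unit-length vector, so a dot product equals cosine similarity. An embedding-capable model (e.g., the NOMIC type listed under Key Enums below) is assumed:

#include <numeric>
#include <string>
#include <vector>
#include "cactus/engine/engine.h"

using namespace cactus::engine;

// Sketch: embed two texts and compare them with a dot product.
float similarity(Model& model, const std::string& a, const std::string& b) {
    auto tokenizer = model.get_tokenizer();
    auto ea = model.get_embeddings(tokenizer->encode(a), /*pooled=*/true, /*normalize=*/true);
    auto eb = model.get_embeddings(tokenizer->encode(b), /*pooled=*/true, /*normalize=*/true);
    // Both vectors are unit length, so this dot product is the cosine similarity.
    return std::inner_product(ea.begin(), ea.end(), eb.begin(), 0.0f);
}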

reset_cache

Clear the KV cache to start a new conversation.
void reset_cache();

set_cache_window

Configure sliding-window attention for the KV cache.
void set_cache_window(size_t window_size, size_t sink_size = 4);
window_size (size_t, required): Maximum number of tokens to keep in the cache
sink_size (size_t, default 4): Number of initial tokens to always preserve
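
A sketch of bounding cache growth for a long-running session, together with reset_cache for starting over; the window and sink sizes here are illustrative:

#include "cactus/engine/engine.h"

// Sketch: keep at most 1024 recent tokens plus the first 4 "sink" tokens
// (typically the start of the system prompt), evicting older entries.
void configure_long_session(cactus::engine::Model& model) {
    model.set_cache_window(/*window_size=*/1024, /*sink_size=*/4);
}

// Sketch: drop all cached state before starting a new conversation.
void new_conversation(cactus::engine::Model& model) {
    model.reset_cache();
}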

Config Struct

Model configuration loaded from config.json:
struct Config {
    uint32_t vocab_size = 151936;
    uint32_t num_layers = 28;
    uint32_t hidden_dim = 1024;
    uint32_t attention_heads = 16;
    uint32_t attention_kv_heads = 8;
    float layer_norm_eps = 1e-6f;
    float rope_theta = 1000000.0f;
    
    // Defaults for generation
    float default_temperature = 0.6f;
    float default_top_p = 0.95f;
    size_t default_top_k = 20;
    
    ModelType model_type = ModelType::QWEN;
    Backend default_backend = Backend::CPU;
    Precision precision = Precision::FP32;
};
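
If a Config is built by hand and passed to the explicit constructor rather than loaded from config.json, a sketch might look like the following; it is illustrative only, since these values normally come from the model folder:

#include "cactus/engine/engine.h"

using namespace cactus::engine;

// Sketch: override only the generation defaults and runtime settings,
// keeping the architectural fields at their declared defaults.
Config make_config() {
    Config cfg;
    cfg.default_temperature = 0.8f;
    cfg.default_top_p = 0.9f;
    cfg.default_top_k = 40;
    cfg.precision = Precision::FP16;
    cfg.default_backend = Backend::CPU;
    return cfg;
}

// Usage: Model model(make_config());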

Key Enums

enum class ModelType {
    QWEN = 0,
    GEMMA = 1,
    NOMIC = 3,
    LFM2 = 5,
    SIGLIP2 = 6,
    WHISPER = 7,
    MOONSHINE = 8,
    SILERO_VAD = 9,
    PARAKEET = 10,
    PARAKEET_TDT = 11
};

enum class Backend {
    CPU = 0,
    NPU = 1
};

enum class Precision {
    INT8 = 0,
    FP16 = 1,
    FP32 = 2
};

Tokenizer

encode

std::vector<uint32_t> encode(const std::string& text) const;

decode

std::string decode(const std::vector<uint32_t>& tokens) const;

apply_chat_template

std::vector<uint32_t> apply_chat_template(
    const std::vector<ChatMessage>& messages,
    bool add_generation_prompt = true
) const;
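
A sketch of preparing a conversation for generation. The ChatMessage fields (a role and a content string) are assumed here; check the engine headers for the exact layout:

#include <cstdint>
#include <vector>
#include "cactus/engine/engine.h"

using namespace cactus::engine;

// Sketch: turn a chat history into model-ready tokens.
std::vector<uint32_t> format_chat(Model& model) {
    auto tokenizer = model.get_tokenizer();
    // ChatMessage is assumed to aggregate {role, content}.
    std::vector<ChatMessage> messages = {
        {"system", "You are a helpful assistant."},
        {"user",   "What is the capital of France?"},
    };
    // add_generation_prompt=true appends the assistant turn marker so the
    // model continues as the assistant.
    return tokenizer->apply_chat_template(messages, /*add_generation_prompt=*/true);
}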

Example Usage

#include "cactus/engine/engine.h"

using namespace cactus::engine;

// Load model
auto model = create_model("/path/to/model");
model->init("/path/to/model", 2048, "You are a helpful assistant.");

// Encode input
auto tokenizer = model->get_tokenizer();
auto tokens = tokenizer->encode("Hello, world!");

// Generate up to 50 tokens, stopping at the end-of-sequence token
for (int i = 0; i < 50; i++) {
    uint32_t next_token = model->decode(tokens, 0.7f, 0.95f, 20);
    tokens.push_back(next_token);
    
    if (next_token == tokenizer->get_eos_token()) break;
}

// Decode the full sequence (prompt + generated tokens) back to text
std::string response = tokenizer->decode(tokens);

See Also

Completion API: high-level completion interface
Graph API: computation graph builder
C FFI: foreign function interface
Advanced Guide: model optimization techniques