Voice Activity Detection

Overview

Voice Activity Detection (VAD) identifies speech segments in audio, filtering out silence and noise. This is useful for:

Preprocessing audio before transcription
Real-time speech detection in voice assistants
Audio segmentation and chunking

Cactus uses the Silero VAD model for accurate, low-latency detection.

cactus_vad

Detect speech segments in audio.

/**
 * Detect speech segments in audio.
 *
 * Audio is supplied either as a WAV file path or as a raw PCM buffer;
 * pass NULL for whichever of the two is not used.
 *
 * @param model            Silero VAD model handle from cactus_init (required).
 * @param audio_file_path  Path to a WAV file, or NULL if using pcm_buffer.
 * @param response_buffer  Buffer that receives the JSON response (required).
 * @param buffer_size      Size of response_buffer (required).
 * @param options_json     Optional JSON object with VAD parameters.
 * @param pcm_buffer       Raw PCM audio (16-bit mono 16kHz), or NULL if using
 *                         audio_file_path.
 * @param pcm_buffer_size  Size of pcm_buffer in bytes.
 * @return Number of bytes written to response_buffer on success, -1 on error.
 */
int cactus_vad(
    cactus_model_t model,
    const char* audio_file_path,
    char* response_buffer,
    size_t buffer_size,
    const char* options_json,
    const uint8_t* pcm_buffer,
    size_t pcm_buffer_size
);

model

cactus_model_t

required

Silero VAD model handle from cactus_init

audio_file_path

string

Path to WAV file. NULL if using pcm_buffer

response_buffer

char*

required

Buffer to write JSON response

buffer_size

size_t

required

Size of response buffer

options_json

string

Optional JSON object with VAD parameters

pcm_buffer

uint8_t*

Raw PCM audio (16-bit mono 16kHz). NULL if using audio_file_path

pcm_buffer_size

size_t

Size of PCM buffer in bytes

return

int

Number of bytes written to response_buffer on success, -1 on error

Options JSON

{
  "threshold": 0.5,
  "neg_threshold": 0.35,
  "min_speech_duration_ms": 250,
  "max_speech_duration_s": 30.0,
  "min_silence_duration_ms": 100,
  "speech_pad_ms": 30,
  "window_size_samples": 512,
  "min_silence_at_max_speech": 100,
  "use_max_poss_sil_at_max_speech": false,
  "sampling_rate": 16000
}

threshold

float

default:"0.5"

Speech detection probability threshold (0.0-1.0)

neg_threshold

float

default:"0.35"

Non-speech probability threshold for ending segments

min_speech_duration_ms

int

default:"250"

Minimum duration of speech segment in milliseconds

max_speech_duration_s

float

default:"30.0"

Maximum duration of speech segment in seconds

min_silence_duration_ms

int

default:"100"

Minimum silence duration to split segments

speech_pad_ms

int

default:"30"

Padding to add at start/end of speech segments

window_size_samples

int

default:"512"

Analysis window size (must be 512 or 1024)

sampling_rate

int

default:"16000"

Audio sample rate in Hz

Response Format

{
  "success": true,
  "error": null,
  "segments": [
    {
      "start": 0,
      "end": 24000
    },
    {
      "start": 32000,
      "end": 56000
    }
  ],
  "total_time_ms": 12.5,
  "ram_usage_mb": 45.2
}

success

bool

Whether VAD processing succeeded

error

string | null

Error message if failed

segments

array

Array of speech segments with start/end sample indices

segments[].start

int

Start sample index (at 16kHz)

segments[].end

int

End sample index (at 16kHz)

total_time_ms

float

Processing time in milliseconds

ram_usage_mb

float

Memory usage in megabytes

Example: Detect Speech

#include "cactus_ffi.h"
#include <stdio.h>

int main() {
    // Load Silero VAD model
    cactus_model_t model = cactus_init("/path/to/silero-vad", NULL, false);
    if (!model) {
        fprintf(stderr, "Failed to load VAD model\n");
        return 1;
    }
    
    // Configure VAD
    const char* options = "{"
        "\"threshold\":0.5,"
        "\"min_speech_duration_ms\":250,"
        "\"min_silence_duration_ms\":100"
    "}";
    
    char response[4096];
    int result = cactus_vad(
        model,
        "/path/to/audio.wav",
        response,
        sizeof(response),
        options,
        NULL, 0
    );
    
    if (result > 0) {
        printf("VAD result:\n%s\n", response);
    } else {
        printf("Error: %s\n", cactus_get_last_error());
    }
    
    cactus_destroy(model);
    return 0;
}

Example: VAD + Transcription

#include "cactus_ffi.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Runs VAD on an audio file, then transcribes the same file with the ASR
 * model using the "use_vad" option and prints the transcript to stdout.
 *
 * audio_path  path to the audio file handed to both cactus_vad and
 *             cactus_transcribe
 *
 * Model-load, VAD and transcription failures are reported on stderr. Any
 * model that was successfully loaded is destroyed before returning.
 */
void transcribe_with_vad(const char* audio_path) {
    const char* prompt = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>";
    const char* options = "{\"use_vad\":true}";
    char vad_response[8192];
    char transcript[16384];

    // Load models
    cactus_model_t vad = cactus_init("/path/to/silero-vad", NULL, false);
    cactus_model_t asr = cactus_init("/path/to/whisper", NULL, false);
    if (!vad || !asr) {
        fprintf(stderr, "Failed to load VAD or ASR model\n");
        goto cleanup;
    }

    // Run VAD; without a valid response there are no segments to work with
    if (cactus_vad(vad, audio_path, vad_response, sizeof(vad_response), NULL, NULL, 0) <= 0) {
        fprintf(stderr, "VAD failed\n");
        goto cleanup;
    }

    // Parse segments (simplified - use JSON parser in production)
    // Extract start/end sample indices from vad_response

    // Transcribe with VAD-enabled option
    int result = cactus_transcribe(
        asr,
        audio_path,
        prompt,
        transcript,
        sizeof(transcript),
        options,
        NULL, NULL,
        NULL, 0
    );

    if (result > 0) {
        printf("Transcription:\n%s\n", transcript);
    } else {
        fprintf(stderr, "Transcription failed\n");
    }

cleanup:
    // Only release handles that were actually created
    if (vad) {
        cactus_destroy(vad);
    }
    if (asr) {
        cactus_destroy(asr);
    }
}

Example: Real-time VAD

#include "cactus_ffi.h"
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

// Streaming state carried across successive calls to process_audio_chunk.
typedef struct {
    // True while the stream is considered to be inside speech; meant to be
    // updated from the parsed VAD response.
    bool in_speech;
    // Presumably the sample index where the current speech run began
    // (never written in this example) - TODO confirm.
    size_t speech_start;
    // Running total of samples handed to process_audio_chunk so far.
    size_t current_sample;
} VADState;

/*
 * Runs VAD on one chunk of 16-bit PCM samples and advances the stream
 * position stored in `state`.
 *
 * vad          Silero VAD model handle
 * pcm_samples  chunk of 16-bit samples (cactus_vad expects 16kHz mono)
 * num_samples  number of samples in pcm_samples; an empty chunk is a no-op
 * state        streaming state; current_sample grows by num_samples
 *
 * A failed cactus_vad call is reported on stderr; the stream position is
 * still advanced so later chunks keep the correct offsets.
 */
void process_audio_chunk(
    cactus_model_t vad,
    const int16_t* pcm_samples,
    size_t num_samples,
    VADState* state
) {
    if (num_samples == 0) {
        return;  // nothing to analyze and nothing to advance
    }

    // cactus_vad takes the audio as bytes. The data is only read, so the
    // const qualifier is kept rather than cast away.
    const uint8_t* pcm_buffer = (const uint8_t*)pcm_samples;
    size_t buffer_size = num_samples * sizeof *pcm_samples;

    char response[4096];
    int result = cactus_vad(
        vad,
        NULL,
        response,
        sizeof(response),
        NULL,
        pcm_buffer,
        buffer_size
    );

    if (result > 0) {
        // Parse response to check if speech detected
        // Update state->in_speech, state->speech_start

        if (state->in_speech) {
            printf("Speech detected at sample %zu\n", state->current_sample);
        }
    } else {
        fprintf(stderr, "VAD failed for chunk at sample %zu\n", state->current_sample);
    }

    state->current_sample += num_samples;
}

/*
 * Streams audio through VAD in fixed-size chunks.
 *
 * has_audio_data() and read_audio_chunk() are not defined in this example;
 * presumably they are supplied by the application's audio source.
 *
 * Returns 1 if the VAD model cannot be loaded, 0 otherwise.
 */
int main(void) {
    enum { CHUNK_SAMPLES = 8000 };  // 500ms at 16kHz

    cactus_model_t vad = cactus_init("/path/to/silero-vad", NULL, false);
    if (!vad) {
        fprintf(stderr, "Failed to load VAD model\n");
        return 1;
    }

    VADState state = {.in_speech = false, .speech_start = 0, .current_sample = 0};

    // Process audio stream in chunks
    while (has_audio_data()) {
        int16_t chunk[CHUNK_SAMPLES];
        size_t chunk_size = read_audio_chunk(chunk, CHUNK_SAMPLES);
        process_audio_chunk(vad, chunk, chunk_size, &state);
    }

    cactus_destroy(vad);
    return 0;
}

Converting Samples to Time

At 16kHz sample rate:

// Convert a sample index to milliseconds (16 samples per millisecond at 16kHz)
float sample_to_ms(size_t sample_idx) {
    const float samples_per_ms = 16.0f;
    const float index = (float)sample_idx;

    return index / samples_per_ms;
}

// Convert a sample index to seconds (16000 samples per second at 16kHz)
float sample_to_sec(size_t sample_idx) {
    const float samples_per_sec = 16000.0f;
    const float index = (float)sample_idx;

    return index / samples_per_sec;
}

// Milliseconds to sample index (16 samples per millisecond at 16kHz).
// The fractional part is truncated. Negative or NaN input yields 0 and
// values too large for size_t saturate at the largest index, because
// converting an out-of-range float to an unsigned integer is undefined
// behavior in C.
size_t ms_to_sample(float ms) {
    const size_t max_index = (size_t)-1;
    const float scaled = ms * 16.0f;

    if (!(scaled > 0.0f)) {  // written this way so NaN is caught too
        return 0;
    }
    if (scaled >= (float)max_index) {
        return max_index;
    }
    return (size_t)scaled;
}

Audio Format Requirements

VAD requires:

Sample rate: 16 kHz
Channels: Mono (1 channel)
Format: 16-bit signed PCM

WAV files are automatically resampled. Raw PCM buffers must already be 16kHz mono.

Tuning Parameters

High Precision (fewer false positives)

{
  "threshold": 0.7,
  "neg_threshold": 0.5,
  "min_speech_duration_ms": 500
}

High Recall (fewer missed segments)

{
  "threshold": 0.3,
  "neg_threshold": 0.2,
  "min_speech_duration_ms": 100
}

Noisy Environment

{
  "threshold": 0.6,
  "speech_pad_ms": 50,
  "min_silence_duration_ms": 200
}

Performance

Latency: ~2ms per 512-sample window (32ms audio)
Throughput: ~500x real-time on CPU
Memory: ~45 MB

See Also

Transcription API

Speech-to-text with VAD

Python SDK

Python VAD API

Transcription Guide

Integrate VAD with ASR

Overview

cactus_vad

Options JSON

Response Format

Example: Detect Speech

Example: VAD + Transcription

Example: Real-time VAD

Converting Samples to Time

Audio Format Requirements

Tuning Parameters

High Precision (fewer false positives)

High Recall (fewer missed segments)

Noisy Environment

Performance

See Also

Transcription API

Python SDK

Transcription Guide

Overview

cactus_vad

Options JSON

Response Format

Example: Detect Speech

Example: VAD + Transcription

Example: Real-time VAD

Converting Samples to Time

Audio Format Requirements

Tuning Parameters

High Precision (fewer false positives)

High Recall (fewer missed segments)

Noisy Environment

Performance

See Also