Overview
Voice Activity Detection (VAD) identifies speech segments in audio, filtering out silence and noise. This is useful for:
- Preprocessing audio before transcription
- Real-time speech detection in voice assistants
- Audio segmentation and chunking
Cactus uses the Silero VAD model for accurate, low-latency detection.
cactus_vad
Detect speech segments in audio.
/*
 * Detect speech segments in audio.
 *
 * model            Silero VAD model handle from cactus_init
 * audio_file_path  Path to WAV file. NULL if using pcm_buffer
 * response_buffer  Buffer to write JSON response
 * buffer_size      presumably the capacity of response_buffer in bytes
 *                  (the examples pass sizeof(response)) - confirm
 * options_json     Optional JSON object with VAD parameters
 * pcm_buffer       Raw PCM audio (16-bit mono 16kHz). NULL if using
 *                  audio_file_path
 * pcm_buffer_size  Size of PCM buffer in bytes
 *
 * Returns the number of bytes written to response_buffer on success,
 * -1 on error.
 */
int cactus_vad(
cactus_model_t model,
const char* audio_file_path,
char* response_buffer,
size_t buffer_size,
const char* options_json,
const uint8_t* pcm_buffer,
size_t pcm_buffer_size
);
model: Silero VAD model handle from cactus_init
audio_file_path: Path to WAV file. NULL if using pcm_buffer
response_buffer: Buffer to write JSON response
buffer_size: Size of response_buffer in bytes
options_json: Optional JSON object with VAD parameters
pcm_buffer: Raw PCM audio (16-bit mono 16kHz). NULL if using audio_file_path
pcm_buffer_size: Size of PCM buffer in bytes
Returns: Number of bytes written to response_buffer on success, -1 on error
Options JSON
{
"threshold": 0.5,
"neg_threshold": 0.35,
"min_speech_duration_ms": 250,
"max_speech_duration_s": 30.0,
"min_silence_duration_ms": 100,
"speech_pad_ms": 30,
"window_size_samples": 512,
"min_silence_at_max_speech": 100,
"use_max_poss_sil_at_max_speech": false,
"sampling_rate": 16000
}
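threshold: Speech detection probability threshold (0.0-1.0)
neg_threshold: Non-speech probability threshold for ending segments
min_speech_duration_ms: Minimum duration of speech segment in milliseconds
max_speech_duration_s: Maximum duration of speech segment in seconds
min_silence_duration_ms: Minimum silence duration, in milliseconds, required to split segments
speech_pad_ms: Padding, in milliseconds, to add at start/end of speech segments
window_size_samples: Analysis window size in samples (must be 512 or 1024)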
{
"success": true,
"error": null,
"segments": [
{
"start": 0,
"end": 24000
},
{
"start": 32000,
"end": 56000
}
],
"total_time_ms": 12.5,
"ram_usage_mb": 45.2
}
success: Whether VAD processing succeeded
segments: Array of speech segments with start/end sample indices
start: Start sample index (at 16kHz)
end: End sample index (at 16kHz)
total_time_ms: Processing time in milliseconds
ram_usage_mb: Memory usage in megabytes
Example: Detect Speech
#include "cactus_ffi.h"
#include <stdio.h>
int main() {
// Load Silero VAD model
cactus_model_t model = cactus_init("/path/to/silero-vad", NULL, false);
if (!model) {
fprintf(stderr, "Failed to load VAD model\n");
return 1;
}
// Configure VAD
const char* options = "{"
"\"threshold\":0.5,"
"\"min_speech_duration_ms\":250,"
"\"min_silence_duration_ms\":100"
"}";
char response[4096];
int result = cactus_vad(
model,
"/path/to/audio.wav",
response,
sizeof(response),
options,
NULL, 0
);
if (result > 0) {
printf("VAD result:\n%s\n", response);
} else {
printf("Error: %s\n", cactus_get_last_error());
}
cactus_destroy(model);
return 0;
}
Example: VAD + Transcription
#include "cactus_ffi.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Run VAD and then VAD-enabled transcription on audio_path using models that
 * are already loaded. Prints the transcript on success; on failure prints the
 * library's last error to stderr and returns without transcribing further.
 */
static void vad_then_transcribe(
    cactus_model_t vad,
    cactus_model_t asr,
    const char *audio_path
) {
    // Run VAD
    char vad_response[8192];
    int vad_result = cactus_vad(
        vad, audio_path, vad_response, sizeof(vad_response), NULL, NULL, 0);
    if (vad_result <= 0) {
        fprintf(stderr, "VAD failed: %s\n", cactus_get_last_error());
        return;
    }

    // Parse segments (simplified - use JSON parser in production)
    // Extract start/end sample indices from vad_response

    // Transcribe with VAD-enabled option
    const char *prompt = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>";
    const char *options = "{\"use_vad\":true}";
    char transcript[16384];
    int result = cactus_transcribe(
        asr,
        audio_path,
        prompt,
        transcript,
        sizeof(transcript),
        options,
        NULL, NULL,
        NULL, 0
    );
    if (result > 0) {
        printf("Transcription:\n%s\n", transcript);
    } else {
        fprintf(stderr, "Transcription failed: %s\n", cactus_get_last_error());
    }
}

/*
 * Load the VAD and ASR models, run VAD followed by transcription on
 * audio_path, and release both models.
 *
 * Nothing is processed unless both models load; whichever handles were
 * obtained are destroyed on every path.
 */
void transcribe_with_vad(const char *audio_path) {
    // Load models
    cactus_model_t vad = cactus_init("/path/to/silero-vad", NULL, false);
    cactus_model_t asr = cactus_init("/path/to/whisper", NULL, false);

    if (vad && asr) {
        vad_then_transcribe(vad, asr, audio_path);
    } else {
        fprintf(stderr, "Failed to load models\n");
    }

    // Only destroy handles that were actually created.
    if (vad) {
        cactus_destroy(vad);
    }
    if (asr) {
        cactus_destroy(asr);
    }
}
Example: Real-time VAD
#include "cactus_ffi.h"
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
/* Bookkeeping carried from one audio chunk to the next while streaming. */
typedef struct VADState {
    bool in_speech;         /* read after each VAD call; meant to be updated from the parsed response */
    size_t speech_start;    /* presumably the sample index where the current speech run began - confirm */
    size_t current_sample;  /* running total of samples handed to the VAD so far */
} VADState;
/*
 * Run VAD on one chunk of 16-bit PCM samples and advance the stream position.
 *
 * vad          VAD model handle
 * pcm_samples  samples for this chunk (raw PCM must already be 16kHz mono)
 * num_samples  number of samples in pcm_samples
 * state        streaming state; current_sample is advanced by num_samples
 *
 * Does nothing when state or pcm_samples is NULL or the chunk is empty.
 * A failed VAD call is reported on stderr; the stream position still
 * advances so later chunks keep their correct sample offsets.
 */
void process_audio_chunk(
    cactus_model_t vad,
    const int16_t *pcm_samples,
    size_t num_samples,
    VADState *state
) {
    if (!state || !pcm_samples || num_samples == 0) {
        return;
    }

    // View the samples as bytes. The const qualifier is kept because
    // cactus_vad takes a const uint8_t* and never needs write access.
    const uint8_t *pcm_buffer = (const uint8_t *)pcm_samples;
    size_t buffer_size = num_samples * sizeof *pcm_samples;

    char response[4096];
    int result = cactus_vad(
        vad,
        NULL,               // no file path: audio comes from pcm_buffer
        response,
        sizeof(response),
        NULL,
        pcm_buffer,
        buffer_size
    );

    if (result > 0) {
        // Parse response to check if speech detected
        // Update state->in_speech, state->speech_start
        if (state->in_speech) {
            printf("Speech detected at sample %zu\n", state->current_sample);
        }
    } else {
        fprintf(stderr, "VAD failed: %s\n", cactus_get_last_error());
    }

    state->current_sample += num_samples;
}
/*
 * Example entry point for streaming VAD: load the model, feed audio to
 * process_audio_chunk in fixed-size chunks, then release the model.
 *
 * Returns 0 on normal completion, 1 when the model cannot be loaded.
 */
int main(void) {
    enum { CHUNK_SAMPLES = 8000 };  // 500ms at 16kHz

    cactus_model_t vad = cactus_init("/path/to/silero-vad", NULL, false);
    if (!vad) {
        fprintf(stderr, "Failed to load VAD model\n");
        return 1;
    }

    VADState state = {.in_speech = false, .speech_start = 0, .current_sample = 0};

    // Process audio stream in chunks.
    // NOTE: has_audio_data() and read_audio_chunk() are not defined in this
    // example; presumably the application's audio source provides them.
    while (has_audio_data()) {
        int16_t chunk[CHUNK_SAMPLES];
        size_t chunk_size = read_audio_chunk(chunk, CHUNK_SAMPLES);
        process_audio_chunk(vad, chunk, chunk_size, &state);
    }

    cactus_destroy(vad);
    return 0;
}
Converting Samples to Time
At 16kHz sample rate:
// Convert a sample index to milliseconds, assuming 16 samples per
// millisecond (16kHz audio).
float sample_to_ms(size_t sample_idx) {
    const float samples_per_ms = 16.0f;
    const float index = (float)sample_idx;
    return index / samples_per_ms;
}
// Convert a sample index to seconds, assuming 16000 samples per second
// (16kHz audio).
float sample_to_sec(size_t sample_idx) {
    const float samples_per_sec = 16000.0f;
    const float index = (float)sample_idx;
    return index / samples_per_sec;
}
// Convert milliseconds to a sample index, assuming 16 samples per
// millisecond (16kHz audio). Fractional samples are truncated.
//
// Negative or NaN input yields 0 and values too large for size_t saturate
// at the largest size_t value; converting such floats directly to an
// unsigned integer type would be undefined behaviour.
size_t ms_to_sample(float ms) {
    const size_t max_index = (size_t)-1;
    const float scaled = ms * 16.0f;

    // Written as a negated comparison so that NaN also takes this branch.
    if (!(scaled > 0.0f)) {
        return 0;
    }
    // (float)max_index rounds up to a power of two, so every float below it
    // is safely representable as size_t.
    if (scaled >= (float)max_index) {
        return max_index;
    }
    return (size_t)scaled;
}
VAD requires:
- Sample rate: 16 kHz
- Channels: Mono (1 channel)
- Format: 16-bit signed PCM
WAV files are automatically resampled. Raw PCM buffers must already be 16kHz mono.
Tuning Parameters
High Precision (fewer false positives)
{
"threshold": 0.7,
"neg_threshold": 0.5,
"min_speech_duration_ms": 500
}
High Recall (fewer missed segments)
{
"threshold": 0.3,
"neg_threshold": 0.2,
"min_speech_duration_ms": 100
}
Noisy Environment
{
"threshold": 0.6,
"speech_pad_ms": 50,
"min_silence_duration_ms": 200
}
- Latency: ~2ms per 512-sample window (32ms audio)
- Throughput: ~500x real-time on CPU
- Memory: ~45 MB
See Also
Transcription API
Speech-to-text with VAD
Transcription Guide
Integrate VAD with ASR