Audio API Reference

Complete reference for audio and voice conversation APIs.

VADModeConfig

Configuration for VAD (Voice Activity Detection) mode.

// VADModeConfig is the configuration for VAD (Voice Activity Detection) mode.
// The defaults noted on each field are the documented defaults.
type VADModeConfig struct {
    // Silence required to detect turn end (default 800ms).
    SilenceDuration   time.Duration
    // Minimum speech before a turn can complete (default 200ms).
    MinSpeechDuration time.Duration
    // Maximum turn length before forcing completion (default 30s).
    MaxTurnDuration   time.Duration
    // Audio sample rate in Hz (default 16000).
    SampleRate        int
    // Language hint for STT, e.g. "en", "es", "fr" (default "en").
    Language          string
    // TTS voice ID (default "alloy").
    Voice             string
    // TTS speech rate, 0.5-2.0 (default 1.0).
    Speed             float64
}

Fields

| Field | Type | Default | Description |
|-------|------|---------|-------------|
| SilenceDuration | time.Duration | 800ms | Silence required to detect turn end |
| MinSpeechDuration | time.Duration | 200ms | Minimum speech before turn can complete |
| MaxTurnDuration | time.Duration | 30s | Maximum turn length before forcing completion |
| SampleRate | int | 16000 | Audio sample rate in Hz |
| Language | string | "en" | Language hint for STT (e.g., "en", "es", "fr") |
| Voice | string | "alloy" | TTS voice ID |
| Speed | float64 | 1.0 | TTS speech rate (0.5-2.0) |

Constructor

func DefaultVADModeConfig() *VADModeConfig

Returns a VADModeConfig with sensible defaults.

SDK Options

WithVADMode

func WithVADMode(sttService stt.Service, ttsService tts.Service, cfg *VADModeConfig) Option

Configures VAD mode for voice conversations with standard text-based LLMs.

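Parameters:

- sttService (stt.Service): the speech-to-text service
- ttsService (tts.Service): the text-to-speech service
- cfg (*VADModeConfig): VAD mode settings (see VADModeConfig); the example below passes nil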

Example:

// Both services are constructed from the OPENAI_API_KEY environment variable.
sttService := stt.NewOpenAI(os.Getenv("OPENAI_API_KEY"))
ttsService := tts.NewOpenAI(os.Getenv("OPENAI_API_KEY"))

// cfg is passed as nil here. The second return value of OpenDuplex is
// discarded for brevity; presumably it is an error - check it in real code.
conv, _ := sdk.OpenDuplex("./pack.json", "voice",
    sdk.WithVADMode(sttService, ttsService, nil),
)

WithStreamingConfig

func WithStreamingConfig(config *providers.StreamingInputConfig) Option

Configures ASM (Audio Streaming Model) mode for native multimodal LLMs.

Parameters:

- config (*providers.StreamingInputConfig): streaming input settings (see StreamingInputConfig)

Example:

// Opens a duplex conversation configured for audio input at 16000 Hz, one channel (mono).
// The second return value of OpenDuplex is discarded for brevity;
// presumably it is an error - check it in real code.
conv, _ := sdk.OpenDuplex("./pack.json", "voice",
    sdk.WithStreamingConfig(&providers.StreamingInputConfig{
        Type:       types.ContentTypeAudio,
        SampleRate: 16000,
        Channels:   1,
    }),
)

WithTurnDetector

func WithTurnDetector(detector audio.TurnDetector) Option

Configures a custom turn detector for audio sessions.

Example:

// Build a silence-based detector with a 500ms silence threshold and a
// 100ms minimum speech duration.
detector := audio.NewSilenceDetector(audio.SilenceConfig{
    SilenceThreshold:  500 * time.Millisecond,
    MinSpeechDuration: 100 * time.Millisecond,
})

// sttService and ttsService are not defined in this snippet; create them as
// shown in the WithVADMode example (stt.NewOpenAI / tts.NewOpenAI).
// The second return value of OpenDuplex is discarded for brevity.
conv, _ := sdk.OpenDuplex("./pack.json", "voice",
    sdk.WithTurnDetector(detector),
    sdk.WithVADMode(sttService, ttsService, nil),
)

TurnDetector Interface

// TurnDetector is the interface accepted by WithTurnDetector for custom
// turn detection in audio sessions.
type TurnDetector interface {
    // ProcessAudio processes an audio chunk and returns turn state
    ProcessAudio(samples []byte, sampleRate int) TurnState

    // Reset resets the detector state
    Reset()
}

// TurnState is the turn state reported by TurnDetector.ProcessAudio.
type TurnState int

// TurnState values, numbered from 0 via iota.
// NOTE(review): the meanings below are inferred from the names - confirm.
const (
    // TurnStateListening is the zero value (0); presumably no speech is in progress.
    TurnStateListening TurnState = iota
    // TurnStateSpeaking is 1; presumably speech is in progress.
    TurnStateSpeaking
    // TurnStateComplete is 2; presumably the turn has ended.
    TurnStateComplete
)

StreamingInputConfig

Configuration for ASM mode streaming.

// StreamingInputConfig is the configuration for ASM mode streaming.
// It is the argument type of WithStreamingConfig.
type StreamingInputConfig struct {
    Type       types.ContentType // Audio, video, or text
    SampleRate int               // Audio sample rate (e.g., 16000)
    Channels   int               // Number of audio channels (1=mono, 2=stereo)
}

Audio Pipeline Stages

AudioTurnStage

VAD-based turn detection and audio accumulation.

// handler is not defined in this snippet; presumably it is the value
// returned by audio.NewInterruptionHandler() (see InterruptionHandler) - confirm.
config := stage.AudioTurnConfig{
    SilenceDuration:      800 * time.Millisecond,
    MinSpeechDuration:    200 * time.Millisecond,
    MaxTurnDuration:      30 * time.Second,
    SampleRate:           16000,
    InterruptionHandler:  handler,
}

// The second return value is discarded for brevity; presumably it is an
// error - check it in real code.
turnStage, _ := stage.NewAudioTurnStage(config)

STTStage

Speech-to-text transcription stage.

// sttService is not defined in this snippet; the WithVADMode example creates
// one with stt.NewOpenAI.
config := stage.STTStageConfig{
    Language:      "en",
    SkipEmpty:     true,
    MinAudioBytes: 1600, // 50ms at 16kHz, assuming 16-bit mono samples (32000 bytes/s)
}

sttStage := stage.NewSTTStage(sttService, config)

TTSStageWithInterruption

Text-to-speech with barge-in support.

// ttsService and handler are not defined in this snippet. The WithVADMode
// example creates a TTS service with tts.NewOpenAI; handler is presumably the
// value returned by audio.NewInterruptionHandler() - confirm.
config := stage.TTSConfig{
    Voice: "alloy",
    Speed: 1.0,
}

ttsStage := stage.NewTTSStageWithInterruption(ttsService, handler, config)

DuplexProviderStage

Bidirectional streaming for native audio LLMs.

duplexStage := stage.NewDuplexProviderStage(session)

AudioData Type

// AudioData holds raw audio bytes together with the sample rate, channel
// count, and format that describe them.
type AudioData struct {
    Samples    []byte      // Raw audio bytes
    SampleRate int         // Sample rate in Hz
    Channels   int         // Number of channels
    Format     AudioFormat // Audio format (PCM16, etc.)
}

Audio Formats

| Constant | Description |
|----------|-------------|
| AudioFormatPCM16 | 16-bit signed PCM |
| AudioFormatPCM32 | 32-bit signed PCM |
| AudioFormatFloat32 | 32-bit float PCM |

InterruptionHandler

Coordinates interruption detection between AudioTurnStage and TTSStage.

handler := audio.NewInterruptionHandler()

// Used internally by VAD pipeline
// Automatically handles barge-in detection

Error Types

// Sentinel errors for the audio APIs. Each is a distinct value created with
// errors.New, so callers can match them with errors.Is.
// NOTE(review): which calls return each error is not shown here - confirm.
var (
    ErrAudioSessionClosed = errors.New("audio session closed")
    ErrInvalidSampleRate  = errors.New("invalid sample rate")
    ErrInvalidChannels    = errors.New("invalid channel count")
)

See Also