Skip to content

Audio API Reference

Complete reference for audio and voice conversation APIs.

Configuration for VAD (Voice Activity Detection) mode.

// VADModeConfig is the configuration for VAD (Voice Activity Detection) mode.
type VADModeConfig struct {
	// SilenceDuration is the silence required to detect turn end.
	// Documented default: 800ms.
	SilenceDuration time.Duration

	// MinSpeechDuration is the minimum speech before a turn can complete.
	// Documented default: 200ms.
	MinSpeechDuration time.Duration

	// MaxTurnDuration is the maximum turn length before forcing completion.
	// Documented default: 30s.
	MaxTurnDuration time.Duration

	// SampleRate is the audio sample rate in Hz. Documented default: 16000.
	SampleRate int

	// Language is the language hint for STT (e.g., "en", "es", "fr").
	// Documented default: "en".
	Language string

	// Voice is the TTS voice ID. Documented default: "alloy".
	Voice string

	// Speed is the TTS speech rate (0.5-2.0). Documented default: 1.0.
	Speed float64
}
| Field | Type | Default | Description |
| --- | --- | --- | --- |
| SilenceDuration | time.Duration | 800ms | Silence required to detect turn end |
| MinSpeechDuration | time.Duration | 200ms | Minimum speech before turn can complete |
| MaxTurnDuration | time.Duration | 30s | Maximum turn length before forcing completion |
| SampleRate | int | 16000 | Audio sample rate in Hz |
| Language | string | "en" | Language hint for STT (e.g., "en", "es", "fr") |
| Voice | string | "alloy" | TTS voice ID |
| Speed | float64 | 1.0 | TTS speech rate (0.5-2.0) |
// DefaultVADModeConfig returns a VADModeConfig with sensible defaults.
func DefaultVADModeConfig() *VADModeConfig

Returns a VADModeConfig with sensible defaults.

// WithVADMode configures VAD mode for voice conversations with standard
// text-based LLMs.
//
// sttService is the speech-to-text service, ttsService is the text-to-speech
// service, and cfg is the VAD configuration; a nil cfg uses the defaults.
func WithVADMode(sttService stt.Service, ttsService tts.Service, cfg *VADModeConfig) Option

Configures VAD mode for voice conversations with standard text-based LLMs.

Parameters:

  • sttService: Speech-to-text service
  • ttsService: Text-to-speech service
  • cfg: VAD configuration (nil uses defaults)

Example:

sttService := stt.NewOpenAI(os.Getenv("OPENAI_API_KEY"))
ttsService := tts.NewOpenAI(os.Getenv("OPENAI_API_KEY"))
conv, _ := sdk.OpenDuplex("./pack.json", "voice",
sdk.WithVADMode(sttService, ttsService, nil),
)
// WithStreamingConfig configures ASM (Audio Streaming Model) mode for native
// multimodal LLMs.
//
// config is the streaming configuration with audio type, sample rate, and
// channels.
func WithStreamingConfig(config *providers.StreamingInputConfig) Option

Configures ASM (Audio Streaming Model) mode for native multimodal LLMs.

Parameters:

  • config: Streaming configuration with audio type, sample rate, channels

Example:

conv, _ := sdk.OpenDuplex("./pack.json", "voice",
sdk.WithStreamingConfig(&providers.StreamingInputConfig{
Type: types.ContentTypeAudio,
SampleRate: 16000,
Channels: 1,
}),
)
// WithTurnDetector configures a custom turn detector for audio sessions.
func WithTurnDetector(detector audio.TurnDetector) Option

Configures a custom turn detector for audio sessions.

Example:

detector := audio.NewSilenceDetector(audio.SilenceConfig{
SilenceThreshold: 500 * time.Millisecond,
MinSpeechDuration: 100 * time.Millisecond,
})
conv, _ := sdk.OpenDuplex("./pack.json", "voice",
sdk.WithTurnDetector(detector),
sdk.WithVADMode(sttService, ttsService, nil),
)
// TurnDetector is the interface a custom turn detector implements so it can
// be passed to WithTurnDetector.
type TurnDetector interface {
	// ProcessAudio processes an audio chunk and returns the turn state.
	ProcessAudio(samples []byte, sampleRate int) TurnState

	// Reset resets the detector state.
	Reset()
}
// TurnState is the turn state returned by TurnDetector.ProcessAudio.
type TurnState int

// Turn states, numbered from zero in declaration order.
const (
	TurnStateListening TurnState = iota // 0; also the zero value of TurnState
	TurnStateSpeaking                   // 1
	TurnStateComplete                   // 2
)

Configuration for ASM mode streaming.

// StreamingInputConfig is the configuration for ASM mode streaming.
type StreamingInputConfig struct {
	// Type is the content type: audio, video, or text.
	Type types.ContentType

	// SampleRate is the audio sample rate (e.g., 16000).
	SampleRate int

	// Channels is the number of audio channels (1=mono, 2=stereo).
	Channels int
}

AudioTurnStage: VAD-based turn detection and audio accumulation.

// handler must already be in scope; this snippet does not define it.
config := stage.AudioTurnConfig{
	SilenceDuration:     800 * time.Millisecond,
	MinSpeechDuration:   200 * time.Millisecond,
	MaxTurnDuration:     30 * time.Second,
	SampleRate:          16000,
	InterruptionHandler: handler,
}

// The second return value (presumably an error — confirm) is discarded here
// for brevity; check it in real code.
turnStage, _ := stage.NewAudioTurnStage(config)

STTStage: speech-to-text transcription stage.

// sttService must already be in scope; this snippet does not define it.
config := stage.STTStageConfig{
	Language:  "en",
	SkipEmpty: true,
	// 1600 bytes is 50ms at 16kHz (assuming 16-bit mono samples — confirm).
	MinAudioBytes: 1600,
}
sttStage := stage.NewSTTStage(sttService, config)

TTSStage: text-to-speech stage with barge-in support.

// ttsService and handler must already be in scope; this snippet does not
// define them.
config := stage.TTSConfig{
	Voice: "alloy",
	Speed: 1.0,
}

// handler is the interruption handler passed to the TTS stage.
ttsStage := stage.NewTTSStageWithInterruption(ttsService, handler, config)

DuplexProviderStage: bidirectional streaming for native audio LLMs.

// Build the duplex provider stage; session must already be in scope (this
// snippet does not define it).
duplexStage := stage.NewDuplexProviderStage(session)
// AudioData groups raw audio bytes with the sample rate, channel count, and
// format that describe them.
type AudioData struct {
	// Samples holds the raw audio bytes.
	Samples []byte

	// SampleRate is the sample rate in Hz.
	SampleRate int

	// Channels is the number of channels.
	Channels int

	// Format is the audio format (PCM16, etc.).
	Format AudioFormat
}
| Constant | Description |
| --- | --- |
| AudioFormatPCM16 | 16-bit signed PCM |
| AudioFormatPCM32 | 32-bit signed PCM |
| AudioFormatFloat32 | 32-bit float PCM |

The interruption handler coordinates interruption detection between AudioTurnStage and TTSStage.

// Create the interruption handler.
handler := audio.NewInterruptionHandler()
// It is used internally by the VAD pipeline and
// automatically handles barge-in detection.
// Sentinel errors of the audio API; match them with errors.Is.
var (
	// ErrAudioSessionClosed is the "audio session closed" error.
	ErrAudioSessionClosed = errors.New("audio session closed")

	// ErrInvalidSampleRate is the "invalid sample rate" error.
	ErrInvalidSampleRate = errors.New("invalid sample rate")

	// ErrInvalidChannels is the "invalid channel count" error.
	ErrInvalidChannels = errors.New("invalid channel count")
)