Skip to content

Tutorial 8: TTS Integration

Add text-to-speech capabilities to your conversations.

  • Setting up TTS providers (OpenAI, ElevenLabs, Cartesia)
  • Configuring voice, speed, and audio formats
  • Streaming vs single-shot synthesis
  • Integrating TTS with conversations

All TTS providers implement the same interface:

// Service is the interface every TTS provider (OpenAI, ElevenLabs,
// Cartesia) implements, so providers can be swapped without changing
// calling code.
type Service interface {
	// Name returns the provider's identifier.
	Name() string
	// Synthesize converts text to audio using config. The caller must
	// close the returned reader when done.
	Synthesize(ctx context.Context, text string, config SynthesisConfig) (io.ReadCloser, error)
	// SupportedVoices lists the voices this provider offers.
	SupportedVoices() []Voice
	// SupportedFormats lists the audio formats this provider can emit.
	SupportedFormats() []AudioFormat
}
import "github.com/AltairaLabs/PromptKit/runtime/tts"
// Create OpenAI TTS service; the API key is read from the
// OPENAI_API_KEY environment variable.
ttsService := tts.NewOpenAI(os.Getenv("OPENAI_API_KEY"))
// Available voices: alloy, echo, fable, onyx, nova, shimmer
// Available models: tts-1 (fast), tts-1-hd (high quality)
import "github.com/AltairaLabs/PromptKit/runtime/tts"
// Create ElevenLabs TTS service; the API key is read from the
// ELEVENLABS_API_KEY environment variable.
ttsService := tts.NewElevenLabs(os.Getenv("ELEVENLABS_API_KEY"))
// Wide variety of voices available
// Check SupportedVoices() for options
import "github.com/AltairaLabs/PromptKit/runtime/tts"
// Create Cartesia TTS service; the API key is read from the
// CARTESIA_API_KEY environment variable.
ttsService := tts.NewCartesia(os.Getenv("CARTESIA_API_KEY"))
// Supports interactive streaming mode for low latency
// Full synthesis configuration. Voice and Format are the essentials;
// the remaining fields are provider-specific tuning knobs.
config := tts.SynthesisConfig{
	Voice:    "nova",        // Voice ID
	Format:   tts.FormatMP3, // Output format
	Speed:    1.0,           // Speech rate (0.25-4.0)
	Pitch:    0,             // Pitch adjustment (-20 to 20)
	Language: "en-US",       // Language code
	Model:    "tts-1-hd",    // Model (provider-specific)
}
| Format | Constant          | Use Case             |
|--------|-------------------|----------------------|
| MP3    | `tts.FormatMP3`   | Most compatible      |
| Opus   | `tts.FormatOpus`  | Best for streaming   |
| AAC    | `tts.FormatAAC`   | Apple devices        |
| FLAC   | `tts.FormatFLAC`  | Lossless quality     |
| PCM    | `tts.FormatPCM16` | Raw audio processing |
| WAV    | `tts.FormatWAV`   | PCM with header      |
ctx := context.Background()

config := tts.SynthesisConfig{
	Voice:  "alloy",
	Format: tts.FormatMP3,
	Speed:  1.0,
}

// Synthesize text to audio.
reader, err := ttsService.Synthesize(ctx, "Hello, how can I help you?", config)
if err != nil {
	log.Fatal(err)
}
defer reader.Close()

// Read audio data. Check the error instead of discarding it — the
// reader streams from the provider and can fail mid-read.
audioData, err := io.ReadAll(reader)
if err != nil {
	log.Fatal(err)
}

// Play or save audioData...

For lower latency, use streaming synthesis (if supported):

// Check if the provider supports streaming; not all Service
// implementations also satisfy tts.StreamingService.
streamingService, ok := ttsService.(tts.StreamingService)
if !ok {
	log.Fatal("Provider doesn't support streaming")
}

// Start streaming synthesis; chunks is a channel of audio fragments.
chunks, err := streamingService.SynthesizeStream(ctx, "Hello, how can I help you?", config)
if err != nil {
	log.Fatal(err)
}

// Process chunks as they arrive for low-latency playback.
for chunk := range chunks {
	if chunk.Error != nil {
		log.Printf("Error: %v", chunk.Error)
		break
	}
	playAudioChunk(chunk.Data)
	// Final marks the last chunk of the utterance.
	if chunk.Final {
		break
	}
}
sttService := stt.NewOpenAI(os.Getenv("OPENAI_API_KEY"))
ttsService := tts.NewOpenAI(os.Getenv("OPENAI_API_KEY"))

// Configure VAD mode with custom TTS settings.
vadConfig := &sdk.VADModeConfig{
	Voice: "nova", // Use Nova voice
	Speed: 1.1,    // Slightly faster
}

// Handle the error instead of discarding it — a missing pack file or
// bad config would otherwise surface later as a nil conversation.
conv, err := sdk.OpenDuplex("./assistant.pack.json", "voice",
	sdk.WithVADMode(sttService, ttsService, vadConfig),
)
if err != nil {
	log.Fatal(err)
}
// Open text conversation. Errors are handled explicitly here — the
// original example discarded them, which hides failures like a
// missing pack file or an API error.
conv, err := sdk.Open("./assistant.pack.json", "chat")
if err != nil {
	log.Fatal(err)
}

// Send message and get response.
resp, err := conv.Send(ctx, "Tell me a joke")
if err != nil {
	log.Fatal(err)
}

// Synthesize the response.
ttsService := tts.NewOpenAI(os.Getenv("OPENAI_API_KEY"))
reader, err := ttsService.Synthesize(ctx, resp.Text(), tts.DefaultSynthesisConfig())
if err != nil {
	log.Fatal(err)
}
defer reader.Close()

// Play the audio.
audioData, err := io.ReadAll(reader)
if err != nil {
	log.Fatal(err)
}
playAudio(audioData)
voices := ttsService.SupportedVoices()
for _, voice := range voices {
fmt.Printf("%s: %s (%s, %s)\n",
voice.ID,
voice.Name,
voice.Language,
voice.Gender,
)
}
| Voice   | Character            |
|---------|----------------------|
| alloy   | Neutral, versatile   |
| echo    | Warm, smooth         |
| fable   | Expressive, British  |
| onyx    | Deep, authoritative  |
| nova    | Friendly, youthful   |
| shimmer | Clear, professional  |
reader, err := ttsService.Synthesize(ctx, text, config)
if err != nil {
	// Classify the failure with errors.Is so wrapped errors still
	// match the package's sentinel values.
	switch {
	case errors.Is(err, tts.ErrInvalidVoice):
		log.Printf("Voice '%s' not supported", config.Voice)
	case errors.Is(err, tts.ErrRateLimited):
		// Transient: back off briefly before retrying.
		log.Printf("Rate limited, retrying...")
		time.Sleep(time.Second)
		// Retry...
	case errors.Is(err, tts.ErrTextTooLong):
		// Permanent for this input: split the text before retrying.
		log.Printf("Text exceeds maximum length")
	default:
		log.Printf("Synthesis failed: %v", err)
	}
	return
}

For repeated phrases, cache synthesized audio:

// cache maps text+voice+format to previously synthesized audio.
// NOTE(review): a plain map is not safe for concurrent use — guard it
// with a sync.Mutex if this can be called from multiple goroutines.
var cache = make(map[string][]byte)

// synthesizeWithCache returns cached audio for (text, voice, format),
// synthesizing and caching on a miss. It assumes ctx and ttsService
// are in scope at package level (see the setup snippets above).
func synthesizeWithCache(text string, config tts.SynthesisConfig) ([]byte, error) {
	// Join key parts with a separator so distinct inputs cannot
	// collide (bare concatenation makes "ab"+"c" == "a"+"bc").
	key := text + "\x00" + config.Voice + "\x00" + config.Format.Name
	if cached, ok := cache[key]; ok {
		return cached, nil
	}

	reader, err := ttsService.Synthesize(ctx, text, config)
	if err != nil {
		return nil, err
	}
	defer reader.Close()

	data, err := io.ReadAll(reader)
	if err != nil {
		return nil, err
	}

	cache[key] = data
	return data, nil
}

For common responses, synthesize in advance:

greetings := []string{
	"Hello! How can I help you today?",
	"I'm sorry, I didn't catch that.",
	"Is there anything else I can help with?",
}

// Pre-synthesize each greeting. Check errors — the original example
// discarded them, which would dereference a nil reader on failure.
for _, text := range greetings {
	reader, err := ttsService.Synthesize(ctx, text, config)
	if err != nil {
		log.Printf("pre-synthesis of %q failed: %v", text, err)
		continue
	}
	data, err := io.ReadAll(reader)
	reader.Close()
	if err != nil {
		log.Printf("reading audio for %q failed: %v", text, err)
		continue
	}
	cache[text] = data
}
  1. Voice Consistency: Use the same voice throughout a conversation
  2. Speed Adjustment: Slower for complex info, faster for casual chat
  3. Format Selection: Use Opus for streaming, MP3 for storage
  4. Error Handling: Gracefully handle synthesis failures
  5. Resource Cleanup: Always close readers when done
| Provider   | Pricing Model         |
|------------|-----------------------|
| OpenAI     | Per character         |
| ElevenLabs | Per character (tiers) |
| Cartesia   | Per character         |

Estimate costs before production deployment.