Skip to content

TTS API Reference

Complete reference for text-to-speech services.

// Service is the common interface returned by the provider constructors
// (NewOpenAI, NewElevenLabs, NewCartesia).
type Service interface {
	// Name returns the provider identifier (e.g., "openai", "elevenlabs").
	Name() string
	// Synthesize converts text to audio and returns a reader for streaming
	// the audio data. The caller is responsible for closing the reader.
	Synthesize(ctx context.Context, text string, config SynthesisConfig) (io.ReadCloser, error)
	// SupportedVoices returns the voices available for this provider.
	SupportedVoices() []Voice
	// SupportedFormats returns the supported audio output formats.
	SupportedFormats() []AudioFormat
}
func (s Service) Name() string

Returns the provider identifier (e.g., “openai”, “elevenlabs”).

func (s Service) Synthesize(ctx context.Context, text string, config SynthesisConfig) (io.ReadCloser, error)

Converts text to audio. Returns a reader for streaming audio data. The caller is responsible for closing the reader.

func (s Service) SupportedVoices() []Voice

Returns available voices for this provider.

func (s Service) SupportedFormats() []AudioFormat

Returns supported audio output formats.

// StreamingService extends Service with streaming synthesis for lower latency.
type StreamingService interface {
	// Service is embedded, so every StreamingService is also a Service.
	Service
	// SynthesizeStream synthesizes text and delivers the audio as AudioChunk
	// values on the returned receive-only channel.
	SynthesizeStream(ctx context.Context, text string, config SynthesisConfig) (<-chan AudioChunk, error)
}

Extends Service with streaming synthesis capabilities for lower latency.

// SynthesisConfig carries the per-request options passed to Synthesize and
// SynthesizeStream.
type SynthesisConfig struct {
	Voice string // Voice ID (provider-specific)
	Format AudioFormat // Output audio format
	Speed float64 // Speech rate multiplier (0.25-4.0)
	Pitch float64 // Pitch adjustment in semitones (-20 to 20)
	Language string // Language code (e.g., "en-US")
	Model string // TTS model (provider-specific, e.g., "tts-1-hd")
}

| Field | Type | Default | Description |
|-------|------|---------|-------------|
| Voice | string | "alloy" | Voice ID (provider-specific) |
| Format | AudioFormat | MP3 | Output audio format |
| Speed | float64 | 1.0 | Speech rate multiplier |
| Pitch | float64 | 0 | Pitch adjustment in semitones |
| Language | string | "" | Language code (e.g., "en-US") |
| Model | string | "" | TTS model (e.g., "tts-1-hd") |

func DefaultSynthesisConfig() SynthesisConfig

Returns sensible defaults for synthesis.

// Voice describes one voice offered by a provider, as returned by
// Service.SupportedVoices.
type Voice struct {
	ID string // Provider-specific identifier
	Name string // Human-readable name
	Language string // Primary language code
	Gender string // One of "male", "female", "neutral"
	Description string // Voice characteristics
	Preview string // URL to a sample of the voice
}
// AudioFormat describes an audio output format, as returned by
// Service.SupportedFormats and selected via SynthesisConfig.Format.
type AudioFormat struct {
	Name string // Format identifier
	MIMEType string // Content type
	SampleRate int // Sample rate in Hz
	BitDepth int // Bits per sample
	Channels int // Number of channels
}
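
| Constant | Name | MIME Type | Use Case |
|----------|------|-----------|----------|
| FormatMP3 | mp3 | audio/mpeg | Most compatible |
| FormatOpus | opus | audio/opus | Best for streaming |
| FormatAAC | aac | audio/aac | Apple devices |
| FormatFLAC | flac | audio/flac | Lossless quality |
| FormatPCM16 | pcm | audio/pcm | Raw processing |
| FormatWAV | wav | audio/wav | PCM with header |
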
// AudioChunk is one element of the channel returned by
// StreamingService.SynthesizeStream.
type AudioChunk struct {
	Data []byte // Raw audio bytes
	Index int // Chunk sequence number
	Final bool // Last chunk indicator
	Error error // Error during synthesis
}
func NewOpenAI(apiKey string) Service

Creates an OpenAI TTS service.

Voices:

| ID | Character |
|----|-----------|
| alloy | Neutral, versatile |
| echo | Warm, smooth |
| fable | Expressive, British |
| onyx | Deep, authoritative |
| nova | Friendly, youthful |
| shimmer | Clear, professional |

Models:

  • tts-1: Fast, optimized for real-time
  • tts-1-hd: High quality, longer latency

Example:

// Build an OpenAI-backed service and synthesize with the high-quality model.
service := tts.NewOpenAI(os.Getenv("OPENAI_API_KEY"))
config := tts.SynthesisConfig{
	Voice:  "nova",
	Format: tts.FormatMP3,
	Model:  "tts-1-hd",
}
// Check the error instead of discarding it: the reader is only usable when
// err is nil.
reader, err := service.Synthesize(ctx, "Hello world", config)
if err != nil {
	log.Fatal(err)
}
// The caller is responsible for closing the returned reader.
defer reader.Close()
func NewElevenLabs(apiKey string) Service

Creates an ElevenLabs TTS service.

Features:

  • Wide variety of voices
  • Voice cloning support
  • Multilingual support

Example:

service := tts.NewElevenLabs(os.Getenv("ELEVENLABS_API_KEY"))
// Print the ID and name of every voice the provider reports.
for _, voice := range service.SupportedVoices() {
	fmt.Printf("%s: %s\n", voice.ID, voice.Name)
}
func NewCartesia(apiKey string) Service

Creates a Cartesia TTS service.

Features:

  • Ultra-low latency
  • Interactive streaming mode
  • Emotion control

Example:

// Build a Cartesia-backed service using the API key from the environment.
service := tts.NewCartesia(os.Getenv("CARTESIA_API_KEY"))
// Sentinel errors declared by the package.
// NOTE(review): which calls return these, and whether they are wrapped, is
// not shown here — confirm before comparing with == rather than errors.Is.
var (
	// ErrInvalidVoice reports an invalid voice.
	ErrInvalidVoice = errors.New("invalid voice")
	// ErrInvalidFormat reports an unsupported audio format.
	ErrInvalidFormat = errors.New("unsupported format")
	// ErrTextTooLong reports input text that exceeds the maximum length.
	ErrTextTooLong = errors.New("text exceeds maximum length")
	// ErrRateLimited reports that the request was rate limited.
	ErrRateLimited = errors.New("rate limited")
	// ErrServiceDown reports that the service is unavailable.
	ErrServiceDown = errors.New("service unavailable")
)
// Basic usage: synthesize with the default configuration and read the whole
// result into memory.
service := tts.NewOpenAI(apiKey)
reader, err := service.Synthesize(ctx, "Hello!", tts.DefaultSynthesisConfig())
if err != nil {
	log.Fatal(err)
}
// The caller is responsible for closing the returned reader.
defer reader.Close()

// A read failure mid-stream would otherwise yield silently truncated audio,
// so the io.ReadAll error is checked rather than discarded.
data, err := io.ReadAll(reader)
if err != nil {
	log.Fatal(err)
}
// Use audio data...
// Streaming usage: synthesize through the optional StreamingService interface.
service := tts.NewCartesia(apiKey)
// The constructor returns a plain Service, so assert that this provider also
// implements tts.StreamingService before streaming.
streamingService, ok := service.(tts.StreamingService)
if !ok {
	log.Fatal("Provider doesn't support streaming")
}
// NOTE(review): config is not defined in this snippet; presumably a
// tts.SynthesisConfig built beforehand — confirm.
chunks, err := streamingService.SynthesizeStream(ctx, "Hello world!", config)
if err != nil {
	log.Fatal(err)
}
// Consume chunks until the channel is closed or a chunk carries an error.
for chunk := range chunks {
	if chunk.Error != nil {
		log.Printf("Error: %v", chunk.Error)
		break
	}
	// playAudio is not defined here; presumably a caller-supplied playback helper.
	playAudio(chunk.Data)
}
// Custom configuration: every SynthesisConfig field set explicitly.
config := tts.SynthesisConfig{
	Voice:    "onyx",
	Format:   tts.FormatOpus,
	Speed:    0.9, // Slightly slower
	Pitch:    -2,  // Slightly lower
	Language: "en-US",
	Model:    "tts-1-hd",
}
// Check the error instead of discarding it: the reader is only usable when
// err is nil.
reader, err := service.Synthesize(ctx, text, config)
if err != nil {
	log.Fatal(err)
}
// The caller is responsible for closing the returned reader.
defer reader.Close()