Skip to content

Google Synthesizer

Google Gemini TTS synthesizer.

google

Google Gemini TTS synthesizer.

GoogleSynthesizer

GoogleSynthesizer(*, voice: str = 'Kore', model: str = 'gemini-2.5-flash-preview-tts', api_key: str | None = None, vertexai: bool = False, project: str | None = None, location: str | None = None, audio_format: Literal['wav', 'mp3', 'pcm', 'ogg'] = 'wav', sample_rate: int = 24000)

Synthesizes audio from text using Google Gemini's TTS.

Supports two explicit auth modes (when neither is given, credentials are auto-detected from the environment):

  1. Google AI API (api_key): synth = GoogleSynthesizer(api_key="AIza...")

  2. Vertex AI (ADC / GOOGLE_APPLICATION_CREDENTIALS): synth = GoogleSynthesizer(vertexai=True, project="my-proj", location="us-central1")

Source code in src/russo/synthesizers/google.py
def __init__(
    self,
    *,
    voice: str = "Kore",
    model: str = "gemini-2.5-flash-preview-tts",
    api_key: str | None = None,
    vertexai: bool = False,
    project: str | None = None,
    location: str | None = None,
    audio_format: Literal["wav", "mp3", "pcm", "ogg"] = "wav",
    sample_rate: int = 24000,
) -> None:
    """Store the synthesis settings and build the Gemini client.

    Client selection, in order of precedence:

    1. ``api_key`` argument -> Google AI API client.
    2. ``vertexai=True`` -> Vertex AI client (``location`` defaults to
       ``"us-central1"``).
    3. ``GOOGLE_API_KEY`` environment variable -> Google AI API client.
    4. A project (argument, ``GOOGLE_CLOUD_PROJECT`` or ``GOOGLE_PROJECT_ID``)
       AND a location (argument or ``GOOGLE_CLOUD_LOCATION``) -> Vertex AI client.
    5. Otherwise ``genai.Client()`` with no arguments, leaving resolution
       to the SDK.

    Args:
        voice: Prebuilt voice name.
        model: TTS model identifier.
        api_key: Google AI API key; takes precedence over everything else.
        vertexai: Force the Vertex AI client.
        project: Cloud project for Vertex AI.
        location: Cloud region for Vertex AI.
        audio_format: Format label stored on the instance.
        sample_rate: Sample rate stored on the instance.

    Side effects:
        On every path that does not use an API key, a leading ``~`` in
        ``GOOGLE_APPLICATION_CREDENTIALS`` is expanded in ``os.environ``.
    """
    self.voice = voice
    self.model = model
    self.audio_format = audio_format
    self.sample_rate = sample_rate

    if api_key:
        self._client = genai.Client(api_key=api_key)
        return

    if not vertexai:
        # Auto-detect: an API key in the environment wins over Vertex settings.
        env_api_key = os.environ.get("GOOGLE_API_KEY")
        if env_api_key:
            self._client = genai.Client(api_key=env_api_key)
            return

    # Every remaining path may authenticate through application default
    # credentials, and google-auth does not expand ~ in credential paths.
    # Normalise once here so the explicit Vertex path and the SDK fallback
    # behave the same as the auto-detected Vertex path.
    creds = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
    if creds:
        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = os.path.expanduser(creds)

    if vertexai:
        self._client = genai.Client(
            vertexai=True,
            project=project,
            location=location or "us-central1",
        )
        return

    # Auto-detected Vertex requires both a project AND a location.
    _project = project or os.environ.get("GOOGLE_CLOUD_PROJECT") or os.environ.get("GOOGLE_PROJECT_ID")
    _location = location or os.environ.get("GOOGLE_CLOUD_LOCATION")
    if _project and _location:
        self._client = genai.Client(vertexai=True, project=_project, location=_location)
    else:
        # Fall back: let the SDK resolve from env (GOOGLE_API_KEY, ADC, etc.)
        self._client = genai.Client()

synthesize async

synthesize(text: str) -> Audio

Convert text to audio using Gemini TTS.

Source code in src/russo/synthesizers/google.py
async def synthesize(self, text: str) -> Audio:
    """Convert text to audio using Gemini TTS.

    Args:
        text: The text to be spoken.

    Returns:
        An ``Audio`` holding the concatenated inline audio bytes of the
        first response candidate, labelled with ``self.audio_format`` and
        ``self.sample_rate``. No transcoding is done here. ``data`` is
        empty when the response has no candidates, or when the first
        candidate has no content, no parts, or no inline audio.
    """
    # Use explicit Content with text Part so the backend treats this as text-to-speech
    # (raw string can be interpreted as non-audio request and trigger 1007).
    contents = types.Content(
        role="user",
        parts=[types.Part.from_text(text=text)],
    )
    response = await self._client.aio.models.generate_content(
        model=self.model,
        contents=contents,
        config=types.GenerateContentConfig(
            response_modalities=["AUDIO"],
            speech_config=types.SpeechConfig(
                voice_config=types.VoiceConfig(
                    prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name=self.voice),
                ),
            ),
        ),
    )

    chunks: list[bytes] = []
    if response.candidates:
        # A candidate can come back without content or parts; treat that
        # like "no audio" rather than failing on attribute access.
        content = response.candidates[0].content
        parts = content.parts if content is not None else None
        for part in parts or ():
            inline = part.inline_data
            if inline and inline.data:
                chunks.append(inline.data)

    return Audio(
        # Single join instead of repeated bytes concatenation.
        data=b"".join(chunks),
        format=self.audio_format,
        sample_rate=self.sample_rate,
    )