feat(examples): add Gemini multimodal live text example
Introduce a new example `26d-gemini-multimodal-live-text.py` to
demonstrate the use of GeminiMultimodalLiveLLMService with text-only
responses. This example sets up a pipeline for audio input via DailyTransport,
processing with Gemini, and output via Cartesia TTS.
imsakg committed Jan 7, 2025
1 parent d938ede commit eb9ec29
Showing 1 changed file with 85 additions and 0 deletions.
85 changes: 85 additions & 0 deletions examples/foundational/26d-gemini-multimodal-live-text.py
@@ -0,0 +1,85 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#

import asyncio
import os
import sys

import aiohttp
from dotenv import load_dotenv
from loguru import logger
from runner import configure

from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.audio.vad.vad_analyzer import VADParams
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.gemini_multimodal_live.gemini import GeminiMultimodalLiveLLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport

load_dotenv(override=True)

logger.remove(0)
logger.add(sys.stderr, level="DEBUG")


async def main():
    async with aiohttp.ClientSession() as session:
        (room_url, token) = await configure(session)

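        # The Daily transport joins the room, captures the user's microphone audio,
        # and plays back the audio produced further down the pipeline.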
        transport = DailyTransport(
            room_url,
            token,
            "Respond bot",
            DailyParams(
                audio_in_sample_rate=16000,
                audio_out_sample_rate=24000,
                audio_out_enabled=True,
                vad_enabled=True,
                vad_audio_passthrough=True,
                # Set stop_secs roughly in line with the Multimodal Live API's internal
                # setting, just to align events. This doesn't really matter because we
                # can only use the Multimodal Live API's phrase endpointing, for now.
                vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.5)),
            ),
        )

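        # Gemini Multimodal Live consumes the transport's audio frames directly,
        # so this pipeline needs no separate STT stage.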
        llm = GeminiMultimodalLiveLLMService(
            api_key=os.getenv("GOOGLE_API_KEY"),
            # system_instruction="Talk like a pirate."
        )
        llm.set_model_only_text()  # Force the model to produce text-only responses

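        # Because the model is restricted to text output above, a TTS service turns
        # Gemini's text responses into speech for the transport to play.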
        tts = CartesiaTTSService(
            api_key=os.getenv("CARTESIA_API_KEY"),
            voice_id=os.getenv("CARTESIA_VOICE_ID"),  # assumes this env var holds a valid Cartesia voice id
        )

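        # Frames flow top to bottom: audio in -> Gemini (text) -> Cartesia TTS -> audio out.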
        pipeline = Pipeline(
            [
                transport.input(),
                llm,
                tts,
                transport.output(),
            ]
        )

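        # Allow the user to interrupt the bot mid-response and collect usage metrics.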
        task = PipelineTask(
            pipeline,
            PipelineParams(
                allow_interruptions=True,
                enable_metrics=True,
                enable_usage_metrics=True,
            ),
        )

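        # Run the pipeline task until it finishes (e.g., when the session ends).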
        runner = PipelineRunner()

        await runner.run(task)


if __name__ == "__main__":
    asyncio.run(main())
