From eb9ec292c47dabf97126ee40335fb156e8215439 Mon Sep 17 00:00:00 2001
From: Mert Sefa AKGUN
Date: Tue, 7 Jan 2025 13:44:24 +0300
Subject: [PATCH] feat(examples): add Gemini multimodal live text example

Introduce a new example `26d-gemini-multimodal-live-text.py` to demonstrate
the use of GeminiMultimodalLiveLLMService with text-only responses. This
example sets up a pipeline for audio input via DailyTransport, processing
with Gemini, and output via Cartesia TTS.
---
 .../26d-gemini-multimodal-live-text.py        | 94 +++++++++++++++++++
 1 file changed, 94 insertions(+)
 create mode 100644 examples/foundational/26d-gemini-multimodal-live-text.py

diff --git a/examples/foundational/26d-gemini-multimodal-live-text.py b/examples/foundational/26d-gemini-multimodal-live-text.py
new file mode 100644
index 000000000..5e43e19bb
--- /dev/null
+++ b/examples/foundational/26d-gemini-multimodal-live-text.py
@@ -0,0 +1,94 @@
+#
+# Copyright (c) 2024, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+"""Example: Gemini Multimodal Live with text-only responses.
+
+Audio comes in over a Daily room, GeminiMultimodalLiveLLMService is forced
+into text-only output, and Cartesia TTS speaks the generated text.
+"""
+
+import asyncio
+import os
+import sys
+
+import aiohttp
+from dotenv import load_dotenv
+from loguru import logger
+from runner import configure
+
+from pipecat.audio.vad.silero import SileroVADAnalyzer
+from pipecat.audio.vad.vad_analyzer import VADParams
+from pipecat.pipeline.pipeline import Pipeline
+from pipecat.pipeline.runner import PipelineRunner
+from pipecat.pipeline.task import PipelineParams, PipelineTask
+from pipecat.services.cartesia import CartesiaTTSService
+from pipecat.services.gemini_multimodal_live.gemini import GeminiMultimodalLiveLLMService
+from pipecat.transports.services.daily import DailyParams, DailyTransport
+
+load_dotenv(override=True)
+
+logger.remove(0)
+logger.add(sys.stderr, level="DEBUG")
+
+
+async def main():
+    async with aiohttp.ClientSession() as session:
+        (room_url, token) = await configure(session)
+
+        transport = DailyTransport(
+            room_url,
+            token,
+            "Respond bot",
+            DailyParams(
+                audio_in_sample_rate=16000,
+                audio_out_sample_rate=24000,
+                audio_out_enabled=True,
+                vad_enabled=True,
+                vad_audio_passthrough=True,
+                # set stop_secs to something roughly similar to the internal setting
+                # of the Multimodal Live api, just to align events. This doesn't really
+                # matter because we can only use the Multimodal Live API's phrase
+                # endpointing, for now.
+                vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.5)),
+            ),
+        )
+
+        llm = GeminiMultimodalLiveLLMService(
+            api_key=os.getenv("GOOGLE_API_KEY"),
+            # system_instruction="Talk like a pirate."
+        )
+        llm.set_model_only_text()  # This forces model to produce text only responses
+
+        tts = CartesiaTTSService(
+            api_key=os.getenv("CARTESIA_API_KEY"),
+            voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22",  # British Reading Lady
+        )
+
+        pipeline = Pipeline(
+            [
+                transport.input(),
+                llm,
+                tts,
+                transport.output(),
+            ]
+        )
+
+        task = PipelineTask(
+            pipeline,
+            PipelineParams(
+                allow_interruptions=True,
+                enable_metrics=True,
+                enable_usage_metrics=True,
+            ),
+        )
+
+        runner = PipelineRunner()
+
+        await runner.run(task)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())