From 5cbd71978070baae278e8150e61c4f64838c053a Mon Sep 17 00:00:00 2001 From: Mert Sefa AKGUN Date: Mon, 6 Jan 2025 18:05:31 +0300 Subject: [PATCH 1/9] feat(gemini): add text handling to GeminiMultimodalLive - Introduce text attribute in Part class for handling string data. - Incorporate text processing in GeminiMultimodalLiveLLMService to push TextFrame if text is present. --- src/pipecat/services/gemini_multimodal_live/events.py | 1 + src/pipecat/services/gemini_multimodal_live/gemini.py | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/src/pipecat/services/gemini_multimodal_live/events.py b/src/pipecat/services/gemini_multimodal_live/events.py index 0d5bc802f..36541aa30 100644 --- a/src/pipecat/services/gemini_multimodal_live/events.py +++ b/src/pipecat/services/gemini_multimodal_live/events.py @@ -105,6 +105,7 @@ class InlineData(BaseModel): class Part(BaseModel): inlineData: Optional[InlineData] = None + text: Optional[str] = None class ModelTurn(BaseModel): diff --git a/src/pipecat/services/gemini_multimodal_live/gemini.py b/src/pipecat/services/gemini_multimodal_live/gemini.py index dd4375486..dea63b3c8 100644 --- a/src/pipecat/services/gemini_multimodal_live/gemini.py +++ b/src/pipecat/services/gemini_multimodal_live/gemini.py @@ -604,6 +604,11 @@ async def _handle_evt_model_turn(self, evt): part = evt.serverContent.modelTurn.parts[0] if not part: return + + text = part.text + if text: + await self.push_frame(TextFrame(text=text)) + inline_data = part.inlineData if not inline_data: return From c72c3025f6608a8d7785415bf86d79e32a7288d5 Mon Sep 17 00:00:00 2001 From: Mert Sefa AKGUN Date: Mon, 6 Jan 2025 18:07:59 +0300 Subject: [PATCH 2/9] feat(gemini): add configuration methods for response modalities - Introduce `set_model_only_audio` and `set_model_only_text` methods to toggle between audio-only and text-only response modes in `GeminiMultimodalLiveLLMService`. - Refactor configuration setup to a class attribute for improved reusability and maintenance. - Remove redundant configuration instantiation in the WebSocket connection setup process. 
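For illustration, a minimal usage sketch of the new toggles; it assumes the service is constructed as in the foundational examples and that both calls happen before the pipeline connects, since the stored config is only sent once during WebSocket setup:

    import os

    from pipecat.services.gemini_multimodal_live.gemini import GeminiMultimodalLiveLLMService

    # The Setup/Config payload is now built once in __init__ and kept on self.config.
    llm = GeminiMultimodalLiveLLMService(api_key=os.getenv("GOOGLE_API_KEY"))

    # Text-only responses: response_modalities -> ["TEXT"], speech_config cleared.
    llm.set_model_only_text()

    # Audio-only responses: response_modalities -> ["AUDIO"], speech_config restored
    # with the service's voice_id.
    llm.set_model_only_audio()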
--- .../services/gemini_multimodal_live/gemini.py | 59 +++++++++++-------- 1 file changed, 35 insertions(+), 24 deletions(-) diff --git a/src/pipecat/services/gemini_multimodal_live/gemini.py b/src/pipecat/services/gemini_multimodal_live/gemini.py index dea63b3c8..687c63c0b 100644 --- a/src/pipecat/services/gemini_multimodal_live/gemini.py +++ b/src/pipecat/services/gemini_multimodal_live/gemini.py @@ -199,6 +199,28 @@ def __init__( "extra": params.extra if isinstance(params.extra, dict) else {}, } + self.config = events.Config.model_validate( + { + "setup": { + "model": self._model_name, + "generation_config": { + "frequency_penalty": self._settings["frequency_penalty"], + "max_output_tokens": self._settings["max_tokens"], # Not supported yet + "presence_penalty": self._settings["presence_penalty"], + "temperature": self._settings["temperature"], + "top_k": self._settings["top_k"], + "top_p": self._settings["top_p"], + "response_modalities": ["AUDIO"], + "speech_config": { + "voice_config": { + "prebuilt_voice_config": {"voice_name": self._voice_id} + }, + }, + }, + }, + } + ) + def can_generate_metrics(self) -> bool: return True @@ -208,6 +230,16 @@ def set_audio_input_paused(self, paused: bool): def set_video_input_paused(self, paused: bool): self._video_input_paused = paused + def set_model_only_audio(self): + self.config.setup.generation_config["response_modalities"] = ["AUDIO"] + self.config.setup.generation_config["speech_config"] = { + "voice_config": {"prebuilt_voice_config": {"voice_name": self._voice_id}} + } + + def set_model_only_text(self): + self.config.setup.generation_config["response_modalities"] = ["TEXT"] + self.config.setup.generation_config["speech_config"] = None + async def set_context(self, context: OpenAILLMContext): """Set the context explicitly from outside the pipeline. 
@@ -372,39 +404,18 @@ async def _connect(self): logger.info(f"Connecting to {uri}") self._websocket = await websockets.connect(uri=uri) self._receive_task = self.get_event_loop().create_task(self._receive_task_handler()) - config = events.Config.model_validate( - { - "setup": { - "model": self._model_name, - "generation_config": { - "frequency_penalty": self._settings["frequency_penalty"], - "max_output_tokens": self._settings["max_tokens"], # Not supported yet - "presence_penalty": self._settings["presence_penalty"], - "temperature": self._settings["temperature"], - "top_k": self._settings["top_k"], - "top_p": self._settings["top_p"], - "response_modalities": ["AUDIO"], - "speech_config": { - "voice_config": { - "prebuilt_voice_config": {"voice_name": self._voice_id} - }, - }, - }, - }, - } - ) system_instruction = self._system_instruction or "" if self._context and hasattr(self._context, "extract_system_instructions"): system_instruction += "\n" + self._context.extract_system_instructions() if system_instruction: logger.debug(f"Setting system instruction: {system_instruction}") - config.setup.system_instruction = events.SystemInstruction( + self.config.setup.system_instruction = events.SystemInstruction( parts=[events.ContentPart(text=system_instruction)] ) if self._tools: - config.setup.tools = self._tools - await self.send_client_event(config) + self.config.setup.tools = self._tools + await self.send_client_event(self.config) except Exception as e: logger.error(f"{self} initialization error: {e}") From cdb909958c33c7e42567b305aabae4c592676da3 Mon Sep 17 00:00:00 2001 From: Mert Sefa AKGUN Date: Tue, 7 Jan 2025 13:44:24 +0300 Subject: [PATCH 3/9] feat(examples): add Gemini multimodal live text example Introduce a new example `26d-gemini-multimodal-live-text.py` to demonstrate the use of GeminiMultimodalLiveLLMService with text-only responses. This example sets up a pipeline for audio input via DailyTransport, processing with Gemini, and output via Cartesia TTS. 
--- .../26d-gemini-multimodal-live-text.py | 85 +++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 examples/foundational/26d-gemini-multimodal-live-text.py diff --git a/examples/foundational/26d-gemini-multimodal-live-text.py b/examples/foundational/26d-gemini-multimodal-live-text.py new file mode 100644 index 000000000..5e43e19bb --- /dev/null +++ b/examples/foundational/26d-gemini-multimodal-live-text.py @@ -0,0 +1,85 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +import asyncio +import os +import sys + +import aiohttp +from dotenv import load_dotenv +from loguru import logger +from runner import configure + +from pipecat.audio.vad.silero import SileroVADAnalyzer +from pipecat.audio.vad.vad_analyzer import VADParams +from pipecat.pipeline.pipeline import Pipeline +from pipecat.pipeline.runner import PipelineRunner +from pipecat.pipeline.task import PipelineParams, PipelineTask +from pipecat.services.gemini_multimodal_live.gemini import GeminiMultimodalLiveLLMService +from agent.services.tts.cartesia_multilingual import CartesiaMultiLingualTTSService +from pipecat.transports.services.daily import DailyParams, DailyTransport + +load_dotenv(override=True) + +logger.remove(0) +logger.add(sys.stderr, level="DEBUG") + + +async def main(): + async with aiohttp.ClientSession() as session: + (room_url, token) = await configure(session) + + transport = DailyTransport( + room_url, + token, + "Respond bot", + DailyParams( + audio_in_sample_rate=16000, + audio_out_sample_rate=24000, + audio_out_enabled=True, + vad_enabled=True, + vad_audio_passthrough=True, + # set stop_secs to something roughly similar to the internal setting + # of the Multimodal Live api, just to align events. This doesn't really + # matter because we can only use the Multimodal Live API's phrase + # endpointing, for now. + vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.5)), + ), + ) + + llm = GeminiMultimodalLiveLLMService( + api_key=os.getenv("GOOGLE_API_KEY"), + # system_instruction="Talk like a pirate." + ) + llm.set_model_only_text() # This forces model to produce text only responses + + tts = CartesiaMultiLingualTTSService(api_key=os.getenv("CARTESIA_API_KEY")) + + pipeline = Pipeline( + [ + transport.input(), + llm, + tts, + transport.output(), + ] + ) + + task = PipelineTask( + pipeline, + PipelineParams( + allow_interruptions=True, + enable_metrics=True, + enable_usage_metrics=True, + ), + ) + + runner = PipelineRunner() + + await runner.run(task) + + +if __name__ == "__main__": + asyncio.run(main()) From 12ae980abe2783124e4f91e791e634bb5f148cab Mon Sep 17 00:00:00 2001 From: Mert Sefa AKGUN Date: Wed, 8 Jan 2025 16:30:56 +0300 Subject: [PATCH 4/9] feat(gemini): handle full text response in GeminiMultimodalLive - Add a buffer to store bot text responses. - Push a `LLMFullResponseStartFrame` when text begins. - Clear the text buffer and send `LLMFullResponseEndFrame` after processing. 
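For context, a text turn now reaches downstream processors as LLMFullResponseStartFrame, one or more TextFrame chunks, then LLMFullResponseEndFrame on turn completion. A sketch of a consumer, assuming pipecat's standard FrameProcessor API; the aggregation logic is illustrative, not part of this patch:

    from pipecat.frames.frames import (
        LLMFullResponseEndFrame,
        LLMFullResponseStartFrame,
        TextFrame,
    )
    from pipecat.processors.frame_processor import FrameDirection, FrameProcessor


    class TextResponseCollector(FrameProcessor):
        """Accumulates streamed text between the start/end frames."""

        def __init__(self):
            super().__init__()
            self._buffer = ""

        async def process_frame(self, frame, direction: FrameDirection):
            await super().process_frame(frame, direction)
            if isinstance(frame, LLMFullResponseStartFrame):
                self._buffer = ""
            elif isinstance(frame, TextFrame):
                self._buffer += frame.text
            elif isinstance(frame, LLMFullResponseEndFrame):
                print(f"Full text response: {self._buffer}")
            # Always forward the frame so the rest of the pipeline still sees it.
            await self.push_frame(frame, direction)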
--- src/pipecat/services/gemini_multimodal_live/gemini.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/pipecat/services/gemini_multimodal_live/gemini.py b/src/pipecat/services/gemini_multimodal_live/gemini.py index 687c63c0b..ea28a2337 100644 --- a/src/pipecat/services/gemini_multimodal_live/gemini.py +++ b/src/pipecat/services/gemini_multimodal_live/gemini.py @@ -188,6 +188,7 @@ def __init__( self._bot_is_speaking = False self._user_audio_buffer = bytearray() self._bot_audio_buffer = bytearray() + self._bot_text_buffer = "" self._settings = { "frequency_penalty": params.frequency_penalty, @@ -618,6 +619,10 @@ async def _handle_evt_model_turn(self, evt): text = part.text if text: + if not self._bot_text_buffer: + await self.push_frame(LLMFullResponseStartFrame()) + + self._bot_text_buffer += text await self.push_frame(TextFrame(text=text)) inline_data = part.inlineData @@ -660,9 +665,15 @@ async def _handle_evt_tool_call(self, evt): async def _handle_evt_turn_complete(self, evt): self._bot_is_speaking = False audio = self._bot_audio_buffer + text = self._bot_text_buffer self._bot_audio_buffer = bytearray() + self._bot_text_buffer = "" + if audio and self._transcribe_model_audio and self._context: asyncio.create_task(self._handle_transcribe_model_audio(audio, self._context)) + elif text: + await self.push_frame(LLMFullResponseEndFrame()) + await self.push_frame(TTSStoppedFrame()) def create_context_aggregator( From b42d3a82578365bee582aa4117b5c15b0443c8cf Mon Sep 17 00:00:00 2001 From: Mert Sefa AKGUN Date: Wed, 8 Jan 2025 17:38:53 +0300 Subject: [PATCH 5/9] feat(gemini): add modality configuration for GeminiMultimodalLive - Introduce `GeminiMultimodalModalities` enum for modality options. - Add modality field to `InputParams`, defaulting to text. - Simplify modality setup with `set_model_modalities` method. - Refactor WebSocket configuration to support dynamic response modalities. 
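For illustration, the two ways the new option might be exercised, using only the names introduced in this patch (a sketch; both assume the modality is chosen before the WebSocket config is sent):

    import os

    from pipecat.services.gemini_multimodal_live.gemini import (
        GeminiMultimodalLiveLLMService,
        GeminiMultimodalModalities,
        InputParams,
    )

    # Option 1: select the modality at construction time via InputParams.
    llm = GeminiMultimodalLiveLLMService(
        api_key=os.getenv("GOOGLE_API_KEY"),
        params=InputParams(modalities=GeminiMultimodalModalities.TEXT),
    )

    # Option 2: change it afterwards; this only updates self._settings, so it must
    # happen before _connect() builds and sends the Config payload.
    llm.set_model_modalities(GeminiMultimodalModalities.AUDIO)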
--- .../services/gemini_multimodal_live/gemini.py | 70 ++++++++++--------- 1 file changed, 36 insertions(+), 34 deletions(-) diff --git a/src/pipecat/services/gemini_multimodal_live/gemini.py b/src/pipecat/services/gemini_multimodal_live/gemini.py index ea28a2337..761d1224d 100644 --- a/src/pipecat/services/gemini_multimodal_live/gemini.py +++ b/src/pipecat/services/gemini_multimodal_live/gemini.py @@ -6,6 +6,7 @@ import asyncio import base64 +from enum import Enum import json from dataclasses import dataclass from typing import Any, Dict, List, Optional @@ -132,6 +133,11 @@ def assistant(self) -> GeminiMultimodalLiveAssistantContextAggregator: return self._assistant +class GeminiMultimodalModalities(Enum): + TEXT = "TEXT" + AUDIO = "AUDIO" + + class InputParams(BaseModel): frequency_penalty: Optional[float] = Field(default=None, ge=0.0, le=2.0) max_tokens: Optional[int] = Field(default=4096, ge=1) @@ -139,6 +145,9 @@ class InputParams(BaseModel): temperature: Optional[float] = Field(default=None, ge=0.0, le=2.0) top_k: Optional[int] = Field(default=None, ge=0) top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0) + modalities: Optional[GeminiMultimodalModalities] = Field( + default=GeminiMultimodalModalities.TEXT + ) extra: Optional[Dict[str, Any]] = Field(default_factory=dict) @@ -197,31 +206,10 @@ def __init__( "temperature": params.temperature, "top_k": params.top_k, "top_p": params.top_p, + "modalities": params.modalities, "extra": params.extra if isinstance(params.extra, dict) else {}, } - self.config = events.Config.model_validate( - { - "setup": { - "model": self._model_name, - "generation_config": { - "frequency_penalty": self._settings["frequency_penalty"], - "max_output_tokens": self._settings["max_tokens"], # Not supported yet - "presence_penalty": self._settings["presence_penalty"], - "temperature": self._settings["temperature"], - "top_k": self._settings["top_k"], - "top_p": self._settings["top_p"], - "response_modalities": ["AUDIO"], - "speech_config": { - "voice_config": { - "prebuilt_voice_config": {"voice_name": self._voice_id} - }, - }, - }, - }, - } - ) - def can_generate_metrics(self) -> bool: return True @@ -231,15 +219,8 @@ def set_audio_input_paused(self, paused: bool): def set_video_input_paused(self, paused: bool): self._video_input_paused = paused - def set_model_only_audio(self): - self.config.setup.generation_config["response_modalities"] = ["AUDIO"] - self.config.setup.generation_config["speech_config"] = { - "voice_config": {"prebuilt_voice_config": {"voice_name": self._voice_id}} - } - - def set_model_only_text(self): - self.config.setup.generation_config["response_modalities"] = ["TEXT"] - self.config.setup.generation_config["speech_config"] = None + def set_model_modalities(self, modalities: GeminiMultimodalModalities): + self._settings["modalities"] = modalities async def set_context(self, context: OpenAILLMContext): """Set the context explicitly from outside the pipeline. @@ -401,6 +382,27 @@ async def _connect(self): # handle disconnections in the send/recv code paths. 
return + config = events.Config.model_validate( + { + "setup": { + "model": self._model_name, + "generation_config": { + "frequency_penalty": self._settings["frequency_penalty"], + "max_output_tokens": self._settings["max_tokens"], # Not supported yet + "presence_penalty": self._settings["presence_penalty"], + "temperature": self._settings["temperature"], + "top_k": self._settings["top_k"], + "top_p": self._settings["top_p"], + "response_modalities": self._settings["modalities"].value, + "speech_config": { + "voice_config": { + "prebuilt_voice_config": {"voice_name": self._voice_id} + }, + }, + }, + }, + } + ) uri = f"wss://{self.base_url}/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent?key={self.api_key}" logger.info(f"Connecting to {uri}") self._websocket = await websockets.connect(uri=uri) @@ -411,12 +413,12 @@ async def _connect(self): system_instruction += "\n" + self._context.extract_system_instructions() if system_instruction: logger.debug(f"Setting system instruction: {system_instruction}") - self.config.setup.system_instruction = events.SystemInstruction( + config.setup.system_instruction = events.SystemInstruction( parts=[events.ContentPart(text=system_instruction)] ) if self._tools: - self.config.setup.tools = self._tools - await self.send_client_event(self.config) + config.setup.tools = self._tools + await self.send_client_event(config) except Exception as e: logger.error(f"{self} initialization error: {e}") From 94a6f1086ecd347347bcf16a0f2d5e8ae0bae62d Mon Sep 17 00:00:00 2001 From: Mert Sefa AKGUN Date: Wed, 8 Jan 2025 18:43:54 +0300 Subject: [PATCH 6/9] feat(gemini): change default modality to AUDIO Modify the default modality in the `InputParams` class from TEXT to AUDIO to better align with the intended use case for GeminiMultimodalLive service. --- src/pipecat/services/gemini_multimodal_live/gemini.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pipecat/services/gemini_multimodal_live/gemini.py b/src/pipecat/services/gemini_multimodal_live/gemini.py index 761d1224d..1387ffac8 100644 --- a/src/pipecat/services/gemini_multimodal_live/gemini.py +++ b/src/pipecat/services/gemini_multimodal_live/gemini.py @@ -146,7 +146,7 @@ class InputParams(BaseModel): top_k: Optional[int] = Field(default=None, ge=0) top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0) modalities: Optional[GeminiMultimodalModalities] = Field( - default=GeminiMultimodalModalities.TEXT + default=GeminiMultimodalModalities.AUDIO ) extra: Optional[Dict[str, Any]] = Field(default_factory=dict) From a729834482bc7cc34a58094bc882ab0c93ae1272 Mon Sep 17 00:00:00 2001 From: Mert Sefa AKGUN Date: Wed, 8 Jan 2025 18:53:21 +0300 Subject: [PATCH 7/9] refactor(gemini): reposition WebSocket connection code Move WebSocket connection setup earlier in the function for better organization and to prepare for subsequent configuration steps. --- src/pipecat/services/gemini_multimodal_live/gemini.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/pipecat/services/gemini_multimodal_live/gemini.py b/src/pipecat/services/gemini_multimodal_live/gemini.py index 1387ffac8..a94399a1d 100644 --- a/src/pipecat/services/gemini_multimodal_live/gemini.py +++ b/src/pipecat/services/gemini_multimodal_live/gemini.py @@ -382,6 +382,10 @@ async def _connect(self): # handle disconnections in the send/recv code paths. 
return + uri = f"wss://{self.base_url}/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent?key={self.api_key}" + logger.info(f"Connecting to {uri}") + self._websocket = await websockets.connect(uri=uri) + self._receive_task = self.get_event_loop().create_task(self._receive_task_handler()) config = events.Config.model_validate( { "setup": { @@ -403,10 +407,6 @@ async def _connect(self): }, } ) - uri = f"wss://{self.base_url}/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent?key={self.api_key}" - logger.info(f"Connecting to {uri}") - self._websocket = await websockets.connect(uri=uri) - self._receive_task = self.get_event_loop().create_task(self._receive_task_handler()) system_instruction = self._system_instruction or "" if self._context and hasattr(self._context, "extract_system_instructions"): From a342fe732eb5ce5f2d0f19cbd3731e0cfab3473d Mon Sep 17 00:00:00 2001 From: Mert Sefa AKGUN Date: Wed, 8 Jan 2025 19:34:04 +0300 Subject: [PATCH 8/9] docs: update CHANGELOG with Gemini modalities and examples --- CHANGELOG.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6adf6a994..eaccd2d26 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,13 +16,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added `enable_prejoin_ui`, `max_participants` and `start_video_off` params to `DailyRoomProperties`. - Added `session_timeout` to `FastAPIWebsocketTransport` and `WebsocketServerTransport` - for configuring session timeouts (in seconds). Triggers `on_session_timeout` for custom timeout handling. + for configuring session timeouts (in seconds). Triggers `on_session_timeout` for custom timeout handling. See [examples/websocket-server/bot.py](https://github.com/pipecat-ai/pipecat/blob/main/examples/websocket-server/bot.py). +- Added the new modalities option and helper function to set Gemini output modalities. +- Added `examples/foundational/26d-gemini-multimodal-live-text.py` which is using Gemini as TEXT modality and using another TTS provider for TTS process. ### Changed - api_key, aws_access_key_id and region are no longer required parameters for the PollyTTSService (AWSTTSService) - Added `session_timeout` example in `examples/websocket-server/bot.py` to handle session timeout event. +- Changed `InputParams` in `src/pipecat/services/gemini_multimodal_live/gemini.py` to support different modalities. ### Fixed From 40e9ee6d63d3ece391d2abb8996359a75eddfcaf Mon Sep 17 00:00:00 2001 From: Mert Sefa AKGUN Date: Wed, 8 Jan 2025 21:14:29 +0300 Subject: [PATCH 9/9] fix(examples): correct import order in Gemini example - Move `CartesiaMultiLingualTTSService` import to maintain proper order. - Reorganize `enum` import to adhere to styling standards. 
--- examples/foundational/26d-gemini-multimodal-live-text.py | 2 +- src/pipecat/services/gemini_multimodal_live/gemini.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/foundational/26d-gemini-multimodal-live-text.py b/examples/foundational/26d-gemini-multimodal-live-text.py index 5e43e19bb..760af39ce 100644 --- a/examples/foundational/26d-gemini-multimodal-live-text.py +++ b/examples/foundational/26d-gemini-multimodal-live-text.py @@ -9,6 +9,7 @@ import sys import aiohttp +from agent.services.tts.cartesia_multilingual import CartesiaMultiLingualTTSService from dotenv import load_dotenv from loguru import logger from runner import configure @@ -19,7 +20,6 @@ from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask from pipecat.services.gemini_multimodal_live.gemini import GeminiMultimodalLiveLLMService -from agent.services.tts.cartesia_multilingual import CartesiaMultiLingualTTSService from pipecat.transports.services.daily import DailyParams, DailyTransport load_dotenv(override=True) diff --git a/src/pipecat/services/gemini_multimodal_live/gemini.py b/src/pipecat/services/gemini_multimodal_live/gemini.py index a94399a1d..1d76f191c 100644 --- a/src/pipecat/services/gemini_multimodal_live/gemini.py +++ b/src/pipecat/services/gemini_multimodal_live/gemini.py @@ -6,9 +6,9 @@ import asyncio import base64 -from enum import Enum import json from dataclasses import dataclass +from enum import Enum from typing import Any, Dict, List, Optional import websockets
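Taken together, once patch 5 replaces the set_model_only_* helpers, the text-modality setup used by the 26d example would presumably be written against the final API. A sketch of that configuration, with pipecat's own CartesiaTTSService standing in for the external agent.services import used in the example (the voice_id shown is a placeholder):

    import os

    from pipecat.services.cartesia import CartesiaTTSService
    from pipecat.services.gemini_multimodal_live.gemini import (
        GeminiMultimodalLiveLLMService,
        GeminiMultimodalModalities,
        InputParams,
    )

    # Gemini emits text only; a separate TTS service turns the TextFrames into speech.
    llm = GeminiMultimodalLiveLLMService(
        api_key=os.getenv("GOOGLE_API_KEY"),
        params=InputParams(modalities=GeminiMultimodalModalities.TEXT),
    )

    tts = CartesiaTTSService(
        api_key=os.getenv("CARTESIA_API_KEY"),
        voice_id="REPLACE_WITH_A_CARTESIA_VOICE_ID",  # placeholder value
    )

    # The pipeline ordering from the example stays the same:
    # [transport.input(), llm, tts, transport.output()]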