From 5cbd71978070baae278e8150e61c4f64838c053a Mon Sep 17 00:00:00 2001 From: Mert Sefa AKGUN Date: Mon, 6 Jan 2025 18:05:31 +0300 Subject: [PATCH 1/9] feat(gemini): add text handling to GeminiMultimodalLive - Introduce text attribute in Part class for handling string data. - Incorporate text processing in GeminiMultimodalLiveLLMService to push TextFrame if text is present. --- src/pipecat/services/gemini_multimodal_live/events.py | 1 + src/pipecat/services/gemini_multimodal_live/gemini.py | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/src/pipecat/services/gemini_multimodal_live/events.py b/src/pipecat/services/gemini_multimodal_live/events.py index 0d5bc802f..36541aa30 100644 --- a/src/pipecat/services/gemini_multimodal_live/events.py +++ b/src/pipecat/services/gemini_multimodal_live/events.py @@ -105,6 +105,7 @@ class InlineData(BaseModel): class Part(BaseModel): inlineData: Optional[InlineData] = None + text: Optional[str] = None class ModelTurn(BaseModel): diff --git a/src/pipecat/services/gemini_multimodal_live/gemini.py b/src/pipecat/services/gemini_multimodal_live/gemini.py index dd4375486..dea63b3c8 100644 --- a/src/pipecat/services/gemini_multimodal_live/gemini.py +++ b/src/pipecat/services/gemini_multimodal_live/gemini.py @@ -604,6 +604,11 @@ async def _handle_evt_model_turn(self, evt): part = evt.serverContent.modelTurn.parts[0] if not part: return + + text = part.text + if text: + await self.push_frame(TextFrame(text=text)) + inline_data = part.inlineData if not inline_data: return From c72c3025f6608a8d7785415bf86d79e32a7288d5 Mon Sep 17 00:00:00 2001 From: Mert Sefa AKGUN Date: Mon, 6 Jan 2025 18:07:59 +0300 Subject: [PATCH 2/9] feat(gemini): add configuration methods for response modalities - Introduce `set_model_only_audio` and `set_model_only_text` methods to toggle between audio-only and text-only response modes in `GeminiMultimodalLiveLLMService`. - Refactor configuration setup to a class attribute for improved reusability and maintenance. - Remove redundant configuration instantiation in the WebSocket connection setup process. 
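For illustration, a minimal usage sketch of the new toggles; it assumes the service is constructed as in the foundational examples and that both calls happen before the pipeline connects, since the stored config is only sent once during WebSocket setup:

    import os

    from pipecat.services.gemini_multimodal_live.gemini import GeminiMultimodalLiveLLMService

    # The Setup/Config payload is now built once in __init__ and kept on self.config.
    llm = GeminiMultimodalLiveLLMService(api_key=os.getenv("GOOGLE_API_KEY"))

    # Text-only responses: response_modalities -> ["TEXT"], speech_config cleared.
    llm.set_model_only_text()

    # Audio-only responses: response_modalities -> ["AUDIO"], speech_config restored
    # with the service's voice_id.
    llm.set_model_only_audio()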
--- .../services/gemini_multimodal_live/gemini.py | 59 +++++++++++-------- 1 file changed, 35 insertions(+), 24 deletions(-) diff --git a/src/pipecat/services/gemini_multimodal_live/gemini.py b/src/pipecat/services/gemini_multimodal_live/gemini.py index dea63b3c8..687c63c0b 100644 --- a/src/pipecat/services/gemini_multimodal_live/gemini.py +++ b/src/pipecat/services/gemini_multimodal_live/gemini.py @@ -199,6 +199,28 @@ def __init__( "extra": params.extra if isinstance(params.extra, dict) else {}, } + self.config = events.Config.model_validate( + { + "setup": { + "model": self._model_name, + "generation_config": { + "frequency_penalty": self._settings["frequency_penalty"], + "max_output_tokens": self._settings["max_tokens"], # Not supported yet + "presence_penalty": self._settings["presence_penalty"], + "temperature": self._settings["temperature"], + "top_k": self._settings["top_k"], + "top_p": self._settings["top_p"], + "response_modalities": ["AUDIO"], + "speech_config": { + "voice_config": { + "prebuilt_voice_config": {"voice_name": self._voice_id} + }, + }, + }, + }, + } + ) + def can_generate_metrics(self) -> bool: return True @@ -208,6 +230,16 @@ def set_audio_input_paused(self, paused: bool): def set_video_input_paused(self, paused: bool): self._video_input_paused = paused + def set_model_only_audio(self): + self.config.setup.generation_config["response_modalities"] = ["AUDIO"] + self.config.setup.generation_config["speech_config"] = { + "voice_config": {"prebuilt_voice_config": {"voice_name": self._voice_id}} + } + + def set_model_only_text(self): + self.config.setup.generation_config["response_modalities"] = ["TEXT"] + self.config.setup.generation_config["speech_config"] = None + async def set_context(self, context: OpenAILLMContext): """Set the context explicitly from outside the pipeline. 
@@ -372,39 +404,18 @@ async def _connect(self): logger.info(f"Connecting to {uri}") self._websocket = await websockets.connect(uri=uri) self._receive_task = self.get_event_loop().create_task(self._receive_task_handler()) - config = events.Config.model_validate( - { - "setup": { - "model": self._model_name, - "generation_config": { - "frequency_penalty": self._settings["frequency_penalty"], - "max_output_tokens": self._settings["max_tokens"], # Not supported yet - "presence_penalty": self._settings["presence_penalty"], - "temperature": self._settings["temperature"], - "top_k": self._settings["top_k"], - "top_p": self._settings["top_p"], - "response_modalities": ["AUDIO"], - "speech_config": { - "voice_config": { - "prebuilt_voice_config": {"voice_name": self._voice_id} - }, - }, - }, - }, - } - ) system_instruction = self._system_instruction or "" if self._context and hasattr(self._context, "extract_system_instructions"): system_instruction += "\n" + self._context.extract_system_instructions() if system_instruction: logger.debug(f"Setting system instruction: {system_instruction}") - config.setup.system_instruction = events.SystemInstruction( + self.config.setup.system_instruction = events.SystemInstruction( parts=[events.ContentPart(text=system_instruction)] ) if self._tools: - config.setup.tools = self._tools - await self.send_client_event(config) + self.config.setup.tools = self._tools + await self.send_client_event(self.config) except Exception as e: logger.error(f"{self} initialization error: {e}") From cdb909958c33c7e42567b305aabae4c592676da3 Mon Sep 17 00:00:00 2001 From: Mert Sefa AKGUN Date: Tue, 7 Jan 2025 13:44:24 +0300 Subject: [PATCH 3/9] feat(examples): add Gemini multimodal live text example Introduce a new example `26d-gemini-multimodal-live-text.py` to demonstrate the use of GeminiMultimodalLiveLLMService with text-only responses. This example sets up a pipeline for audio input via DailyTransport, processing with Gemini, and output via Cartesia TTS. 
--- .../26d-gemini-multimodal-live-text.py | 85 +++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 examples/foundational/26d-gemini-multimodal-live-text.py diff --git a/examples/foundational/26d-gemini-multimodal-live-text.py b/examples/foundational/26d-gemini-multimodal-live-text.py new file mode 100644 index 000000000..5e43e19bb --- /dev/null +++ b/examples/foundational/26d-gemini-multimodal-live-text.py @@ -0,0 +1,85 @@ +# +# Copyright (c) 2024, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +import asyncio +import os +import sys + +import aiohttp +from dotenv import load_dotenv +from loguru import logger +from runner import configure + +from pipecat.audio.vad.silero import SileroVADAnalyzer +from pipecat.audio.vad.vad_analyzer import VADParams +from pipecat.pipeline.pipeline import Pipeline +from pipecat.pipeline.runner import PipelineRunner +from pipecat.pipeline.task import PipelineParams, PipelineTask +from pipecat.services.gemini_multimodal_live.gemini import GeminiMultimodalLiveLLMService +from agent.services.tts.cartesia_multilingual import CartesiaMultiLingualTTSService +from pipecat.transports.services.daily import DailyParams, DailyTransport + +load_dotenv(override=True) + +logger.remove(0) +logger.add(sys.stderr, level="DEBUG") + + +async def main(): + async with aiohttp.ClientSession() as session: + (room_url, token) = await configure(session) + + transport = DailyTransport( + room_url, + token, + "Respond bot", + DailyParams( + audio_in_sample_rate=16000, + audio_out_sample_rate=24000, + audio_out_enabled=True, + vad_enabled=True, + vad_audio_passthrough=True, + # set stop_secs to something roughly similar to the internal setting + # of the Multimodal Live api, just to align events. This doesn't really + # matter because we can only use the Multimodal Live API's phrase + # endpointing, for now. + vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.5)), + ), + ) + + llm = GeminiMultimodalLiveLLMService( + api_key=os.getenv("GOOGLE_API_KEY"), + # system_instruction="Talk like a pirate." + ) + llm.set_model_only_text() # This forces model to produce text only responses + + tts = CartesiaMultiLingualTTSService(api_key=os.getenv("CARTESIA_API_KEY")) + + pipeline = Pipeline( + [ + transport.input(), + llm, + tts, + transport.output(), + ] + ) + + task = PipelineTask( + pipeline, + PipelineParams( + allow_interruptions=True, + enable_metrics=True, + enable_usage_metrics=True, + ), + ) + + runner = PipelineRunner() + + await runner.run(task) + + +if __name__ == "__main__": + asyncio.run(main()) From 12ae980abe2783124e4f91e791e634bb5f148cab Mon Sep 17 00:00:00 2001 From: Mert Sefa AKGUN Date: Wed, 8 Jan 2025 16:30:56 +0300 Subject: [PATCH 4/9] feat(gemini): handle full text response in GeminiMultimodalLive - Add a buffer to store bot text responses. - Push a `LLMFullResponseStartFrame` when text begins. - Clear the text buffer and send `LLMFullResponseEndFrame` after processing. 
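For context, a text turn now reaches downstream processors as LLMFullResponseStartFrame, one or more TextFrame chunks, then LLMFullResponseEndFrame on turn completion. A sketch of a consumer, assuming pipecat's standard FrameProcessor API; the aggregation logic is illustrative, not part of this patch:

    from pipecat.frames.frames import (
        LLMFullResponseEndFrame,
        LLMFullResponseStartFrame,
        TextFrame,
    )
    from pipecat.processors.frame_processor import FrameDirection, FrameProcessor


    class TextResponseCollector(FrameProcessor):
        """Accumulates streamed text between the start/end frames."""

        def __init__(self):
            super().__init__()
            self._buffer = ""

        async def process_frame(self, frame, direction: FrameDirection):
            await super().process_frame(frame, direction)
            if isinstance(frame, LLMFullResponseStartFrame):
                self._buffer = ""
            elif isinstance(frame, TextFrame):
                self._buffer += frame.text
            elif isinstance(frame, LLMFullResponseEndFrame):
                print(f"Full text response: {self._buffer}")
            # Always forward the frame so the rest of the pipeline still sees it.
            await self.push_frame(frame, direction)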
--- src/pipecat/services/gemini_multimodal_live/gemini.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/pipecat/services/gemini_multimodal_live/gemini.py b/src/pipecat/services/gemini_multimodal_live/gemini.py index 687c63c0b..ea28a2337 100644 --- a/src/pipecat/services/gemini_multimodal_live/gemini.py +++ b/src/pipecat/services/gemini_multimodal_live/gemini.py @@ -188,6 +188,7 @@ def __init__( self._bot_is_speaking = False self._user_audio_buffer = bytearray() self._bot_audio_buffer = bytearray() + self._bot_text_buffer = "" self._settings = { "frequency_penalty": params.frequency_penalty, @@ -618,6 +619,10 @@ async def _handle_evt_model_turn(self, evt): text = part.text if text: + if not self._bot_text_buffer: + await self.push_frame(LLMFullResponseStartFrame()) + + self._bot_text_buffer += text await self.push_frame(TextFrame(text=text)) inline_data = part.inlineData @@ -660,9 +665,15 @@ async def _handle_evt_tool_call(self, evt): async def _handle_evt_turn_complete(self, evt): self._bot_is_speaking = False audio = self._bot_audio_buffer + text = self._bot_text_buffer self._bot_audio_buffer = bytearray() + self._bot_text_buffer = "" + if audio and self._transcribe_model_audio and self._context: asyncio.create_task(self._handle_transcribe_model_audio(audio, self._context)) + elif text: + await self.push_frame(LLMFullResponseEndFrame()) + await self.push_frame(TTSStoppedFrame()) def create_context_aggregator( From b42d3a82578365bee582aa4117b5c15b0443c8cf Mon Sep 17 00:00:00 2001 From: Mert Sefa AKGUN Date: Wed, 8 Jan 2025 17:38:53 +0300 Subject: [PATCH 5/9] feat(gemini): add modality configuration for GeminiMultimodalLive - Introduce `GeminiMultimodalModalities` enum for modality options. - Add modality field to `InputParams`, defaulting to text. - Simplify modality setup with `set_model_modalities` method. - Refactor WebSocket configuration to support dynamic response modalities. 
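For illustration, the two ways the new option might be exercised, using only the names introduced in this patch (a sketch; both assume the modality is chosen before the WebSocket config is sent):

    import os

    from pipecat.services.gemini_multimodal_live.gemini import (
        GeminiMultimodalLiveLLMService,
        GeminiMultimodalModalities,
        InputParams,
    )

    # Option 1: select the modality at construction time via InputParams.
    llm = GeminiMultimodalLiveLLMService(
        api_key=os.getenv("GOOGLE_API_KEY"),
        params=InputParams(modalities=GeminiMultimodalModalities.TEXT),
    )

    # Option 2: change it afterwards; this only updates self._settings, so it must
    # happen before _connect() builds and sends the Config payload.
    llm.set_model_modalities(GeminiMultimodalModalities.AUDIO)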
--- .../services/gemini_multimodal_live/gemini.py | 70 ++++++++++--------- 1 file changed, 36 insertions(+), 34 deletions(-) diff --git a/src/pipecat/services/gemini_multimodal_live/gemini.py b/src/pipecat/services/gemini_multimodal_live/gemini.py index ea28a2337..761d1224d 100644 --- a/src/pipecat/services/gemini_multimodal_live/gemini.py +++ b/src/pipecat/services/gemini_multimodal_live/gemini.py @@ -6,6 +6,7 @@ import asyncio import base64 +from enum import Enum import json from dataclasses import dataclass from typing import Any, Dict, List, Optional @@ -132,6 +133,11 @@ def assistant(self) -> GeminiMultimodalLiveAssistantContextAggregator: return self._assistant +class GeminiMultimodalModalities(Enum): + TEXT = "TEXT" + AUDIO = "AUDIO" + + class InputParams(BaseModel): frequency_penalty: Optional[float] = Field(default=None, ge=0.0, le=2.0) max_tokens: Optional[int] = Field(default=4096, ge=1) @@ -139,6 +145,9 @@ class InputParams(BaseModel): temperature: Optional[float] = Field(default=None, ge=0.0, le=2.0) top_k: Optional[int] = Field(default=None, ge=0) top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0) + modalities: Optional[GeminiMultimodalModalities] = Field( + default=GeminiMultimodalModalities.TEXT + ) extra: Optional[Dict[str, Any]] = Field(default_factory=dict) @@ -197,31 +206,10 @@ def __init__( "temperature": params.temperature, "top_k": params.top_k, "top_p": params.top_p, + "modalities": params.modalities, "extra": params.extra if isinstance(params.extra, dict) else {}, } - self.config = events.Config.model_validate( - { - "setup": { - "model": self._model_name, - "generation_config": { - "frequency_penalty": self._settings["frequency_penalty"], - "max_output_tokens": self._settings["max_tokens"], # Not supported yet - "presence_penalty": self._settings["presence_penalty"], - "temperature": self._settings["temperature"], - "top_k": self._settings["top_k"], - "top_p": self._settings["top_p"], - "response_modalities": ["AUDIO"], - "speech_config": { - "voice_config": { - "prebuilt_voice_config": {"voice_name": self._voice_id} - }, - }, - }, - }, - } - ) - def can_generate_metrics(self) -> bool: return True @@ -231,15 +219,8 @@ def set_audio_input_paused(self, paused: bool): def set_video_input_paused(self, paused: bool): self._video_input_paused = paused - def set_model_only_audio(self): - self.config.setup.generation_config["response_modalities"] = ["AUDIO"] - self.config.setup.generation_config["speech_config"] = { - "voice_config": {"prebuilt_voice_config": {"voice_name": self._voice_id}} - } - - def set_model_only_text(self): - self.config.setup.generation_config["response_modalities"] = ["TEXT"] - self.config.setup.generation_config["speech_config"] = None + def set_model_modalities(self, modalities: GeminiMultimodalModalities): + self._settings["modalities"] = modalities async def set_context(self, context: OpenAILLMContext): """Set the context explicitly from outside the pipeline. @@ -401,6 +382,27 @@ async def _connect(self): # handle disconnections in the send/recv code paths. 
return + config = events.Config.model_validate( + { + "setup": { + "model": self._model_name, + "generation_config": { + "frequency_penalty": self._settings["frequency_penalty"], + "max_output_tokens": self._settings["max_tokens"], # Not supported yet + "presence_penalty": self._settings["presence_penalty"], + "temperature": self._settings["temperature"], + "top_k": self._settings["top_k"], + "top_p": self._settings["top_p"], + "response_modalities": self._settings["modalities"].value, + "speech_config": { + "voice_config": { + "prebuilt_voice_config": {"voice_name": self._voice_id} + }, + }, + }, + }, + } + ) uri = f"wss://{self.base_url}/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent?key={self.api_key}" logger.info(f"Connecting to {uri}") self._websocket = await websockets.connect(uri=uri) @@ -411,12 +413,12 @@ async def _connect(self): system_instruction += "\n" + self._context.extract_system_instructions() if system_instruction: logger.debug(f"Setting system instruction: {system_instruction}") - self.config.setup.system_instruction = events.SystemInstruction( + config.setup.system_instruction = events.SystemInstruction( parts=[events.ContentPart(text=system_instruction)] ) if self._tools: - self.config.setup.tools = self._tools - await self.send_client_event(self.config) + config.setup.tools = self._tools + await self.send_client_event(config) except Exception as e: logger.error(f"{self} initialization error: {e}") From 94a6f1086ecd347347bcf16a0f2d5e8ae0bae62d Mon Sep 17 00:00:00 2001 From: Mert Sefa AKGUN Date: Wed, 8 Jan 2025 18:43:54 +0300 Subject: [PATCH 6/9] feat(gemini): change default modality to AUDIO Modify the default modality in the `InputParams` class from TEXT to AUDIO to better align with the intended use case for GeminiMultimodalLive service. --- src/pipecat/services/gemini_multimodal_live/gemini.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pipecat/services/gemini_multimodal_live/gemini.py b/src/pipecat/services/gemini_multimodal_live/gemini.py index 761d1224d..1387ffac8 100644 --- a/src/pipecat/services/gemini_multimodal_live/gemini.py +++ b/src/pipecat/services/gemini_multimodal_live/gemini.py @@ -146,7 +146,7 @@ class InputParams(BaseModel): top_k: Optional[int] = Field(default=None, ge=0) top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0) modalities: Optional[GeminiMultimodalModalities] = Field( - default=GeminiMultimodalModalities.TEXT + default=GeminiMultimodalModalities.AUDIO ) extra: Optional[Dict[str, Any]] = Field(default_factory=dict) From a729834482bc7cc34a58094bc882ab0c93ae1272 Mon Sep 17 00:00:00 2001 From: Mert Sefa AKGUN Date: Wed, 8 Jan 2025 18:53:21 +0300 Subject: [PATCH 7/9] refactor(gemini): reposition WebSocket connection code Move WebSocket connection setup earlier in the function for better organization and to prepare for subsequent configuration steps. --- src/pipecat/services/gemini_multimodal_live/gemini.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/pipecat/services/gemini_multimodal_live/gemini.py b/src/pipecat/services/gemini_multimodal_live/gemini.py index 1387ffac8..a94399a1d 100644 --- a/src/pipecat/services/gemini_multimodal_live/gemini.py +++ b/src/pipecat/services/gemini_multimodal_live/gemini.py @@ -382,6 +382,10 @@ async def _connect(self): # handle disconnections in the send/recv code paths. 
return + uri = f"wss://{self.base_url}/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent?key={self.api_key}" + logger.info(f"Connecting to {uri}") + self._websocket = await websockets.connect(uri=uri) + self._receive_task = self.get_event_loop().create_task(self._receive_task_handler()) config = events.Config.model_validate( { "setup": { @@ -403,10 +407,6 @@ async def _connect(self): }, } ) - uri = f"wss://{self.base_url}/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent?key={self.api_key}" - logger.info(f"Connecting to {uri}") - self._websocket = await websockets.connect(uri=uri) - self._receive_task = self.get_event_loop().create_task(self._receive_task_handler()) system_instruction = self._system_instruction or "" if self._context and hasattr(self._context, "extract_system_instructions"): From a342fe732eb5ce5f2d0f19cbd3731e0cfab3473d Mon Sep 17 00:00:00 2001 From: Mert Sefa AKGUN Date: Wed, 8 Jan 2025 19:34:04 +0300 Subject: [PATCH 8/9] docs: update CHANGELOG with Gemini modalities and examples --- CHANGELOG.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6adf6a994..eaccd2d26 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,13 +16,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added `enable_prejoin_ui`, `max_participants` and `start_video_off` params to `DailyRoomProperties`. - Added `session_timeout` to `FastAPIWebsocketTransport` and `WebsocketServerTransport` - for configuring session timeouts (in seconds). Triggers `on_session_timeout` for custom timeout handling. + for configuring session timeouts (in seconds). Triggers `on_session_timeout` for custom timeout handling. See [examples/websocket-server/bot.py](https://github.com/pipecat-ai/pipecat/blob/main/examples/websocket-server/bot.py). +- Added the new modalities option and helper function to set Gemini output modalities. +- Added `examples/foundational/26d-gemini-multimodal-live-text.py` which is using Gemini as TEXT modality and using another TTS provider for TTS process. ### Changed - api_key, aws_access_key_id and region are no longer required parameters for the PollyTTSService (AWSTTSService) - Added `session_timeout` example in `examples/websocket-server/bot.py` to handle session timeout event. +- Changed `InputParams` in `src/pipecat/services/gemini_multimodal_live/gemini.py` to support different modalities. ### Fixed From 40e9ee6d63d3ece391d2abb8996359a75eddfcaf Mon Sep 17 00:00:00 2001 From: Mert Sefa AKGUN Date: Wed, 8 Jan 2025 21:14:29 +0300 Subject: [PATCH 9/9] fix(examples): correct import order in Gemini example - Move `CartesiaMultiLingualTTSService` import to maintain proper order. - Reorganize `enum` import to adhere to styling standards. 
--- examples/foundational/26d-gemini-multimodal-live-text.py | 2 +- src/pipecat/services/gemini_multimodal_live/gemini.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/foundational/26d-gemini-multimodal-live-text.py b/examples/foundational/26d-gemini-multimodal-live-text.py index 5e43e19bb..760af39ce 100644 --- a/examples/foundational/26d-gemini-multimodal-live-text.py +++ b/examples/foundational/26d-gemini-multimodal-live-text.py @@ -9,6 +9,7 @@ import sys import aiohttp +from agent.services.tts.cartesia_multilingual import CartesiaMultiLingualTTSService from dotenv import load_dotenv from loguru import logger from runner import configure @@ -19,7 +20,6 @@ from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask from pipecat.services.gemini_multimodal_live.gemini import GeminiMultimodalLiveLLMService -from agent.services.tts.cartesia_multilingual import CartesiaMultiLingualTTSService from pipecat.transports.services.daily import DailyParams, DailyTransport load_dotenv(override=True) diff --git a/src/pipecat/services/gemini_multimodal_live/gemini.py b/src/pipecat/services/gemini_multimodal_live/gemini.py index a94399a1d..1d76f191c 100644 --- a/src/pipecat/services/gemini_multimodal_live/gemini.py +++ b/src/pipecat/services/gemini_multimodal_live/gemini.py @@ -6,9 +6,9 @@ import asyncio import base64 -from enum import Enum import json from dataclasses import dataclass +from enum import Enum from typing import Any, Dict, List, Optional import websockets
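Taken together, once patch 5 replaces the set_model_only_* helpers, the text-modality setup used by the 26d example would presumably be written against the final API. A sketch of that configuration, with pipecat's own CartesiaTTSService standing in for the external agent.services import used in the example (the voice_id shown is a placeholder):

    import os

    from pipecat.services.cartesia import CartesiaTTSService
    from pipecat.services.gemini_multimodal_live.gemini import (
        GeminiMultimodalLiveLLMService,
        GeminiMultimodalModalities,
        InputParams,
    )

    # Gemini emits text only; a separate TTS service turns the TextFrames into speech.
    llm = GeminiMultimodalLiveLLMService(
        api_key=os.getenv("GOOGLE_API_KEY"),
        params=InputParams(modalities=GeminiMultimodalModalities.TEXT),
    )

    tts = CartesiaTTSService(
        api_key=os.getenv("CARTESIA_API_KEY"),
        voice_id="REPLACE_WITH_A_CARTESIA_VOICE_ID",  # placeholder value
    )

    # The pipeline ordering from the example stays the same:
    # [transport.input(), llm, tts, transport.output()]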