PlayHTHttpTTSService fixes

pipecat-ai · Jan 4, 2025 · d6d50dc · d6d50dc
1 parent 386ba61
commit d6d50dc
Show file tree

Hide file tree

Showing 3 changed files with 119 additions and 14 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,22 @@ All notable changes to **Pipecat** will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [Unreleased] - TBD
+
+### Added
+
+- Added a new foundational example `07e-interruptible-playht-http.py` for easy
+  testing of `PlayHTHttpTTSService`.
+
+### Changed
+
+- Changed the default model for `PlayHTHttpTTSService` to `Play3.0-mini-http`.
+
+### Fixed
+
+- Fixed an import issue for `PlayHTHttpTTSService`. Also removed `language` and
+  `seed`, which are not `TTSOptions` supported by the PlayHT Python client.
+
 ## [0.0.52] - 2024-12-24
 
 ### Added

diff --git a/examples/foundational/07e-interruptible-playht-http.py b/examples/foundational/07e-interruptible-playht-http.py
@@ -0,0 +1,101 @@
+#
+# Copyright (c) 2024, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+import asyncio
+import os
+import sys
+
+import aiohttp
+from dotenv import load_dotenv
+from loguru import logger
+from runner import configure
+
+from pipecat.audio.vad.silero import SileroVADAnalyzer
+from pipecat.frames.frames import LLMMessagesFrame
+from pipecat.pipeline.pipeline import Pipeline
+from pipecat.pipeline.runner import PipelineRunner
+from pipecat.pipeline.task import PipelineParams, PipelineTask
+from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
+from pipecat.services.openai import OpenAILLMService
+from pipecat.services.playht import PlayHTHttpTTSService
+from pipecat.transcriptions.language import Language
+from pipecat.transports.services.daily import DailyParams, DailyTransport
+
+load_dotenv(override=True)
+
+logger.remove(0)
+logger.add(sys.stderr, level="DEBUG")
+
+
+async def main():
+    async with aiohttp.ClientSession() as session:
+        (room_url, token) = await configure(session)
+
+        transport = DailyTransport(
+            room_url,
+            token,
+            "Respond bot",
+            DailyParams(
+                audio_out_enabled=True,
+                transcription_enabled=True,
+                vad_enabled=True,
+                vad_analyzer=SileroVADAnalyzer(),
+            ),
+        )
+
+        tts = PlayHTHttpTTSService(
+            user_id=os.getenv("PLAYHT_USER_ID"),
+            api_key=os.getenv("PLAYHT_API_KEY"),
+            voice_url="s3://voice-cloning-zero-shot/d9ff78ba-d016-47f6-b0ef-dd630f59414e/female-cs/manifest.json",
+        )
+
+        llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")
+
+        messages = [
+            {
+                "role": "system",
+                "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
+            },
+        ]
+
+        context = OpenAILLMContext(messages)
+        context_aggregator = llm.create_context_aggregator(context)
+
+        pipeline = Pipeline(
+            [
+                transport.input(),  # Transport user input
+                context_aggregator.user(),  # User responses
+                llm,  # LLM
+                tts,  # TTS
+                transport.output(),  # Transport bot output
+                context_aggregator.assistant(),  # Assistant spoken responses
+            ]
+        )
+
+        task = PipelineTask(
+            pipeline,
+            PipelineParams(
+                allow_interruptions=True,
+                enable_metrics=True,
+                enable_usage_metrics=True,
+                report_only_initial_ttfb=True,
+            ),
+        )
+
+        @transport.event_handler("on_first_participant_joined")
+        async def on_first_participant_joined(transport, participant):
+            await transport.capture_participant_transcription(participant["id"])
+            # Kick off the conversation.
+            messages.append({"role": "system", "content": "Please introduce yourself to the user."})
+            await task.queue_frames([LLMMessagesFrame(messages)])
+
+        runner = PipelineRunner()
+
+        await runner.run(task)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/src/pipecat/services/playht.py b/src/pipecat/services/playht.py
@@ -37,8 +37,7 @@
 
 try:
     from pyht.async_client import AsyncClient
-    from pyht.client import TTSOptions
-    from pyht.protos.api_pb2 import Format
+    from pyht.client import Format, TTSOptions
 except ModuleNotFoundError as e:
     logger.error(f"Exception: {e}")
     logger.error(
@@ -353,17 +352,15 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
 
 class PlayHTHttpTTSService(TTSService):
     class InputParams(BaseModel):
-        language: Optional[Language] = Language.EN
         speed: Optional[float] = 1.0
-        seed: Optional[int] = None
 
     def __init__(
         self,
         *,
         api_key: str,
         user_id: str,
         voice_url: str,
-        voice_engine: str = "Play3.0-mini",
+        voice_engine: str = "Play3.0-mini-http",  # Options: Play3.0-mini-ws, Play3.0-mini-http, Play3.0-mini-grpc
         sample_rate: int = 24000,
         params: InputParams = InputParams(),
         **kwargs,
@@ -379,31 +376,22 @@ def __init__(
         )
         self._settings = {
             "sample_rate": sample_rate,
-            "language": self.language_to_service_language(params.language)
-            if params.language
-            else "english",
             "format": Format.FORMAT_WAV,
             "voice_engine": voice_engine,
             "speed": params.speed,
-            "seed": params.seed,
         }
         self.set_model_name(voice_engine)
         self.set_voice(voice_url)
         self._options = TTSOptions(
             voice=self._voice_id,
-            language=self._settings["language"],
             sample_rate=self._settings["sample_rate"],
             format=self._settings["format"],
             speed=self._settings["speed"],
-            seed=self._settings["seed"],
         )
 
     def can_generate_metrics(self) -> bool:
         return True
 
-    def language_to_service_language(self, language: Language) -> str | None:
-        return language_to_playht_language(language)
-
     async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
         logger.debug(f"Generating TTS: [{text}]")