Skip to content

Commit

Permalink
PlayHTHttpTTSService fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
markbackman committed Jan 10, 2025
1 parent 5cd9dab commit 86516d2
Show file tree
Hide file tree
Showing 3 changed files with 126 additions and 4 deletions.
11 changes: 11 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Added

- Added a new foundational example `07e-interruptible-playht-http.py` for easy
testing of `PlayHTHttpTTSService`.

- Added support for Google TTS Journey voices in `GoogleTTSService`.

- Added `29-livekit-audio-chat.py`, as a new foundational example for
Expand All @@ -27,12 +30,20 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Changed

- Changed the default model for `PlayHTHttpTTSService` to `Play3.0-mini-http`.

- `api_key`, `aws_access_key_id` and `region` are no longer required parameters for the `PollyTTSService` (`AWSTTSService`)

- Added `session_timeout` example in `examples/websocket-server/bot.py` to handle session timeout event.

- Changed `InputParams` in `src/pipecat/services/gemini_multimodal_live/gemini.py` to support different modalities.

### Fixed

- Fixed an import issue for `PlayHTHttpTTSService`.

- Fixed an issue where languages couldn't be used with the `PlayHTHttpTTSService`.

- Fixed an issue where `OpenAIRealtimeBetaLLMService` audio chunks were hitting
an error when truncating audio content.

Expand Down
101 changes: 101 additions & 0 deletions examples/foundational/07e-interruptible-playht-http.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#

import asyncio
import os
import sys

import aiohttp
from dotenv import load_dotenv
from loguru import logger
from runner import configure

from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.services.openai import OpenAILLMService
from pipecat.services.playht import PlayHTHttpTTSService
from pipecat.transcriptions.language import Language
from pipecat.transports.services.daily import DailyParams, DailyTransport

# Load environment variables from a local .env file; override any values
# already present in the process environment so the .env wins.
load_dotenv(override=True)

# Replace loguru's default handler (id 0, installed at import time) with a
# stderr handler at DEBUG level so pipeline frames are visible while testing.
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")


async def main():
    """Run an interruptible voice bot in a Daily room using PlayHT HTTP TTS.

    Connects to a Daily room (URL/token come from ``configure``), wires a
    transcription -> LLM -> TTS pipeline, and has the bot introduce itself
    when the first participant joins.
    """
    async with aiohttp.ClientSession() as session:
        room_url, token = await configure(session)

        # Daily WebRTC transport: audio out, server-side transcription, and
        # Silero VAD so user speech can interrupt the bot.
        transport = DailyTransport(
            room_url,
            token,
            "Respond bot",
            DailyParams(
                audio_out_enabled=True,
                transcription_enabled=True,
                vad_enabled=True,
                vad_analyzer=SileroVADAnalyzer(),
            ),
        )

        # Text-to-speech over PlayHT's HTTP endpoint.
        tts = PlayHTHttpTTSService(
            user_id=os.getenv("PLAYHT_USER_ID"),
            api_key=os.getenv("PLAYHT_API_KEY"),
            voice_url="s3://voice-cloning-zero-shot/d9ff78ba-d016-47f6-b0ef-dd630f59414e/female-cs/manifest.json",
        )

        llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")

        # Seed conversation; the join handler below appends to this list.
        messages = [
            {
                "role": "system",
                "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
            },
        ]

        context = OpenAILLMContext(messages)
        context_aggregator = llm.create_context_aggregator(context)

        # Processor chain, in frame-flow order.
        processors = [
            transport.input(),  # audio/transcription from the room
            context_aggregator.user(),  # fold user turns into the context
            llm,  # generate the bot's reply
            tts,  # synthesize the reply as audio
            transport.output(),  # play audio back into the room
            context_aggregator.assistant(),  # record what the bot said
        ]
        pipeline = Pipeline(processors)

        task = PipelineTask(
            pipeline,
            PipelineParams(
                allow_interruptions=True,
                enable_metrics=True,
                enable_usage_metrics=True,
                report_only_initial_ttfb=True,
            ),
        )

        @transport.event_handler("on_first_participant_joined")
        async def on_first_participant_joined(transport, participant):
            await transport.capture_participant_transcription(participant["id"])
            # Prompt the bot to speak first once someone is in the room.
            messages.append({"role": "system", "content": "Please introduce yourself to the user."})
            await task.queue_frames([LLMMessagesFrame(messages)])

        runner = PipelineRunner()
        await runner.run(task)


if __name__ == "__main__":
asyncio.run(main())
18 changes: 14 additions & 4 deletions src/pipecat/services/playht.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,8 @@

try:
from pyht.async_client import AsyncClient
from pyht.client import TTSOptions
from pyht.protos.api_pb2 import Format
from pyht.client import Format, TTSOptions
from pyht.client import Language as PlayHTLanguage
except ModuleNotFoundError as e:
logger.error(f"Exception: {e}")
logger.error(
Expand Down Expand Up @@ -363,7 +363,7 @@ def __init__(
api_key: str,
user_id: str,
voice_url: str,
voice_engine: str = "Play3.0-mini",
voice_engine: str = "Play3.0-mini-http", # Options: Play3.0-mini-http, Play3.0-mini-ws
sample_rate: int = 24000,
params: InputParams = InputParams(),
**kwargs,
Expand All @@ -389,9 +389,19 @@ def __init__(
}
self.set_model_name(voice_engine)
self.set_voice(voice_url)

language_str = self._settings["language"]
playht_language = None
if language_str:
# Convert string to PlayHT Language enum
for lang in PlayHTLanguage:
if lang.value == language_str:
playht_language = lang
break

self._options = TTSOptions(
voice=self._voice_id,
language=self._settings["language"],
language=playht_language,
sample_rate=self._settings["sample_rate"],
format=self._settings["format"],
speed=self._settings["speed"],
Expand Down

0 comments on commit 86516d2

Please sign in to comment.