Merge pull request #795 from pipecat-ai/vp-nvidia-riva
[WIP] add nvidia riva
vipyne authored Dec 10, 2024
2 parents 6ba2dea + a49b4e4 commit 1db6245
Showing 6 changed files with 421 additions and 6 deletions.
2 changes: 1 addition & 1 deletion dev-requirements.txt
@@ -1,5 +1,5 @@
 build~=1.2.1
-grpcio-tools~=1.62.2
+grpcio-tools~=1.65.4
 pip-tools~=7.4.1
 pyright~=1.1.376
 pytest~=8.3.2
56 changes: 56 additions & 0 deletions examples/foundational/01c-fastpitch.py
@@ -0,0 +1,56 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#

import asyncio
import aiohttp
import os
import sys

from pipecat.frames.frames import EndFrame, TTSSpeakFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.task import PipelineTask
from pipecat.pipeline.runner import PipelineRunner
from pipecat.services.riva import FastpitchTTSService
from pipecat.transports.services.daily import DailyParams, DailyTransport

from runner import configure

from loguru import logger

from dotenv import load_dotenv

load_dotenv(override=True)

logger.remove(0)
logger.add(sys.stderr, level="DEBUG")


async def main():
    async with aiohttp.ClientSession() as session:
        (room_url, _) = await configure(session)

        transport = DailyTransport(
            room_url, None, "Say One Thing", DailyParams(audio_out_enabled=True)
        )

        tts = FastpitchTTSService(api_key=os.getenv("NVIDIA_API_KEY"))

        runner = PipelineRunner()

        task = PipelineTask(Pipeline([tts, transport.output()]))

        # Register an event handler so we can play the audio when the
        # participant joins.
        @transport.event_handler("on_first_participant_joined")
        async def on_first_participant_joined(transport, participant):
            participant_name = participant.get("info", {}).get("userName", "")
            await task.queue_frames([TTSSpeakFrame(f"Aloha, {participant_name}!"), EndFrame()])

        await runner.run(task)


if __name__ == "__main__":
    asyncio.run(main())
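
A note on the example above: queue_frames accepts any list of frames, so a quick way to exercise the new Fastpitch service is to queue several utterances before the EndFrame. A minimal sketch (the second sentence is illustrative; everything else comes from the example):

    await task.queue_frames([
        TTSSpeakFrame(f"Aloha, {participant_name}!"),
        TTSSpeakFrame("This audio was synthesized by NVIDIA Fastpitch."),
        EndFrame(),
    ])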
92 changes: 92 additions & 0 deletions examples/foundational/07r-interruptible-riva-nim.py
@@ -0,0 +1,92 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#

import asyncio
import os
import sys

import aiohttp
from dotenv import load_dotenv
from loguru import logger
from runner import configure

from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.services.nim import NimLLMService
from pipecat.services.riva import FastpitchTTSService, ParakeetSTTService
from pipecat.transports.services.daily import DailyParams, DailyTransport

load_dotenv(override=True)

logger.remove(0)
logger.add(sys.stderr, level="DEBUG")


async def main():
    async with aiohttp.ClientSession() as session:
        (room_url, _) = await configure(session)

        transport = DailyTransport(
            room_url,
            None,
            "Respond bot",
            DailyParams(
                audio_out_enabled=True,
                vad_enabled=True,
                vad_analyzer=SileroVADAnalyzer(),
                vad_audio_passthrough=True,
            ),
        )

        stt = ParakeetSTTService(api_key=os.getenv("NVIDIA_API_KEY"))

        llm = NimLLMService(
            api_key=os.getenv("NVIDIA_API_KEY"), model="meta/llama-3.1-405b-instruct"
        )

        tts = FastpitchTTSService(api_key=os.getenv("NVIDIA_API_KEY"))

        messages = [
            {
                "role": "system",
                "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
            },
        ]

        context = OpenAILLMContext(messages)
        context_aggregator = llm.create_context_aggregator(context)

        pipeline = Pipeline(
            [
                transport.input(),  # Transport user input
                stt,  # STT
                context_aggregator.user(),  # User responses
                llm,  # LLM
                tts,  # TTS
                transport.output(),  # Transport bot output
                context_aggregator.assistant(),  # Assistant spoken responses
            ]
        )

        task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True))

        @transport.event_handler("on_first_participant_joined")
        async def on_first_participant_joined(transport, participant):
            # Kick off the conversation.
            messages.append({"role": "system", "content": "Please introduce yourself to the user."})
            await task.queue_frames([LLMMessagesFrame(messages)])

        runner = PipelineRunner()

        await runner.run(task)


if __name__ == "__main__":
    asyncio.run(main())
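
Since the nim extra is built on the openai client (see pyproject.toml below), NimLLMService should accept any model identifier served through NVIDIA's API; a hypothetical drop-in with a smaller model (the model id here is an assumption, not part of this diff):

    llm = NimLLMService(
        api_key=os.getenv("NVIDIA_API_KEY"),
        model="meta/llama-3.1-8b-instruct",  # illustrative model id, not from this PR
    )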
7 changes: 4 additions & 3 deletions pyproject.toml
@@ -25,7 +25,7 @@ dependencies = [
     "Markdown~=3.7",
     "numpy~=1.26.4",
     "Pillow~=10.4.0",
-    "protobuf~=4.25.4",
+    "protobuf~=5.26.1",
     "pydantic~=2.8.2",
     "pyloudnorm~=0.1.1",
     "resampy~=0.4.3",
@@ -48,7 +48,7 @@ elevenlabs = [ "websockets~=13.1" ]
 examples = [ "python-dotenv~=1.0.1", "flask~=3.0.3", "flask_cors~=4.0.1" ]
 fal = [ "fal-client~=0.4.1" ]
 gladia = [ "websockets~=13.1" ]
-google = [ "google-generativeai~=0.8.3", "google-cloud-texttospeech~=2.17.2" ]
+google = [ "google-generativeai~=0.8.3", "google-cloud-texttospeech~=2.21.1" ]
 grok = [ "openai~=1.50.2" ]
 groq = [ "openai~=1.50.2" ]
 gstreamer = [ "pygobject~=3.48.2" ]
@@ -63,7 +63,8 @@ nim = [ "openai~=1.50.2" ]
 noisereduce = [ "noisereduce~=3.0.3" ]
 openai = [ "openai~=1.50.2", "websockets~=13.1", "python-deepcompare~=1.0.1" ]
 openpipe = [ "openpipe~=4.38.0" ]
-playht = [ "pyht~=0.1.4", "websockets~=13.1" ]
+playht = [ "pyht~=0.1.8", "websockets~=13.1" ]
+riva = [ "nvidia-riva-client~=2.17.0" ]
 silero = [ "onnxruntime~=1.19.2" ]
 soundfile = [ "soundfile~=0.12.1" ]
 together = [ "openai~=1.50.2" ]
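
With the riva extra declared above, the new services should install alongside the package, e.g. pip install "pipecat-ai[riva]" (assuming the published package name pipecat-ai; the extra pulls in nvidia-riva-client as pinned above).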