Merge pull request #256 from pipecat-ai/aleix/tts-cleanup-when-interrupted

services(tts): strip before TTS and cleanup when interrupted
pipecat-ai · Jun 25, 2024 · 83d1931 · 83d1931
2 parents 0ddc572 + c31f2ab
commit 83d1931
Show file tree

Hide file tree

Showing 2 changed files with 16 additions and 9 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,6 +9,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Fixed
 
+- Fixed an issue that could mix new LLM responses with previous ones when
+  handling interruptions.
+
 - Fixed a Daily transport blocking situation that occurred while reading audio
   frames after a participant left the room. Needs daily-python >= 0.10.1.
 

diff --git a/src/pipecat/services/ai_services.py b/src/pipecat/services/ai_services.py
@@ -16,7 +16,9 @@
     EndFrame,
     ErrorFrame,
     Frame,
+    LLMFullResponseStartFrame,
     StartFrame,
+    StartInterruptionFrame,
     TTSStartedFrame,
     TTSStoppedFrame,
     TextFrame,
@@ -114,13 +116,17 @@ async def _process_text_frame(self, frame: TextFrame):
             if self._current_sentence.strip().endswith(
                     (".", "?", "!")) and not self._current_sentence.strip().endswith(
                     ("Mr,", "Mrs.", "Ms.", "Dr.")):
-                text = self._current_sentence.strip()
+                text = self._current_sentence
                 self._current_sentence = ""
 
         if text:
             await self._push_tts_frames(text)
 
     async def _push_tts_frames(self, text: str):
+        text = text.strip()
+        if not text:
+            return
+
         await self.push_frame(TTSStartedFrame())
         await self.process_generator(self.run_tts(text))
         await self.push_frame(TTSStoppedFrame())
@@ -133,14 +139,12 @@ async def process_frame(self, frame: Frame, direction: FrameDirection):
 
         if isinstance(frame, TextFrame):
             await self._process_text_frame(frame)
-        elif isinstance(frame, EndFrame):
-            if self._current_sentence:
-                await self._push_tts_frames(self._current_sentence)
-            await self.push_frame(frame)
-        elif isinstance(frame, LLMFullResponseEndFrame):
-            if self._current_sentence:
-                await self._push_tts_frames(self._current_sentence.strip())
-                self._current_sentence = ""
+        elif isinstance(frame, StartInterruptionFrame):
+            self._current_sentence = ""
+            await self.push_frame(frame, direction)
+        elif isinstance(frame, LLMFullResponseEndFrame) or isinstance(frame, EndFrame):
+            self._current_sentence = ""
+            await self._push_tts_frames(self._current_sentence)
             await self.push_frame(frame)
         else:
             await self.push_frame(frame, direction)