diff --git a/CHANGELOG.md b/CHANGELOG.md
index c85d9685a..77c66e937 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Fixed
 
+- Fixed an issue that could mix new LLM responses with previous ones when
+  handling interruptions.
+
 - Fixed a Daily transport blocking situation that occurred while reading audio
   frames after a participant left the room. Needs daily-python >= 0.10.1.
 
diff --git a/src/pipecat/services/ai_services.py b/src/pipecat/services/ai_services.py
index 148fc62f3..95ba2e86a 100644
--- a/src/pipecat/services/ai_services.py
+++ b/src/pipecat/services/ai_services.py
@@ -16,7 +16,9 @@
     EndFrame,
     ErrorFrame,
     Frame,
+    LLMFullResponseStartFrame,
     StartFrame,
+    StartInterruptionFrame,
     TTSStartedFrame,
     TTSStoppedFrame,
     TextFrame,
@@ -114,13 +116,17 @@ async def _process_text_frame(self, frame: TextFrame):
             if self._current_sentence.strip().endswith(
                     (".", "?", "!")) and not self._current_sentence.strip().endswith(
                     ("Mr,", "Mrs.", "Ms.", "Dr.")):
-                text = self._current_sentence.strip()
+                text = self._current_sentence
                 self._current_sentence = ""
 
         if text:
             await self._push_tts_frames(text)
 
     async def _push_tts_frames(self, text: str):
+        text = text.strip()
+        if not text:
+            return
+
         await self.push_frame(TTSStartedFrame())
         await self.process_generator(self.run_tts(text))
         await self.push_frame(TTSStoppedFrame())
@@ -133,14 +139,14 @@ async def process_frame(self, frame: Frame, direction: FrameDirection):
 
         if isinstance(frame, TextFrame):
             await self._process_text_frame(frame)
-        elif isinstance(frame, EndFrame):
-            if self._current_sentence:
-                await self._push_tts_frames(self._current_sentence)
-            await self.push_frame(frame)
-        elif isinstance(frame, LLMFullResponseEndFrame):
-            if self._current_sentence:
-                await self._push_tts_frames(self._current_sentence.strip())
-                self._current_sentence = ""
+        elif isinstance(frame, StartInterruptionFrame):
+            self._current_sentence = ""
+            await self.push_frame(frame, direction)
+        elif isinstance(frame, LLMFullResponseEndFrame) or isinstance(frame, EndFrame):
+            # Capture the pending text before clearing so the tail is still spoken.
+            text = self._current_sentence
+            self._current_sentence = ""
+            await self._push_tts_frames(text)
             await self.push_frame(frame)
         else:
             await self.push_frame(frame, direction)