Commit d654208

introduce input/output audio and image frames

aconchillo committed Sep 20, 2024
1 parent ed409d0 commit d654208
Showing 29 changed files with 226 additions and 139 deletions.
12 changes: 9 additions & 3 deletions examples/foundational/05a-local-sync-speech-and-image.py
@@ -11,7 +11,13 @@

import tkinter as tk

-from pipecat.frames.frames import AudioRawFrame, Frame, URLImageRawFrame, LLMMessagesFrame, TextFrame
+from pipecat.frames.frames import (
+    Frame,
+    OutputAudioRawFrame,
+    TTSAudioRawFrame,
+    URLImageRawFrame,
+    LLMMessagesFrame,
+    TextFrame)
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.sync_parallel_pipeline import SyncParallelPipeline
@@ -65,9 +71,9 @@ def __init__(self):
    async def process_frame(self, frame: Frame, direction: FrameDirection):
        await super().process_frame(frame, direction)

-        if isinstance(frame, AudioRawFrame):
+        if isinstance(frame, TTSAudioRawFrame):
            self.audio.extend(frame.audio)
-            self.frame = AudioRawFrame(
+            self.frame = OutputAudioRawFrame(
                bytes(self.audio), frame.sample_rate, frame.num_channels)

class ImageGrabber(FrameProcessor):
13 changes: 10 additions & 3 deletions examples/foundational/06a-image-sync.py
@@ -11,7 +11,7 @@

from PIL import Image

-from pipecat.frames.frames import ImageRawFrame, Frame, SystemFrame, TextFrame
+from pipecat.frames.frames import Frame, OutputImageRawFrame, SystemFrame, TextFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
@@ -52,9 +52,16 @@ async def process_frame(self, frame: Frame, direction: FrameDirection):
        await super().process_frame(frame, direction)

        if not isinstance(frame, SystemFrame) and direction == FrameDirection.DOWNSTREAM:
-            await self.push_frame(ImageRawFrame(image=self._speaking_image_bytes, size=(1024, 1024), format=self._speaking_image_format))
+            await self.push_frame(OutputImageRawFrame(
+                image=self._speaking_image_bytes,
+                size=(1024, 1024),
+                format=self._speaking_image_format)
+            )
            await self.push_frame(frame)
-            await self.push_frame(ImageRawFrame(image=self._waiting_image_bytes, size=(1024, 1024), format=self._waiting_image_format))
+            await self.push_frame(OutputImageRawFrame(
+                image=self._waiting_image_bytes,
+                size=(1024, 1024),
+                format=self._waiting_image_format))
        else:
            await self.push_frame(frame)

6 changes: 3 additions & 3 deletions examples/foundational/11-sound-effects.py
@@ -12,9 +12,9 @@

from pipecat.frames.frames import (
    Frame,
-    AudioRawFrame,
    LLMFullResponseEndFrame,
    LLMMessagesFrame,
+    OutputAudioRawFrame,
)
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
@@ -53,8 +53,8 @@
    filename = os.path.splitext(os.path.basename(full_path))[0]
    # Open the image and convert it to bytes
    with wave.open(full_path) as audio_file:
-        sounds[file] = AudioRawFrame(audio_file.readframes(-1),
-                                     audio_file.getframerate(), audio_file.getnchannels())
+        sounds[file] = OutputAudioRawFrame(audio_file.readframes(-1),
+                                           audio_file.getframerate(), audio_file.getnchannels())


class OutboundSoundEffectWrapper(FrameProcessor):
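For context, the sounds dict above now holds OutputAudioRawFrame objects that an output transport can play directly. Below is a rough, hypothetical sketch of how a wrapper such as the OutboundSoundEffectWrapper referenced above might push one of these preloaded frames after the LLM finishes a response; the class body, the silent stand-in buffer, and the LLMFullResponseEndFrame trigger are illustrative assumptions, not code from this commit.

from pipecat.frames.frames import Frame, LLMFullResponseEndFrame, OutputAudioRawFrame
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor

# A short silent buffer standing in for a real WAV asset (0.5 s of 16-bit mono at 16 kHz).
ding = OutputAudioRawFrame(audio=b"\x00" * 16000, sample_rate=16000, num_channels=1)


class SoundEffectOnResponseEnd(FrameProcessor):
    """Hypothetical processor: plays a preloaded sound after each LLM response."""

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        await super().process_frame(frame, direction)

        await self.push_frame(frame, direction)
        if isinstance(frame, LLMFullResponseEndFrame):
            await self.push_frame(ding)
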
11 changes: 8 additions & 3 deletions examples/moondream-chatbot/bot.py
@@ -13,10 +13,11 @@

from pipecat.frames.frames import (
    ImageRawFrame,
+    OutputImageRawFrame,
    SpriteFrame,
    Frame,
    LLMMessagesFrame,
-    AudioRawFrame,
+    TTSAudioRawFrame,
    TTSStoppedFrame,
    TextFrame,
    UserImageRawFrame,
@@ -59,7 +60,11 @@
    # Get the filename without the extension to use as the dictionary key
    # Open the image and convert it to bytes
    with Image.open(full_path) as img:
-        sprites.append(ImageRawFrame(image=img.tobytes(), size=img.size, format=img.format))
+        sprites.append(OutputImageRawFrame(
+            image=img.tobytes(),
+            size=img.size,
+            format=img.format)
+        )

flipped = sprites[::-1]
sprites.extend(flipped)
@@ -82,7 +87,7 @@ def __init__(self):
    async def process_frame(self, frame: Frame, direction: FrameDirection):
        await super().process_frame(frame, direction)

-        if isinstance(frame, AudioRawFrame):
+        if isinstance(frame, TTSAudioRawFrame):
            if not self._is_talking:
                await self.push_frame(talking_frame)
                self._is_talking = True
7 changes: 4 additions & 3 deletions examples/patient-intake/bot.py
@@ -10,7 +10,7 @@
import sys
import wave

-from pipecat.frames.frames import AudioRawFrame
+from pipecat.frames.frames import OutputAudioRawFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
@@ -49,8 +49,9 @@
    filename = os.path.splitext(os.path.basename(full_path))[0]
    # Open the sound and convert it to bytes
    with wave.open(full_path) as audio_file:
-        sounds[file] = AudioRawFrame(audio_file.readframes(-1),
-                                     audio_file.getframerate(), audio_file.getnchannels())
+        sounds[file] = OutputAudioRawFrame(audio_file.readframes(-1),
+                                           audio_file.getframerate(),
+                                           audio_file.getnchannels())


class IntakeProcessor:
12 changes: 8 additions & 4 deletions examples/simple-chatbot/bot.py
@@ -16,11 +16,11 @@
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.llm_response import LLMAssistantResponseAggregator, LLMUserResponseAggregator
from pipecat.frames.frames import (
-    AudioRawFrame,
-    ImageRawFrame,
+    OutputImageRawFrame,
    SpriteFrame,
    Frame,
    LLMMessagesFrame,
+    TTSAudioRawFrame,
    TTSStoppedFrame
)
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
@@ -49,7 +49,7 @@
    # Get the filename without the extension to use as the dictionary key
    # Open the image and convert it to bytes
    with Image.open(full_path) as img:
-        sprites.append(ImageRawFrame(image=img.tobytes(), size=img.size, format=img.format))
+        sprites.append(OutputImageRawFrame(
+            image=img.tobytes(),
+            size=img.size,
+            format=img.format)
+        )

flipped = sprites[::-1]
sprites.extend(flipped)
@@ -72,7 +76,7 @@ def __init__(self):
    async def process_frame(self, frame: Frame, direction: FrameDirection):
        await super().process_frame(frame, direction)

-        if isinstance(frame, AudioRawFrame):
+        if isinstance(frame, TTSAudioRawFrame):
            if not self._is_talking:
                await self.push_frame(talking_frame)
                self._is_talking = True
11 changes: 6 additions & 5 deletions examples/storytelling-chatbot/src/utils/helpers.py
@@ -2,7 +2,7 @@
import wave
from PIL import Image

-from pipecat.frames.frames import AudioRawFrame, ImageRawFrame
+from pipecat.frames.frames import OutputAudioRawFrame, OutputImageRawFrame

script_dir = os.path.dirname(__file__)

@@ -16,7 +16,8 @@ def load_images(image_files):
        filename = os.path.splitext(os.path.basename(full_path))[0]
        # Open the image and convert it to bytes
        with Image.open(full_path) as img:
-            images[filename] = ImageRawFrame(image=img.tobytes(), size=img.size, format=img.format)
+            images[filename] = OutputImageRawFrame(
+                image=img.tobytes(), size=img.size, format=img.format)
    return images


@@ -30,8 +31,8 @@ def load_sounds(sound_files):
        filename = os.path.splitext(os.path.basename(full_path))[0]
        # Open the sound and convert it to bytes
        with wave.open(full_path) as audio_file:
-            sounds[filename] = AudioRawFrame(audio=audio_file.readframes(-1),
-                                             sample_rate=audio_file.getframerate(),
-                                             num_channels=audio_file.getnchannels())
+            sounds[filename] = OutputAudioRawFrame(audio=audio_file.readframes(-1),
+                                                   sample_rate=audio_file.getframerate(),
+                                                   num_channels=audio_file.getnchannels())

    return sounds
66 changes: 49 additions & 17 deletions src/pipecat/frames/frames.py
@@ -41,10 +41,7 @@ class DataFrame(Frame):

@dataclass
class AudioRawFrame(DataFrame):
"""A chunk of audio. Will be played by the transport if the transport's
microphone has been enabled.
"""
"""A chunk of audio."""
audio: bytes
sample_rate: int
num_channels: int
@@ -58,6 +55,31 @@ def __str__(self):
        return f"{self.name}(pts: {pts}, size: {len(self.audio)}, frames: {self.num_frames}, sample_rate: {self.sample_rate}, channels: {self.num_channels})"


+@dataclass
+class InputAudioRawFrame(AudioRawFrame):
+    """A chunk of audio usually coming from an input transport.
+    """
+    pass
+
+
+@dataclass
+class OutputAudioRawFrame(AudioRawFrame):
+    """A chunk of audio. Will be played by the output transport if the
+    transport's microphone has been enabled.
+    """
+    pass
+
+
+@dataclass
+class TTSAudioRawFrame(OutputAudioRawFrame):
+    """A chunk of output audio generated by a TTS service.
+    """
+    pass
+
+
@dataclass
class ImageRawFrame(DataFrame):
    """An image. Will be shown by the transport if the transport's camera is
@@ -74,20 +96,30 @@ def __str__(self):


@dataclass
-class URLImageRawFrame(ImageRawFrame):
-    """An image with an associated URL. Will be shown by the transport if the
+class InputImageRawFrame(ImageRawFrame):
+    pass
+
+
+@dataclass
+class OutputImageRawFrame(ImageRawFrame):
+    pass
+
+
+@dataclass
+class UserImageRawFrame(InputImageRawFrame):
+    """An image associated to a user. Will be shown by the transport if the
    transport's camera is enabled.
    """
-    url: str | None
+    user_id: str

    def __str__(self):
        pts = format_pts(self.pts)
-        return f"{self.name}(pts: {pts}, url: {self.url}, size: {self.size}, format: {self.format})"
+        return f"{self.name}(pts: {pts}, user: {self.user_id}, size: {self.size}, format: {self.format})"


@dataclass
-class VisionImageRawFrame(ImageRawFrame):
+class VisionImageRawFrame(InputImageRawFrame):
    """An image with an associated text to ask for a description of it. Will be
    shown by the transport if the transport's camera is enabled.
@@ -100,16 +132,16 @@ def __str__(self):


@dataclass
-class UserImageRawFrame(ImageRawFrame):
-    """An image associated to a user. Will be shown by the transport if the
+class URLImageRawFrame(OutputImageRawFrame):
+    """An image with an associated URL. Will be shown by the transport if the
    transport's camera is enabled.
    """
-    user_id: str
+    url: str | None

    def __str__(self):
        pts = format_pts(self.pts)
-        return f"{self.name}(pts: {pts}, user: {self.user_id}, size: {self.size}, format: {self.format})"
+        return f"{self.name}(pts: {pts}, url: {self.url}, size: {self.size}, format: {self.format})"


@dataclass
@@ -420,10 +452,10 @@ class BotSpeakingFrame(ControlFrame):
@dataclass
class TTSStartedFrame(ControlFrame):
    """Used to indicate the beginning of a TTS response. Following
-    AudioRawFrames are part of the TTS response until an TTSStoppedFrame. These
-    frames can be used for aggregating audio frames in a transport to optimize
-    the size of frames sent to the session, without needing to control this in
-    the TTS service.
+    TTSAudioRawFrames are part of the TTS response until an
+    TTSStoppedFrame. These frames can be used for aggregating audio frames in a
+    transport to optimize the size of frames sent to the session, without
+    needing to control this in the TTS service.
    """
    pass
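The docstrings above describe the intent of the split: InputAudioRawFrame is audio coming from an input transport, OutputAudioRawFrame is audio an output transport should play, and TTSAudioRawFrame marks TTS-generated output audio, with the same split applied to image frames. Below is a minimal sketch of how a downstream processor might rely on the new hierarchy, mirroring the AudioGrabber change in 05a-local-sync-speech-and-image.py: it buffers TTSAudioRawFrame chunks between TTSStartedFrame and TTSStoppedFrame and re-emits them as one OutputAudioRawFrame. The class name and the decision to swallow the TTS frames are assumptions for illustration, not part of this commit.

from pipecat.frames.frames import (
    Frame,
    OutputAudioRawFrame,
    TTSAudioRawFrame,
    TTSStartedFrame,
    TTSStoppedFrame,
)
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor


class TTSAudioAccumulator(FrameProcessor):
    """Hypothetical processor: collects one TTS response into a single output frame."""

    def __init__(self):
        super().__init__()
        self._audio = bytearray()
        self._sample_rate = 0
        self._num_channels = 1

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        await super().process_frame(frame, direction)

        if isinstance(frame, TTSStartedFrame):
            self._audio.clear()
        elif isinstance(frame, TTSAudioRawFrame):
            # TTSAudioRawFrame subclasses OutputAudioRawFrame, so it still carries
            # the audio, sample_rate and num_channels fields used before the split.
            self._audio.extend(frame.audio)
            self._sample_rate = frame.sample_rate
            self._num_channels = frame.num_channels
        elif isinstance(frame, TTSStoppedFrame) and self._audio:
            await self.push_frame(OutputAudioRawFrame(
                audio=bytes(self._audio),
                sample_rate=self._sample_rate,
                num_channels=self._num_channels))
        else:
            await self.push_frame(frame, direction)
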
6 changes: 5 additions & 1 deletion src/pipecat/processors/aggregators/openai_llm_context.py
@@ -13,7 +13,11 @@

from PIL import Image

-from pipecat.frames.frames import Frame, VisionImageRawFrame, FunctionCallInProgressFrame, FunctionCallResultFrame
+from pipecat.frames.frames import (
+    Frame,
+    VisionImageRawFrame,
+    FunctionCallInProgressFrame,
+    FunctionCallResultFrame)
from pipecat.processors.frame_processor import FrameProcessor

from loguru import logger
12 changes: 9 additions & 3 deletions src/pipecat/processors/aggregators/vision_image_frame.py
@@ -4,13 +4,19 @@
# SPDX-License-Identifier: BSD 2-Clause License
#

-from pipecat.frames.frames import Frame, ImageRawFrame, TextFrame, VisionImageRawFrame
+from pipecat.frames.frames import (
+    Frame,
+    InputImageRawFrame,
+    TextFrame,
+    VisionImageRawFrame
+)
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor


class VisionImageFrameAggregator(FrameProcessor):
    """This aggregator waits for a consecutive TextFrame and an
-    ImageFrame. After the ImageFrame arrives it will output a VisionImageFrame.
+    InputImageRawFrame. After the InputImageRawFrame arrives it will output a
+    VisionImageRawFrame.

    >>> from pipecat.frames.frames import ImageFrame
@@ -34,7 +40,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection):

        if isinstance(frame, TextFrame):
            self._describe_text = frame.text
-        elif isinstance(frame, ImageRawFrame):
+        elif isinstance(frame, InputImageRawFrame):
            if self._describe_text:
                frame = VisionImageRawFrame(
                    text=self._describe_text,
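As the updated docstring says, the aggregator now pairs a TextFrame with the next InputImageRawFrame and emits a VisionImageRawFrame. The snippet below builds that output by hand just to show how the fields line up; the tiny dummy RGB buffer stands in for a real camera image from an input transport, so treat it as a sketch of the data flow rather than how the aggregator is normally driven.

from pipecat.frames.frames import InputImageRawFrame, TextFrame, VisionImageRawFrame

# The question that the aggregator would remember from the TextFrame...
question = TextFrame("What is in this image?")

# ...and the image that would normally arrive from an input transport.
image = InputImageRawFrame(image=b"\x00" * (2 * 2 * 3), size=(2, 2), format="RGB")

# What VisionImageFrameAggregator pushes once both frames have arrived.
combined = VisionImageRawFrame(
    text=question.text,
    image=image.image,
    size=image.size,
    format=image.format)
print(combined)
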
12 changes: 6 additions & 6 deletions src/pipecat/processors/gstreamer/pipeline_source.py
@@ -9,11 +9,11 @@
from pydantic import BaseModel

from pipecat.frames.frames import (
-    AudioRawFrame,
    CancelFrame,
    EndFrame,
    Frame,
-    ImageRawFrame,
+    OutputAudioRawFrame,
+    OutputImageRawFrame,
    StartFrame,
    SystemFrame)
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
@@ -182,17 +182,17 @@ def _decodebin_video(self, pad: Gst.Pad):
    def _appsink_audio_new_sample(self, appsink: GstApp.AppSink):
        buffer = appsink.pull_sample().get_buffer()
        (_, info) = buffer.map(Gst.MapFlags.READ)
-        frame = AudioRawFrame(audio=info.data,
-                              sample_rate=self._out_params.audio_sample_rate,
-                              num_channels=self._out_params.audio_channels)
+        frame = OutputAudioRawFrame(audio=info.data,
+                                    sample_rate=self._out_params.audio_sample_rate,
+                                    num_channels=self._out_params.audio_channels)
        asyncio.run_coroutine_threadsafe(self.push_frame(frame), self.get_event_loop())
        buffer.unmap(info)
        return Gst.FlowReturn.OK

    def _appsink_video_new_sample(self, appsink: GstApp.AppSink):
        buffer = appsink.pull_sample().get_buffer()
        (_, info) = buffer.map(Gst.MapFlags.READ)
-        frame = ImageRawFrame(
+        frame = OutputImageRawFrame(
            image=info.data,
            size=(self._out_params.video_width, self._out_params.video_height),
            format="RGB")
(Diff for the remaining changed files not shown.)
