diff --git a/assemblyai/api.py b/assemblyai/api.py index d3d87d2..1362204 100644 --- a/assemblyai/api.py +++ b/assemblyai/api.py @@ -27,7 +27,7 @@ def _get_error_message(response: httpx.Response) -> str: try: return response.json()["error"] except Exception: - return response.text + return f"\nReason: {response.text}\nRequest: {response.request}" def create_transcript( @@ -43,7 +43,7 @@ def create_transcript( ) if response.status_code != httpx.codes.ok: raise types.TranscriptError( - f"failed to transcript url {request.audio_url}: {_get_error_message(response)}" + f"failed to transcribe url {request.audio_url}: {_get_error_message(response)}" ) return types.TranscriptResponse.parse_obj(response.json()) diff --git a/assemblyai/transcriber.py b/assemblyai/transcriber.py index 913563b..e64d482 100644 --- a/assemblyai/transcriber.py +++ b/assemblyai/transcriber.py @@ -1,6 +1,5 @@ from __future__ import annotations -import base64 import concurrent.futures import functools import json @@ -987,6 +986,7 @@ def __init__( encoding: Optional[types.AudioEncoding] = None, token: Optional[str] = None, client: _client.Client, + end_utterance_silence_threshold: Optional[int], ) -> None: self._client = client self._websocket: Optional[websockets.sync.client.ClientConnection] = None @@ -999,8 +999,9 @@ def __init__( self._word_boost = word_boost self._encoding = encoding self._token = token + self._end_utterance_silence_threshold = end_utterance_silence_threshold - self._write_queue: queue.Queue[bytes] = queue.Queue() + self._write_queue: queue.Queue[Union[bytes, Dict]] = queue.Queue() self._write_thread = threading.Thread(target=self._write) self._read_thread = threading.Thread(target=self._read) self._stop_event = threading.Event() @@ -1048,6 +1049,11 @@ def connect( self._read_thread.start() self._write_thread.start() + if self._end_utterance_silence_threshold is not None: + self.configure_end_utterance_silence_threshold( + self._end_utterance_silence_threshold + ) + def stream(self, data: bytes) -> None: """ Streams audio data to the real-time service by putting it into a queue. @@ -1055,6 +1061,28 @@ def stream(self, data: bytes) -> None: self._write_queue.put(data) + def configure_end_utterance_silence_threshold( + self, threshold_milliseconds: int + ) -> None: + """ + Configures the end of utterance silence threshold. + Can be called multiple times during a session at any point after the session starts. + + Args: + `threshold_milliseconds`: The threshold in milliseconds. + """ + + self._write_queue.put( + _RealtimeEndUtteranceSilenceThreshold(threshold_milliseconds).as_dict() + ) + + def force_end_utterance(self) -> None: + """ + Forces the end of the current utterance. + """ + + self._write_queue.put(_RealtimeForceEndUtterance().as_dict()) + def close(self, terminate: bool = False) -> None: """ Closes the connection to the real-time service gracefully. @@ -1116,25 +1144,12 @@ def _write(self) -> None: if isinstance(data, dict): self._websocket.send(json.dumps(data)) elif isinstance(data, bytes): - self._websocket.send(self._encode_data(data)) + self._websocket.send(data) else: raise ValueError("unsupported message type") except websockets.exceptions.ConnectionClosed as exc: return self._handle_error(exc) - def _encode_data(self, data: bytes) -> str: - """ - Encodes the given audio chunk as a base64 string. - - This is a helper method for `_write`. - """ - - return json.dumps( - { - "audio_data": base64.b64encode(data).decode("utf-8"), - } - ) - def _handle_message( self, message: Dict[str, Any], @@ -1208,6 +1223,25 @@ def create_temporary_token( ) +class _RealtimeForceEndUtterance: + def as_dict(self) -> Dict[str, bool]: + return { + "force_end_utterance": True, + } + + +class _RealtimeEndUtteranceSilenceThreshold: + def __init__(self, threshold_milliseconds: int) -> None: + self._value = threshold_milliseconds + + @property + def value(self) -> int: + return self._value + + def as_dict(self) -> Dict[str, int]: + return {"end_utterance_silence_threshold": self._value} + + class RealtimeTranscriber: def __init__( self, @@ -1221,6 +1255,7 @@ def __init__( encoding: Optional[types.AudioEncoding] = None, token: Optional[str] = None, client: Optional[_client.Client] = None, + end_utterance_silence_threshold: Optional[int] = None, ) -> None: """ Creates a new real-time transcriber. @@ -1235,6 +1270,7 @@ def __init__( `encoding`: (Optional) The encoding of the audio data. `token`: (Optional) A temporary authentication token. `client`: (Optional) The client to use for the real-time service. + `end_utterance_silence_threshold`: (Optional) The end utterance silence threshold in milliseconds. """ self._client = client or _client.Client.get_default( @@ -1251,6 +1287,7 @@ def __init__( encoding=encoding, token=token, client=self._client, + end_utterance_silence_threshold=end_utterance_silence_threshold, ) def connect( @@ -1268,8 +1305,7 @@ def connect( self._impl.connect(timeout=timeout) def stream( - self, - data: Union[bytes, Generator[bytes, None, None], Iterable[bytes]], + self, data: Union[bytes, Generator[bytes, None, None], Iterable[bytes]] ) -> None: """ Streams raw audio data to the real-time service. @@ -1286,6 +1322,26 @@ def stream( for chunk in data: self._impl.stream(chunk) + def configure_end_utterance_silence_threshold( + self, threshold_milliseconds: int + ) -> None: + """ + Configures the silence duration threshold used to detect the end of an utterance. + In practice, it's used to tune how the transcriptions are split into final transcripts. + Can be called multiple times during a session at any point after the session starts. + + Args: + `threshold_milliseconds`: The threshold in milliseconds. + """ + self._impl.configure_end_utterance_silence_threshold(threshold_milliseconds) + + def force_end_utterance(self) -> None: + """ + Forces the end of the current utterance. + After calling this method, the server will end the current utterance and return a final transcript. + """ + self._impl.force_end_utterance() + def close(self) -> None: """ Closes the connection to the real-time service. diff --git a/setup.py b/setup.py index 5411478..2b0e666 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ setup( name="assemblyai", - version="0.21.0", + version="0.22.0", description="AssemblyAI Python SDK", author="AssemblyAI", author_email="engineering.sdk@assemblyai.com", diff --git a/tests/unit/test_realtime_transcriber.py b/tests/unit/test_realtime_transcriber.py index 9c46d00..aa4da55 100644 --- a/tests/unit/test_realtime_transcriber.py +++ b/tests/unit/test_realtime_transcriber.py @@ -274,38 +274,10 @@ def mocked_send(data: str): transcriber._impl._write() - # assert that the correct data was sent (base64 encoded) + # assert that the correct data was sent (= the exact input bytes) assert len(actual_sent) == 2 - assert json.loads(actual_sent[0]) == {"audio_data": "AQIDBAU="} - assert json.loads(actual_sent[1]) == {"audio_data": "BgcICQo="} - - -def test_realtime__encode_data(mocker: MockFixture): - """ - Tests the `_encode_data` method of the `_RealtimeTranscriberImpl` class. - """ - - audio_chunks = [ - bytes([1, 2, 3, 4, 5]), - bytes([6, 7, 8, 9, 10]), - ] - - expected_encoded_data = [ - json.dumps({"audio_data": "AQIDBAU="}), - json.dumps({"audio_data": "BgcICQo="}), - ] - - transcriber = aai.RealtimeTranscriber( - on_data=lambda _: None, - on_error=lambda _: None, - sample_rate=44_100, - ) - - actual_encoded_data = [] - for chunk in audio_chunks: - actual_encoded_data.append(transcriber._impl._encode_data(chunk)) - - assert actual_encoded_data == expected_encoded_data + assert actual_sent[0] == audio_chunks[0] + assert actual_sent[1] == audio_chunks[1] def test_realtime__handle_message_session_begins(mocker: MockFixture):