Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactoring and new response types #42

Merged
merged 6 commits into from
Jul 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 3 additions & 6 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,19 +25,16 @@ repos:
hooks:
- id: hadolint
- repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff version.
rev: v0.5.0
hooks:
# Run the linter.
- id: ruff
args: [--fix]
# Run the formatter.
- id: ruff-format
- repo: https://github.com/google/yamlfmt
rev: v0.13.0
hooks:
- id: yamlfmt
- repo: https://github.com/RobertCraigie/pyright-python
rev: v1.1.369
- repo: https://github.com/DetachHead/basedpyright-pre-commit-mirror
rev: v1.13.0
hooks:
- id: pyright
- id: basedpyright
2 changes: 1 addition & 1 deletion examples/live-audio/script.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,4 @@ docker run --detach --gpus=all --publish 8000:8000 --volume ~/.cache/huggingface

# `pv` is used to limit the rate at which the audio is streamed to the server. Audio is being streamed at a rate of 32 kB/s (16000 sample rate * 16-bit sample / 8 bits per byte = 32000 bytes per second). This emulates live audio input from a microphone: `ffmpeg -loglevel quiet -f alsa -i default -ac 1 -ar 16000 -f s16le`
# shellcheck disable=SC2002
cat audio.pcm | pv -qL 32000 | websocat --no-close --binary ws://localhost:8000/v1/audio/transcriptions?language=en
cat audio.pcm | pv -qL 32000 | websocat --no-close --binary 'ws://localhost:8000/v1/audio/transcriptions?language=en'
22 changes: 3 additions & 19 deletions faster_whisper_server/asr.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
import asyncio
from collections.abc import Iterable
import time

from faster_whisper import transcribe

from faster_whisper_server.audio import Audio
from faster_whisper_server.core import Transcription, Word
from faster_whisper_server.core import Segment, Transcription, Word
from faster_whisper_server.logger import logger


Expand All @@ -30,7 +29,8 @@ def _transcribe(
word_timestamps=True,
**self.transcribe_opts,
)
words = words_from_whisper_segments(segments)
segments = Segment.from_faster_whisper_segments(segments)
words = Word.from_segments(segments)
for word in words:
word.offset(audio.start)
transcription = Transcription(words)
Expand All @@ -54,19 +54,3 @@ async def transcribe(
audio,
prompt,
)


def words_from_whisper_segments(segments: Iterable[transcribe.Segment]) -> list[Word]:
    """Flatten the word-level timestamps of faster-whisper segments into one list.

    Each segment is expected to carry per-word timing data (i.e. transcription
    was run with `word_timestamps=True`), so `segment.words` must not be None.
    """
    collected: list[Word] = []
    for seg in segments:
        assert seg.words is not None
        for whisper_word in seg.words:
            collected.append(
                Word(
                    start=whisper_word.start,
                    end=whisper_word.end,
                    text=whisper_word.word,
                    probability=whisper_word.probability,
                )
            )
    return collected
39 changes: 10 additions & 29 deletions faster_whisper_server/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,35 +15,8 @@ class ResponseFormat(enum.StrEnum):
TEXT = "text"
JSON = "json"
VERBOSE_JSON = "verbose_json"
# NOTE: While inspecting outputs of these formats with `curl`, I noticed there's one or two "\n" inserted at the end of the response. # noqa: E501

# VTT = "vtt" # TODO
# 1
# 00:00:00,000 --> 00:00:09,220
# In his video on Large Language Models or LLMs, OpenAI co-founder and YouTuber Andrej Karpathy
#
# 2
# 00:00:09,220 --> 00:00:12,280
# likened LLMs to operating systems.
#
# 3
# 00:00:12,280 --> 00:00:13,280
# Karpathy said,
#
# SRT = "srt" # TODO
# WEBVTT
#
# 00:00:00.000 --> 00:00:09.220
# In his video on Large Language Models or LLMs, OpenAI co-founder and YouTuber Andrej Karpathy
#
# 00:00:09.220 --> 00:00:12.280
# likened LLMs to operating systems.
#
# 00:00:12.280 --> 00:00:13.280
# Karpathy said,
#
# 00:00:13.280 --> 00:00:19.799
# I see a lot of equivalence between this new LLM OS and operating systems of today.
SRT = "srt"
VTT = "vtt"


class Device(enum.StrEnum):
Expand Down Expand Up @@ -195,6 +168,14 @@ class Config(BaseSettings):
model_config = SettingsConfigDict(env_nested_delimiter="__")

log_level: str = "info"
host: str = Field(alias="UVICORN_HOST", default="0.0.0.0")
port: int = Field(alias="UVICORN_PORT", default=8000)

enable_ui: bool = True
"""
Whether to enable the Gradio UI. You may want to disable this if you want to minimize the dependencies.
"""

default_language: Language | None = None
default_response_format: ResponseFormat = ResponseFormat.JSON
whisper: WhisperConfig = WhisperConfig()
Expand Down
Loading