Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactoring and new response types #42

Merged
merged 6 commits into from
Jul 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 3 additions & 6 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,19 +25,16 @@ repos:
hooks:
- id: hadolint
- repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff version.
rev: v0.5.0
hooks:
# Run the linter.
- id: ruff
args: [--fix]
# Run the formatter.
- id: ruff-format
- repo: https://github.com/google/yamlfmt
rev: v0.13.0
hooks:
- id: yamlfmt
- repo: https://github.com/RobertCraigie/pyright-python
rev: v1.1.369
- repo: https://github.com/DetachHead/basedpyright-pre-commit-mirror
rev: v1.13.0
hooks:
- id: pyright
- id: basedpyright
2 changes: 1 addition & 1 deletion examples/live-audio/script.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,4 @@ docker run --detach --gpus=all --publish 8000:8000 --volume ~/.cache/huggingface

# `pv` is used to limit the rate at which the audio is streamed to the server. Audio is being streamed at a rate of 32 kB/s (16000 sample rate * 16-bit sample / 8 bits per byte = 32000 bytes per second). This emulates live audio input from a microphone: `ffmpeg -loglevel quiet -f alsa -i default -ac 1 -ar 16000 -f s16le`
# shellcheck disable=SC2002
cat audio.pcm | pv -qL 32000 | websocat --no-close --binary ws://localhost:8000/v1/audio/transcriptions?language=en
cat audio.pcm | pv -qL 32000 | websocat --no-close --binary 'ws://localhost:8000/v1/audio/transcriptions?language=en'
22 changes: 3 additions & 19 deletions faster_whisper_server/asr.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
import asyncio
from collections.abc import Iterable
import time

from faster_whisper import transcribe

from faster_whisper_server.audio import Audio
from faster_whisper_server.core import Transcription, Word
from faster_whisper_server.core import Segment, Transcription, Word
from faster_whisper_server.logger import logger


Expand All @@ -30,7 +29,8 @@ def _transcribe(
word_timestamps=True,
**self.transcribe_opts,
)
words = words_from_whisper_segments(segments)
segments = Segment.from_faster_whisper_segments(segments)
words = Word.from_segments(segments)
for word in words:
word.offset(audio.start)
transcription = Transcription(words)
Expand All @@ -54,19 +54,3 @@ async def transcribe(
audio,
prompt,
)


def words_from_whisper_segments(segments: Iterable[transcribe.Segment]) -> list[Word]:
    """Flatten the word-level timestamps of faster-whisper segments into one list.

    Each segment is expected to carry per-word timing data (i.e. transcription
    was run with `word_timestamps=True`), so `segment.words` must not be None.
    """
    collected: list[Word] = []
    for seg in segments:
        assert seg.words is not None
        for whisper_word in seg.words:
            collected.append(
                Word(
                    start=whisper_word.start,
                    end=whisper_word.end,
                    text=whisper_word.word,
                    probability=whisper_word.probability,
                )
            )
    return collected
39 changes: 10 additions & 29 deletions faster_whisper_server/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,35 +15,8 @@ class ResponseFormat(enum.StrEnum):
TEXT = "text"
JSON = "json"
VERBOSE_JSON = "verbose_json"
# NOTE: While inspecting outputs of these formats with `curl`, I noticed there's one or two "\n" inserted at the end of the response. # noqa: E501

# VTT = "vtt" # TODO
# 1
# 00:00:00,000 --> 00:00:09,220
# In his video on Large Language Models or LLMs, OpenAI co-founder and YouTuber Andrej Karpathy
#
# 2
# 00:00:09,220 --> 00:00:12,280
# likened LLMs to operating systems.
#
# 3
# 00:00:12,280 --> 00:00:13,280
# Karpathy said,
#
# SRT = "srt" # TODO
# WEBVTT
#
# 00:00:00.000 --> 00:00:09.220
# In his video on Large Language Models or LLMs, OpenAI co-founder and YouTuber Andrej Karpathy
#
# 00:00:09.220 --> 00:00:12.280
# likened LLMs to operating systems.
#
# 00:00:12.280 --> 00:00:13.280
# Karpathy said,
#
# 00:00:13.280 --> 00:00:19.799
# I see a lot of equivalence between this new LLM OS and operating systems of today.
SRT = "srt"
VTT = "vtt"


class Device(enum.StrEnum):
Expand Down Expand Up @@ -195,6 +168,14 @@ class Config(BaseSettings):
model_config = SettingsConfigDict(env_nested_delimiter="__")

log_level: str = "info"
host: str = Field(alias="UVICORN_HOST", default="0.0.0.0")
port: int = Field(alias="UVICORN_PORT", default=8000)

enable_ui: bool = True
"""
Whether to enable the Gradio UI. You may want to disable this if you want to minimize the dependencies.
"""

default_language: Language | None = None
default_response_format: ResponseFormat = ResponseFormat.JSON
whisper: WhisperConfig = WhisperConfig()
Expand Down
Loading