forked from ufal/whisper_streaming
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathserver.py
89 lines (73 loc) · 2.89 KB
/
server.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import io
import logging
import socket
import librosa
import numpy as np
import soundfile
from whisper_online import SAMPLING_RATE, FasterWhisperASR, OnlineASRProcessor
# Address and port the transcription server binds to.
HOST = "localhost"
PORT = 65432

# Level applied both to the root logging configuration and to this module's logger.
LOG_LEVEL = logging.INFO

# Maximum number of bytes requested from a client socket per recv() call.
BUFFER_SIZE = 4096

# Incoming audio is accumulated until more than this many samples are
# available before it is handed to the recognizer.
MIN_SECONDS_TO_PROCESS = 3
MIN_SAMPLES_TO_PROCESS = MIN_SECONDS_TO_PROCESS * SAMPLING_RATE

logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=LOG_LEVEL,
)
logger = logging.getLogger(__name__)
logger.setLevel(LOG_LEVEL)

# One recognizer instance, created at import time and reused by every connection.
asr = FasterWhisperASR(
    model_size="tiny.en",
    language="en",
)
def timestamped_segments_to_text(segments):
    """Concatenate the text part of each segment into a single string.

    Args:
        segments: Iterable of indexable items whose element at index 2 is a
            string (presumably ``(start, end, text)`` tuples -- confirm
            against the ASR processor that produces them).

    Returns:
        The index-2 strings joined in order with no separator; an empty
        string when ``segments`` is empty.
    """
    pieces = []
    for entry in segments:
        pieces.append(entry[2])
    return "".join(pieces)
def process_incoming_data(data: bytes) -> np.ndarray:
    """Decode a chunk of raw audio bytes into a float32 sample array.

    The bytes are read as headerless ("RAW") mono, little-endian, 16-bit PCM
    declared at ``SAMPLING_RATE``, then loaded through librosa at that same
    rate.

    Args:
        data: Raw PCM bytes as received from the client.

    Returns:
        The decoded audio samples as returned by ``librosa.load``.
    """
    raw_stream = io.BytesIO(data)
    pcm_reader = soundfile.SoundFile(
        raw_stream,
        format="RAW",
        subtype="PCM_16",
        endian="LITTLE",
        channels=1,
        samplerate=SAMPLING_RATE,
    )
    # librosa.load returns (samples, sample_rate); only the samples are needed.
    loaded = librosa.load(pcm_reader, sr=SAMPLING_RATE, dtype=np.float32)
    return loaded[0]
def handle_connection(client_socket: socket.socket):
    """Stream audio from one client through the online ASR until it disconnects.

    Raw PCM bytes are read from ``client_socket`` in ``BUFFER_SIZE`` chunks,
    decoded, and accumulated. Each time more than ``MIN_SAMPLES_TO_PROCESS``
    samples are pending they are fed to a fresh ``OnlineASRProcessor`` and the
    full committed transcription so far is sent back to the client.

    Args:
        client_socket: Connected socket the client streams audio over.

    Returns:
        The committed transcription text once the client has closed its
        sending side.

    Raises:
        ConnectionResetError: Propagated from the socket calls if the peer
            resets the connection.
    """
    # process_incoming_data decodes 16-bit PCM, i.e. two bytes per sample.
    bytes_per_sample = 2

    online_asr = OnlineASRProcessor(asr, buffer_trimming_sec=15)
    samples = np.array([], dtype=np.float32)
    leftover = b""

    while True:
        data = client_socket.recv(BUFFER_SIZE)
        if not data:
            # recv() returning b"" means the peer closed the stream; without
            # leaving the loop here it would spin forever on empty reads.
            break

        # TCP may split the stream anywhere, including mid-sample. Keep a
        # trailing odd byte for the next chunk so samples stay aligned.
        data = leftover + data
        aligned_len = len(data) - len(data) % bytes_per_sample
        leftover = data[aligned_len:]
        data = data[:aligned_len]
        if not data:
            continue

        new_samples = process_incoming_data(data)
        samples = np.append(samples, new_samples)
        if len(samples) > MIN_SAMPLES_TO_PROCESS:
            logger.info(f"Transcribing {len(samples)} samples")
            online_asr.insert_audio_chunk(samples.copy())
            online_asr.process_iter()
            samples = np.array([], dtype=np.float32)
            logger.info(
                f"Transcription so far: {timestamped_segments_to_text(online_asr.commited)}"
            )
            client_socket.sendall(
                timestamped_segments_to_text(online_asr.commited).encode("utf-8")
            )

    # Audio that never reached the processing threshold would otherwise be
    # dropped; hand it to the recognizer before finishing.
    if len(samples) > 0:
        logger.info(f"Transcribing {len(samples)} samples")
        online_asr.insert_audio_chunk(samples)
        online_asr.process_iter()
    online_asr.finish()
    return timestamped_segments_to_text(online_asr.commited)
# Accept clients one at a time and run each through handle_connection.
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as server_socket:
    server_socket.bind((HOST, PORT))
    server_socket.listen()
    # Log only after bind/listen succeeded so the message is truthful.
    logger.info(f"Server listening on {HOST}:{PORT}")
    while True:
        client_socket, client_addr = server_socket.accept()
        # Close every accepted socket when its session ends, whether it
        # finished normally or raised.
        with client_socket:
            # Use the address returned by accept(): getpeername() can itself
            # fail once the peer has reset the connection.
            logger.info(f"Received connection from {client_addr}")
            try:
                transcription = handle_connection(client_socket)
            except ConnectionResetError:
                logger.info(f"Connection reset by {client_addr}")
            # TODO: optionally log the final transcription and send it to the
            # client here, before the socket is closed.