-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwhisperize_content.py
executable file
·120 lines (100 loc) · 5.48 KB
/
whisperize_content.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#!/bin/env python3
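"""
Run Whisper on the media files found in the subdirectories of a work directory.

Every .wav/.mp4 file is transcribed once per combination of Whisper model,
audio filter and previous-text setting, and each result is written next to the
media file as ``<name>.whisper.<model>_<previous_text>_<audio_filter>.json``.
"""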
import argparse
import json
import logging
from pathlib import Path
import subprocess
from utils import get_duration, human_time
import whisper
import torch
import time
from tempfile import NamedTemporaryFile
# Whisper checkpoints to evaluate, in the order they are loaded.
_MODEL_NAMES = ('small', 'medium', 'large-v2', 'large-v3')

# Audio pre-processing variants: tag used in the output file name -> extra
# ffmpeg arguments.  'X' adds no filter, 'A' runs ffmpeg's afftdn filter and
# 'B' its volume filter.
_AUDIO_FILTERS = {
    'X': [],
    'A': ['-af', 'afftdn=nr=10:nf=-25:tn=1'],
    'B': ['-af', 'volume=4'],
}

# File extensions that are treated as media to transcribe.
_MEDIA_SUFFIXES = (".wav", ".mp4")

# Tags for the condition_on_previous_text variants ('T' = enabled).
_PREVIOUS_TEXT_TAGS = ('T', 'F')


def _resume_cutoff(workdir: Path):
    """Return the mtime cutoff used when resuming an interrupted run.

    The cutoff is one second before the modification time of the newest
    ``*.whisper.*.json`` file below *workdir*, so that a transcript that was
    being written when the previous run was killed is produced again.  When
    no transcript exists the cutoff is -1, which skips nothing.
    """
    newest_name, newest_mtime = 'null_file', 0
    for transcript in workdir.glob("**/*.whisper.*.json"):
        # Track the maximum directly: keying a dict on the bare file name
        # would let same-named transcripts in different directories
        # overwrite each other.
        mtime = transcript.stat().st_mtime
        if mtime > newest_mtime:
            newest_name, newest_mtime = transcript.name, mtime
    cutoff = newest_mtime - 1
    logging.info(f"Will resume for files that are newer than {newest_name} ({cutoff})")
    return cutoff


def _pending_transcripts(media_file: Path, model_name, audio_filter, resume_time):
    """Return the (previous_text, output_path) pairs that still need transcribing.

    An output is skipped only when it already exists and is older than
    *resume_time*; with a *resume_time* of 0 nothing is ever skipped.
    """
    pending = []
    for previous_text in _PREVIOUS_TEXT_TAGS:
        whisper_file = media_file.with_suffix(f".whisper.{model_name}_{previous_text}_{audio_filter}.json")
        if whisper_file.exists() and whisper_file.stat().st_mtime < resume_time:
            logging.info(f"Skipping creation of {whisper_file.name} since it already exists")
            continue
        pending.append((previous_text, whisper_file))
    return pending


def _transcribe_with_filter(model, model_name, device, media_file: Path,
                            audio_filter, audio_filter_args, resume_time):
    """Filter one media file with ffmpeg and write its outstanding transcripts.

    The media file is converted to a temporary 16-bit PCM, 44.1 kHz, stereo
    WAV (with *audio_filter_args* applied) and transcribed once for every
    previous-text variant whose JSON output is still pending.  If ffmpeg
    fails, the error is logged and this media/filter combination is skipped.
    """
    # Decide what is left to do first, so a resumed run does not pay for an
    # ffmpeg pass whose transcripts all exist already.
    pending = _pending_transcripts(media_file, model_name, audio_filter, resume_time)
    if not pending:
        return

    with NamedTemporaryFile(suffix=".wav") as filtered_audio:
        p = subprocess.run(['ffmpeg', '-y', '-i', str(media_file), *audio_filter_args,
                            '-c:a', 'pcm_s16le', '-ar', '44100', '-ac', '2', filtered_audio.name],
                           stdin=subprocess.DEVNULL, stdout=subprocess.PIPE,
                           stderr=subprocess.STDOUT, encoding='utf-8')
        if p.returncode != 0:
            logging.error(f"Cannot filter {media_file} with {audio_filter_args}: {p.stdout}")
            return

        for previous_text, whisper_file in pending:
            logging.info(f"Transcribing {media_file} with model {model_name}, previous_text {previous_text}, audio_filter: {audio_filter}")
            # The recorded runtime covers the duration probe, the audio load
            # and the transcription itself.
            whisper_start = time.time()
            duration = get_duration(media_file)
            audio = whisper.load_audio(filtered_audio.name)
            # The language is fixed to English; no automatic detection is done.
            language = "en"
            res = whisper.transcribe(model, audio,
                                     word_timestamps=True,
                                     language=language,
                                     # previous_text is the tag 'T' or 'F';
                                     # comparing it with True was always False.
                                     condition_on_previous_text=previous_text == 'T')
            # Record how this transcript was produced alongside the result.
            res['_job'] = {
                'runtime': time.time() - whisper_start,
                'duration': duration,
                'device': device,
                'language': language,
                'model': model_name,
                'previous_text': previous_text,
                'audio_filter': audio_filter
            }
            with open(whisper_file, "w", encoding="utf-8") as f:
                json.dump(res, f, indent=4)
            logging.info(f"Finished transcribing {media_file}. {human_time(duration)} of content in {human_time(res['_job']['runtime'])}")


def main():
    """Transcribe every media file in the work directory with each Whisper variant.

    Command line: ``[--debug] [--resume] workdir``.  For each model, each
    .wav/.mp4 file matching ``workdir/*/*``, each audio filter and each
    previous-text setting, a JSON transcript is written next to the media
    file.  With ``--resume``, transcripts older than the resume cutoff are
    kept instead of being regenerated.

    Raises:
        SystemExit: with status 1 when *workdir* is not a directory.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--debug", default=False, action="store_true", help="Turn on debugging")
    parser.add_argument("--resume", default=False, action="store_true", help="Resume where it left off")
    parser.add_argument("workdir", type=Path, help="Root for the media files")
    args = parser.parse_args()
    logging.basicConfig(format="%(asctime)s [%(levelname)-8s] (%(filename)s:%(lineno)d:%(process)d) %(message)s",
                        level=logging.DEBUG if args.debug else logging.INFO)

    workdir: Path = args.workdir
    if not workdir.is_dir():
        logging.error("The workdir must be a directory")
        # Raise directly rather than calling exit(), which is a convenience
        # installed by the site module and not guaranteed to be present.
        raise SystemExit(1)

    resume_time = _resume_cutoff(workdir) if args.resume else 0

    # Set up whisper
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    logging.info(f"Whisper will use {device} for computation")

    # Run through all of the permutations.  The model is the outermost loop
    # so that each one is loaded only once.
    for model_name in _MODEL_NAMES:
        logging.info(f"Loading model {model_name}")
        model = whisper.load_model(model_name, device=device)
        for media_file in workdir.glob("*/*"):
            if media_file.suffix not in _MEDIA_SUFFIXES:
                logging.debug(f"Skipping file: {media_file}")
                continue
            for audio_filter, audio_filter_args in _AUDIO_FILTERS.items():
                _transcribe_with_filter(model, model_name, device, media_file,
                                        audio_filter, audio_filter_args, resume_time)

        # Drop the model before loading the next one to keep peak memory down.
        logging.info("Unloading Whisper Model")
        del model
        if device == "cuda":
            torch.cuda.empty_cache()


if __name__ == "__main__":
    main()