Skip to content

Commit

Permalink
Add error_threshold param to ffmpeg conversion jobs (#460)
Browse files Browse the repository at this point in the history
Co-authored-by: Albert Zeyer <[email protected]>
  • Loading branch information
curufinwe and albertz authored Nov 22, 2023
1 parent e5c9e24 commit 80bcfb2
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 6 deletions.
7 changes: 5 additions & 2 deletions audio/encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ def __init__(
recover_duration: Optional[bool] = None,
input_codec: Optional[str] = None,
input_codec_options: Optional[List[str]] = None,
error_threshold: int = 0,
):
"""
For all parameter holds that "None" means to use the ffmpeg defaults, which depend on the input file
Expand All @@ -55,8 +56,9 @@ def __init__(
There might be minimal differences when converting the encoding, so only set this to `False` if you're
willing to accept this risk. `None` (default) means that the duration is recovered if either `output_format`
or `codec` is specified because this might possibly lead to duration mismatches.
:param in_codec: specify the codec of the input file
:param in_codec_options: specify additional codec specific options for the in_codec
:param input_codec: specify the codec of the input file
:param input_codec_options: specify additional codec specific options for the in_codec
:param error_threshold: Allow upto this many files to fail conversion before failing this job
"""
ffmpeg_input_options = []
ffmpeg_options = []
Expand Down Expand Up @@ -104,4 +106,5 @@ def __init__(
output_format=output_format,
ffmpeg_binary=ffmpeg_binary,
hash_binary=hash_binary,
error_threshold=error_threshold,
)
28 changes: 24 additions & 4 deletions audio/ffmpeg.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ def __init__(
ffmpeg_binary: Optional[Union[str, tk.Path]] = None,
hash_binary: bool = False,
ffmpeg_input_options: Optional[List[str]] = None,
error_threshold: int = 0,
):
"""
Expand All @@ -92,6 +93,7 @@ def __init__(
:param hash_binary: In some cases it might be required to work with a specific ffmpeg version,
in which case the binary needs to be hashed
:param ffmpeg_input_options: list of ffmpeg parameters thare are applied for reading the input files
:param error_threshold: Allow upto this many files to fail conversion before failing this job
"""
self.corpus_file = corpus_file
self.ffmpeg_input_options = ffmpeg_input_options
Expand All @@ -100,9 +102,15 @@ def __init__(
self.output_format = output_format
self.ffmpeg_binary = ffmpeg_binary if ffmpeg_binary else "ffmpeg"
self.hash_binary = hash_binary
self.error_threshold = error_threshold
self.num_errors = 0
self.failed_files = []

self.out_audio_folder = self.output_path("audio/", directory=True)
self.out_corpus = self.output_path("corpus.xml.gz")
self.out_failed_files = None
if self.error_threshold > 0:
self.out_failed_files = self.output_path("failed_files.txt")

self.rqmt = {"time": 4, "cpu": 4, "mem": 8}

Expand Down Expand Up @@ -132,6 +140,10 @@ def run(self):
else:
c.dump(tk.uncached_path(self.out_corpus))

if self.out_failed_files is not None:
with open(self.out_failed_files.get_path(), "wt") as out:
out.write("\n".join(self.failed_files))

def run_recover_duration(self):
"""
Open all files with "soundfile" and extract the length information
Expand Down Expand Up @@ -189,13 +201,19 @@ def _perform_ffmpeg(self, recording: corpus.Recording):
"-y",
]
command_in = ["-i", recording.audio]
command_out = [
os.path.join(self.out_audio_folder.get_path(), audio_filename),
]
command_out = [target]
in_options = self.ffmpeg_input_options or []
out_options = self.ffmpeg_options or []
command = command_head + in_options + command_in + out_options + command_out
subprocess.check_call(command)
ret = subprocess.run(command, check=False)
if ret.returncode != 0:
self.num_errors += 1
os.remove(target)
self.failed_files.append(recording.audio)
if self.num_errors > self.error_threshold:
with open(self.out_failed_files.get_path(), "wt") as out:
out.write("\n".join(self.failed_files))
raise subprocess.SubprocessError("Error threshold exceeded")
else:
logging.info(f"skipped existing {target}")

Expand All @@ -204,4 +222,6 @@ def hash(cls, kwargs):
d = copy.copy(kwargs)
if not kwargs["hash_binary"]:
d.pop("ffmpeg_binary")
if kwargs["error_threshold"] == 0:
d.pop("error_threshold")
return super().hash(d)

0 comments on commit 80bcfb2

Please sign in to comment.