From dfeb78d044c925b8da03acd80f16ec2a23c72405 Mon Sep 17 00:00:00 2001 From: pprobst Date: Thu, 19 Oct 2023 16:59:07 -0300 Subject: [PATCH] Add more transforms --- audio/aug.py | 34 +++++++++++++++++++++++++++------- requirements.txt | 1 + run_audio_aug.py | 15 +++++++++++---- utils/files.py | 6 +++--- 4 files changed, 42 insertions(+), 14 deletions(-) diff --git a/audio/aug.py b/audio/aug.py index c965865..04d7cde 100644 --- a/audio/aug.py +++ b/audio/aug.py @@ -6,10 +6,32 @@ from typing import List AUG_PARAMS = { + # See a list of possible transforms here: https://iver56.github.io/audiomentations/ + # "p" is the probability of applying the transform "AddGaussianNoise": {"min_amplitude": 0.001, "max_amplitude": 0.015, "p": 1.0}, + "AddGaussianSNR": {"min_snr_db": 5.0, "max_snr_db": 40.0, "p": 0.5}, + "ClippingDistortion": { + "min_percentile_threshold": 0, + "max_percentile_threshold": 40, + "p": 0.5, + }, + "Gain": { + "min_gain_db": -12.0, + "max_gain_db": 12.0, + "p": 0.5, + }, + "GainTransition": { + "min_gain_db": -24.0, + "max_gain_db": 10.0, + "min_duration": 0.25, + "max_duration": 0.25, + "duration_unit": "fraction", + "p": 0.5, + }, + "Normalize": {"p": 0.5}, "TimeStretch": {"min_rate": 0.8, "max_rate": 1.25, "p": 0.5}, - "PitchShift": {"min_semitones": -4, "max_semitones": 4, "p": 0.5}, - "Shift": {"min_fraction": -0.5, "max_fraction": 0.5, "p": 0.5}, + "PitchShift": {"min_semitones": -0.5, "max_semitones": 0.5, "p": 0.5}, + "Shift": {"min_shift": -0.5, "max_shift": 0.5, "p": 0.5}, } @@ -30,12 +52,10 @@ def apply_augmentation( print(f"Invalid augmentation technique: {augmentation_name}") exit(1) - print("Augmentations: ", augmentation_list) augment = Compose(augmentation_list) - - # for transform in augment.transforms: - # print(f"{transform.__class__.__name__}: {transform.parameters}") - augmented_samples = augment(samples=samples, sample_rate=sample_rate) + for transform in augment.transforms: + print(f"{transform.__class__.__name__}: {transform.parameters}") + return augmented_samples diff --git a/requirements.txt b/requirements.txt index 8943971..9090607 100755 --- a/requirements.txt +++ b/requirements.txt @@ -9,6 +9,7 @@ nltk num2words tqdm gensim +soundfile librosa pydub audiomentations diff --git a/run_audio_aug.py b/run_audio_aug.py index 024b9ec..61b55b5 100755 --- a/run_audio_aug.py +++ b/run_audio_aug.py @@ -2,7 +2,8 @@ import argparse import os -import audiomentations as AA +import random +import numpy as np from audio.aug import apply_augmentation from utils.files import load_audio, save_audio @@ -32,12 +33,18 @@ nargs="+", help="Audiomentation techniques (e.g., AddGaussianNoise, PitchShift, TimeStretch)", ) + parser.add_argument( + "--seed", + type=int, + default=None, + help="Random seed for reproducible augmentations (default: None, generates a random seed)", + ) args = parser.parse_args() - if not os.path.exists(args.input_file): - print(f"Input file '{args.input_file}' does not exist.") - exit(1) + if args.seed is not None: + random.seed(args.seed) + np.random.seed(args.seed) audio, sr = load_audio(args.input_file) diff --git a/utils/files.py b/utils/files.py index 1876827..b77e773 100644 --- a/utils/files.py +++ b/utils/files.py @@ -7,7 +7,7 @@ from pathlib import Path from pydub import AudioSegment -from typing import List, Optional, Union +from typing import List, Optional, Union, Tuple from .text import pre_process_sentences @@ -90,7 +90,7 @@ def download_and_extract(url: str, target_file: str) -> None: ) -def load_audio(audio_file: str) -> Union[tuple, None]: +def load_audio(audio_file: str) -> Tuple[np.ndarray, float]: """ Read an audio file using Librosa. Convert to WAV if not in WAV format. @@ -115,7 +115,7 @@ def load_audio(audio_file: str) -> Union[tuple, None]: return y, sr except Exception as e: print(f"Error loading or converting the audio file: {e}") - return None + raise e def save_audio(