From dfeb78d044c925b8da03acd80f16ec2a23c72405 Mon Sep 17 00:00:00 2001
From: pprobst <pprobst@insiberia.net>
Date: Thu, 19 Oct 2023 16:59:07 -0300
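Subject: [PATCH] Add more transforms

* Extend AUG_PARAMS with AddGaussianSNR, ClippingDistortion, Gain,
  GainTransition and Normalize; narrow PitchShift to +/-0.5 semitones and
  switch Shift to the min_shift/max_shift arguments.
* Print each transform's parameters after the augmentation has been applied,
  replacing the earlier print of the augmentation list.
* Add a --seed option to run_audio_aug.py that seeds both random and
  numpy.random when given.
* Drop the explicit input-file existence check in run_audio_aug.py;
  load_audio now re-raises loading errors instead of returning None.
* Add soundfile to requirements.txt.
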
---
 audio/aug.py     | 34 +++++++++++++++++++++++++++-------
 requirements.txt |  1 +
 run_audio_aug.py | 15 +++++++++++----
 utils/files.py   |  6 +++---
 4 files changed, 42 insertions(+), 14 deletions(-)

diff --git a/audio/aug.py b/audio/aug.py
index c965865..04d7cde 100644
--- a/audio/aug.py
+++ b/audio/aug.py
@@ -6,10 +6,32 @@
 from typing import List
 
 AUG_PARAMS = {
+    # "p" is the probability of applying a transform; full list: https://iver56.github.io/audiomentations/
+    # NOTE(review): Shift's min_shift/max_shift look like audiomentations >= 0.33 names; confirm and pin.
     "AddGaussianNoise": {"min_amplitude": 0.001, "max_amplitude": 0.015, "p": 1.0},
+    "AddGaussianSNR": {"min_snr_db": 5.0, "max_snr_db": 40.0, "p": 0.5},
+    "ClippingDistortion": {
+        "min_percentile_threshold": 0,
+        "max_percentile_threshold": 40,
+        "p": 0.5,
+    },
+    "Gain": {
+        "min_gain_db": -12.0,
+        "max_gain_db": 12.0,
+        "p": 0.5,
+    },
+    "GainTransition": {
+        "min_gain_db": -24.0,
+        "max_gain_db": 10.0,
+        "min_duration": 0.25,
+        "max_duration": 0.25,
+        "duration_unit": "fraction",
+        "p": 0.5,
+    },
+    "Normalize": {"p": 0.5},
     "TimeStretch": {"min_rate": 0.8, "max_rate": 1.25, "p": 0.5},
-    "PitchShift": {"min_semitones": -4, "max_semitones": 4, "p": 0.5},
-    "Shift": {"min_fraction": -0.5, "max_fraction": 0.5, "p": 0.5},
+    "PitchShift": {"min_semitones": -0.5, "max_semitones": 0.5, "p": 0.5},
+    "Shift": {"min_shift": -0.5, "max_shift": 0.5, "p": 0.5},
 }
 
 
@@ -30,12 +52,10 @@ def apply_augmentation(
             print(f"Invalid augmentation technique: {augmentation_name}")
             exit(1)
 
-    print("Augmentations: ", augmentation_list)
     augment = Compose(augmentation_list)
-
-    # for transform in augment.transforms:
-    #    print(f"{transform.__class__.__name__}: {transform.parameters}")
-
     augmented_samples = augment(samples=samples, sample_rate=sample_rate)
 
+    for transform in augment.transforms:
+        print(f"{transform.__class__.__name__}: {transform.parameters}")
+
     return augmented_samples
diff --git a/requirements.txt b/requirements.txt
index 8943971..9090607 100755
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,6 +9,7 @@ nltk
 num2words
 tqdm
 gensim
+soundfile
 librosa
 pydub
 audiomentations
diff --git a/run_audio_aug.py b/run_audio_aug.py
index 024b9ec..61b55b5 100755
--- a/run_audio_aug.py
+++ b/run_audio_aug.py
@@ -2,7 +2,8 @@
 
 import argparse
 import os
-import audiomentations as AA
+import random
+import numpy as np
 
 from audio.aug import apply_augmentation
 from utils.files import load_audio, save_audio
@@ -32,12 +33,18 @@
         nargs="+",
         help="Audiomentation techniques (e.g., AddGaussianNoise, PitchShift, TimeStretch)",
     )
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=None,
+        help="Random seed for reproducible augmentations (default: None, generates a random seed)",
+    )
 
     args = parser.parse_args()
 
-    if not os.path.exists(args.input_file):
-        print(f"Input file '{args.input_file}' does not exist.")
-        exit(1)
+    if args.seed is not None:
+        random.seed(args.seed)
+        np.random.seed(args.seed)
 
     audio, sr = load_audio(args.input_file)
 
diff --git a/utils/files.py b/utils/files.py
index 1876827..b77e773 100644
--- a/utils/files.py
+++ b/utils/files.py
@@ -7,7 +7,7 @@
 
 from pathlib import Path
 from pydub import AudioSegment
-from typing import List, Optional, Union
+from typing import List, Optional, Union, Tuple
 from .text import pre_process_sentences
 
 
@@ -90,7 +90,7 @@ def download_and_extract(url: str, target_file: str) -> None:
             )
 
 
-def load_audio(audio_file: str) -> Union[tuple, None]:
+def load_audio(audio_file: str) -> Tuple[np.ndarray, float]:
     """
     Read an audio file using Librosa. Convert to WAV if not in WAV format.
 
@@ -115,7 +115,7 @@ def load_audio(audio_file: str) -> Union[tuple, None]:
         return y, sr
     except Exception as e:
         print(f"Error loading or converting the audio file: {e}")
-        return None
+        raise e
 
 
 def save_audio(