diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..d811ce4 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,9 @@ +data +data/* +data/*/* +tmp +tmp/* +profiler +profiler/* +profiler_env/* +core diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..555662c --- /dev/null +++ b/.gitattributes @@ -0,0 +1,4 @@ +*.h5 filter=lfs diff=lfs merge=lfs -text +*.jpg filter=lfs diff=lfs merge=lfs -text +*.png filter=lfs diff=lfs merge=lfs -text +*.pdf filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore index c4ff7bf..cf2eb87 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,3 @@ - # Created by https://www.toptal.com/developers/gitignore/api/vim,python,pycharm+all,jupyternotebooks # Edit at https://www.toptal.com/developers/gitignore?templates=vim,python,pycharm+all,jupyternotebooks @@ -277,3 +276,5 @@ extended_prototype/saved_weights/* extended_prototype/soundbank/* extended_prototype/soundscapes/* extended_prototype/test_segmentations/* +saved_weights/* +experiments/* diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..0ad422e --- /dev/null +++ b/Dockerfile @@ -0,0 +1,21 @@ +FROM tensorflow/tensorflow:2.4.0-gpu + +RUN apt-get update && apt-get install -y apt-transport-https +RUN apt-get install -y libtcmalloc-minimal4 +RUN apt-get install -y sox + +RUN apt install -y libsndfile1 +RUN apt install -y libsm6 libxext6 libxrender-dev + +RUN pip install --upgrade pip + +WORKDIR /tf + +RUN mkdir /assets + +COPY requirements.txt /assets/requirements.txt +RUN pip install -r /assets/requirements.txt --upgrade --no-cache-dir + +COPY . /tf/ + +RUN ./scripts/install.sh diff --git a/README.md b/README.md index 9066f3a..d39f9dc 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ # Soundscape Generation +Generate soundscapes from images. + ## Table of Contents 1. 
[Installation](#installation) @@ -17,7 +19,7 @@ Follow the instructions give in the following link: * [Scaper installation](https://scaper.readthedocs.io/en/latest/installation.html) -### Download Dependencies +### Install Dependencies ```bash pip install -r requirements.txt @@ -30,7 +32,7 @@ on [www.cityscapes-dataset.com](https://www.cityscapes-dataset.com/). After the script. During the download, it will ask you to provide your email and password for authentification. ```bash -./download_data.sh +./scripts/download_data.sh ``` ## Usage @@ -41,31 +43,35 @@ finetuned on the Cityscapes dataset. ### Train Object Segmentation Network -To train the network, run the follwing command. +To train the network, run the following command. The hyperparameters epoch and batch size can be configured in the `docker-compose.yml` file. To load a pre-trained model, specify its path in the `MODEL_TO_LOAD` variable; if the variable is `None`, the model is trained from scratch. ```bash -python train.py --num_epochs 70 --batch_size 8 --evaluate_every 1 --save_weights_every 1 +docker-compose up train_object_detection ``` -By default, training resumes from the latest saved checkpoint. If the `checkpoints/` directory is missing, the training -starts from scratch. - ### Test the Segmentation Network -Run the following command to predict the semantic segmentation of every image in the `test_images/` directory (note: -results are saved in the `test_segmentations/` directory) +Run the following command to predict the semantic segmentation of every image in the `--test_images` directory (note: +predictions are saved with the same name and a `_pred.jpg` suffix). Ensure that you specify the correct image's file type in `--test_images_type`. ```bash -python predict.py +docker-compose up predict_object_detection ``` -Ensure that you specify the image's file type in the image path variable in `predict.py`. 
+### Evaluate the Segmentation Network +To evaluate the segmentation network, run the command below. + +```bash +docker-compose up evaluation +``` ### Generate soundscapes -Run the file soundGeneration.py to generate soundscapes of every image in the `test_images/` directory (note: results -are saved in the `soundscapes/` directory). Ensure that you specify the image type of the image in the image path -variable of `predict.py`. +To generate soundscapes of every image in the `--test_images` directory, run the following command. The generated audio files are saved in `data/generated_soundscapes/`. Ensure that you specify the correct image's file type in `--test_images_type`. + +```bash +docker-compose up sound_generation +``` ## Results diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..75815f3 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,71 @@ +version: "3.2" +services: + train_object_detection: + build: . + volumes: + - ${PWD}:/tf/ + working_dir: /tf + command: bash -c "./scripts/train_object_detection.sh" + devices: + - /dev/nvidia0 + environment: + NVIDIA_VISIBLE_DEVICES: 0 + EPOCHS: 70 + BATCH_SIZE: 8 + MODEL_TO_LOAD: None + deploy: + resources: + reservations: + devices: + - capabilities: [ gpu ] + + predict_object_detection: + build: . + volumes: + - ${PWD}:/tf/ + working_dir: /tf + command: bash -c "./scripts/predict_object_detection.sh" + devices: + - /dev/nvidia0 + environment: + NVIDIA_VISIBLE_DEVICES: 0 + WEIGHTS_PATH: None + deploy: + resources: + reservations: + devices: + - capabilities: [ gpu ] + + sound_generation: + build: . + volumes: + - ${PWD}:/tf/ + working_dir: /tf + command: bash -c "./scripts/sound_generation.sh" + devices: + - /dev/nvidia0 + environment: + NVIDIA_VISIBLE_DEVICES: 0 + WEIGHTS_PATH: None + deploy: + resources: + reservations: + devices: + - capabilities: [ gpu ] + + evaluation: + build: . 
+ volumes: + - ${PWD}:/tf/ + working_dir: /tf + command: bash -c "./scripts/evaluation.sh" + devices: + - /dev/nvidia0 + environment: + NVIDIA_VISIBLE_DEVICES: 0 + WEIGHTS_PATH: None + deploy: + resources: + reservations: + devices: + - capabilities: [ gpu ] diff --git a/experiments/Cityscapes/ERFNet-Pretrained/pretrained.h5 b/experiments/Cityscapes/ERFNet-Pretrained/pretrained.h5 new file mode 100644 index 0000000..e2f3ec8 --- /dev/null +++ b/experiments/Cityscapes/ERFNet-Pretrained/pretrained.h5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3addb8d58982ca0362c6e9cfccdd19bac0b40f85df54ff6f924da0e261c91fef +size 8545920 diff --git a/requirements.txt b/requirements.txt index 17f1f3e..d826c03 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,15 +1,13 @@ -python==3.8.0 -numpy==1.19.5 -pandas==1.2.4 -scikit-learn==0.24.2 -matplotlib==3.4.2 -scipy==1.6.3 -sox==1.4.0 -ffmpeg==4.3.1 -ffmpeg-python==0.2.0 +numpy +pandas +scikit-learn +matplotlib +scipy +Pillow +sox +ffmpeg +ffmpeg-python scaper==1.6.5 -tensorflow==2.5.0 -cudatoolkit==11.0.221 -tensorflow-addons==0.13.0 -pillow==7.1.2 -cityscapesscripts==2.2.0 \ No newline at end of file +tensorflow>=2.5.0 +tensorflow-addons>=0.13.0 +cityscapesscripts==2.2.0 diff --git a/download_data.sh b/scripts/download_data.sh similarity index 100% rename from download_data.sh rename to scripts/download_data.sh diff --git a/scripts/evaluation.sh b/scripts/evaluation.sh new file mode 100755 index 0000000..094ac68 --- /dev/null +++ b/scripts/evaluation.sh @@ -0,0 +1 @@ +LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4 python -m soundscape_generation.evaluation --weights $WEIGHTS_PATH diff --git a/scripts/install.sh b/scripts/install.sh new file mode 100755 index 0000000..2f828b0 --- /dev/null +++ b/scripts/install.sh @@ -0,0 +1,10 @@ +apt-get update + +apt-get -y install git +apt-get -y install libav-tools +apt-get -y install libsndfile1-dev +apt-get -y install libsndfile1 +apt-get -y 
install libcupti-dev + +pip install --upgrade pip +pip install -r requirements.txt diff --git a/scripts/predict_object_detection.sh b/scripts/predict_object_detection.sh new file mode 100755 index 0000000..d62dbd6 --- /dev/null +++ b/scripts/predict_object_detection.sh @@ -0,0 +1 @@ +LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4 python -m soundscape_generation.predict --weights $WEIGHTS_PATH diff --git a/scripts/sound_generation.sh b/scripts/sound_generation.sh new file mode 100755 index 0000000..e73960f --- /dev/null +++ b/scripts/sound_generation.sh @@ -0,0 +1 @@ +LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4 python -m soundscape_generation.sound_generation --weights $WEIGHTS_PATH diff --git a/scripts/train_object_detection.sh b/scripts/train_object_detection.sh new file mode 100755 index 0000000..ebeb702 --- /dev/null +++ b/scripts/train_object_detection.sh @@ -0,0 +1 @@ +LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4 python -m soundscape_generation.train --num_epochs $EPOCHS --batch_size $BATCH_SIZE --model_to_load $MODEL_TO_LOAD diff --git a/setup.py b/setup.py index c3ce3f8..b300934 100644 --- a/setup.py +++ b/setup.py @@ -29,8 +29,8 @@ def parse_requirements(filename): README_MD = open(join(dirname(abspath(__file__)), "README.md")).read() -setup(name='soundscape-generation', - version=get_version('soundscape-generation/__init__.py'), +setup(name='soundscape_generation', + version=get_version('soundscape_generation/__init__.py'), author='ABIZ Lab', author_email='abiz@hslu.ch', description='Generate soundscapes based on images.', diff --git a/soundscape-generation/eval/evaluation.py b/soundscape-generation/eval/evaluation.py deleted file mode 100644 index 72a32fd..0000000 --- a/soundscape-generation/eval/evaluation.py +++ /dev/null @@ -1,47 +0,0 @@ -import tensorflow as tf - - -def compute_intersection_and_union_in_batch(y_true_labels, y_pred_labels, num_classes): - # y_true_labels: (val_batch_size, img_h, img_w) 
- # y_pred_labels: (val_batch_size, img_h, img_w) - - batch_intersection, batch_union = [], [] # for each class, store the sum of intersections and unions in the batch - - for class_label in range(num_classes - 1): # ignore class 'other' - true_equal_class = tf.cast(tf.equal(y_true_labels, class_label), tf.int32) - pred_equal_class = tf.cast(tf.equal(y_pred_labels, class_label), tf.int32) - - intersection = tf.reduce_sum(tf.multiply(true_equal_class, pred_equal_class)) # TP (true positives) - union = tf.reduce_sum(true_equal_class) + tf.reduce_sum( - pred_equal_class) - intersection # TP + FP + FN = (TP + FP) + (TP + FN) - TP - - batch_intersection.append(intersection) - batch_union.append(union) - - return tf.cast(tf.stack(batch_intersection, axis=0), tf.int64), tf.cast(tf.stack(batch_union, axis=0), - tf.int64) # (19,) - - -def evaluate(dataset, network, val_batch_size, image_size): - # Compute IoU on validation set (IoU = Intersection / Union) - - total_intersection = tf.zeros((19), tf.int64) - total_union = tf.zeros((19), tf.int64) - - print('Evaluating on validation set...') - num_val_batches = dataset.num_val_images // val_batch_size - for batch in range(num_val_batches): - x, y_true_labels = dataset.get_validation_batch(batch, val_batch_size, image_size) - - y_pred_logits = network(x, is_training=False) - y_pred_labels = tf.math.argmax(y_pred_logits, axis=-1, output_type=tf.int32) - - batch_intersection, batch_union = compute_intersection_and_union_in_batch(y_true_labels, y_pred_labels, - dataset.num_classes) - total_intersection += batch_intersection - total_union += batch_union - - iou_per_class = tf.divide(total_intersection, total_union) # IoU for each of the 19 classes - iou_mean = tf.reduce_mean(iou_per_class) # Mean IoU over the 19 classes - - return iou_per_class, iou_mean diff --git a/soundscape-generation/sound_generation.py b/soundscape-generation/sound_generation.py deleted file mode 100644 index 0791af5..0000000 --- 
a/soundscape-generation/sound_generation.py +++ /dev/null @@ -1,130 +0,0 @@ -import argparse -import numpy as np -import os -import scaper - -import predict - -""" Constansts for Scaper """ -# OUTPUT FOLDER -OUTFOLDER = "soundscapes/" -# SCAPER SETTINGS -FG_FOLDER = "soundbank/foreground" -BG_FOLDER = "soundbank/background" -REF_DB = -3 # Difference between background and foreground DB -DURATION = 30.0 - -MIN_EVENTS = 4 -MAX_EVENTS = 7 # 10 Objects with sound - 3 Background = 7 - -EVENT_TIME_DIST = 'normal' -EVENT_TIME_MEAN = 20 -EVENT_TIME_STD = 9 - -SOURCE_TIME_DIST = 'const' -SOURCE_TIME = 0.0 - -EVENT_DURATION_DIST = 'uniform' -EVENT_DURATION_MIN = 12 -EVENT_DURATION_MAX = 16 - -SNR_DIST = 'uniform' # the signal-to-noise ratio (in LUFS) compared to the background (DB Difference). -SNR_MIN = 3 -SNR_MAX = 5 - -PITCH_DIST = 'uniform' -PITCH_MIN = -0.2 -PITCH_MAX = 0.2 - -TIME_STRETCH_DIST = 'uniform' -TIME_STRETCH_MIN = 0.5 -TIME_STRETCH_MAX = 1.0 - -SEED = 123 # Generate a random seed for this Scaper object - - -class SoundGenerator: - - def __init__(self, foreground_sounds, background_sounds, image_names): - # Initialisation of Scaper and Object-Detection Container - self.sc = scaper.Scaper(DURATION, FG_FOLDER, BG_FOLDER, random_state=SEED) - self.sc.protected_labels = [] - self.sc.ref_db = REF_DB - self.detected_foreground_sounds = foreground_sounds - self.detected_background_sounds = background_sounds - self.image_names = image_names - - # Generate 2 soundscapes using a truncated normal distribution of start times - def generate_sound(self, n_soundscapes): - for i in range(len(self.image_names)): - image_name = self.image_names[i] - fg_sound = self.detected_foreground_sounds[i] - bg_sound = self.detected_background_sounds[i] - all_foreground_sounds_list = os.listdir(FG_FOLDER) - all_background_sounds_list = os.listdir(BG_FOLDER) - final_fg_sound = [x for x in fg_sound if x in all_foreground_sounds_list] - final_bg_sound = [x for x in bg_sound if x in 
all_background_sounds_list] - - for n in range(len(n_soundscapes)): - - print('Generating soundscape: {:d}/{:d}'.format(n + 1, len(n_soundscapes))) - - # reset the event specifications for foreground and background at the - # beginning of each loop to clear all previously added events - self.sc.reset_bg_event_spec() - self.sc.reset_fg_event_spec() - - # add background - self.sc.add_background(label=('choose', final_bg_sound), - source_file=('choose', []), - source_time=('normal', 20, 8)) - - # add random number of foreground events - n_events = np.random.randint(MIN_EVENTS, MAX_EVENTS + 1) - for _ in range(n_events): - self.sc.add_event(label=('choose', final_fg_sound), - source_file=('choose', []), - source_time=(SOURCE_TIME_DIST, SOURCE_TIME), - event_time=(EVENT_TIME_DIST, EVENT_TIME_MEAN, EVENT_TIME_STD), - event_duration=(EVENT_DURATION_DIST, EVENT_DURATION_MIN, EVENT_DURATION_MAX), - snr=(SNR_DIST, SNR_MIN, SNR_MAX), - pitch_shift=(None), - time_stretch=(None)) - - # generate - audiofile = os.path.join(OUTFOLDER, "{}_soundscape_number_{:d}.wav".format(image_name, n + 1)) - txtfile = os.path.join(OUTFOLDER, "{}_soundscape_number_{:d}.txt".format(image_name, n + 1)) - - self.sc.generate(audiofile, - allow_repeated_label=True, - allow_repeated_source=True, - reverb=0.1, - disable_sox_warnings=True, - no_audio=False, - txt_path=txtfile, - peak_normalization=True, - disable_instantiation_warnings=True - ) - - print("Path to output folder: {}".format(OUTFOLDER)) - - -if __name__ == "__main__": - os.chdir("extended_prototype") - - parser = argparse.ArgumentParser() - - parser.add_argument('--img_height', type=int, default=512, help='Image height after resizing') - parser.add_argument('--img_width', type=int, default=1024, help='Image width after resizing') - parser.add_argument('--weights', type=str, default="pretrained/pretrained.h5", - help='Relative path of network weights') - - args = parser.parse_args() - foreground_objects, background_objects = 
predict.main(args) - image_names = list(foreground_objects.keys()) - foreground_objects_list = list(foreground_objects.values()) - background_objects_list = list(background_objects.values()) - soundscape_generator = SoundGenerator(foreground_sounds=foreground_objects_list, - background_sounds=background_objects_list, image_names=image_names) - number_soundscapes_per_image = range(0, 3) - soundscape_generator.generate_sound(number_soundscapes_per_image) diff --git a/soundscape-generation/__init__.py b/soundscape_generation/__init__.py similarity index 61% rename from soundscape-generation/__init__.py rename to soundscape_generation/__init__.py index f2799a3..b466289 100644 --- a/soundscape-generation/__init__.py +++ b/soundscape_generation/__init__.py @@ -1,7 +1,7 @@ """ -soundscape-generation. +soundscape_generation. Generate soundscapes based on images. """ -__version__ = '0.1.0' +__version__ = '0.1.1' __author__ = 'ABIZ Lab' diff --git a/soundscape-generation/dataset/__init__.py b/soundscape_generation/dataset/__init__.py similarity index 100% rename from soundscape-generation/dataset/__init__.py rename to soundscape_generation/dataset/__init__.py diff --git a/soundscape-generation/dataset/cityscapes.py b/soundscape_generation/dataset/cityscapes.py similarity index 98% rename from soundscape-generation/dataset/cityscapes.py rename to soundscape_generation/dataset/cityscapes.py index 48a87d1..b3b8a06 100644 --- a/soundscape-generation/dataset/cityscapes.py +++ b/soundscape_generation/dataset/cityscapes.py @@ -3,11 +3,12 @@ import random import tensorflow as tf -from utils import read_image, read_segmentation +from soundscape_generation.utils.utils import read_image, read_segmentation class CityscapesDataset: def __init__(self): + self.name = 'Cityscapes' self.image_paths = sorted(glob.glob(os.path.join(os.getcwd(), 'data', 'images', 'train', '*', '*.png'))) self.segmentation_paths = sorted( glob.glob(os.path.join(os.getcwd(), 'data', 'segmentations', 'train', '*', 
'*labelIds.png'))) diff --git a/soundscape-generation/eval/__init__.py b/soundscape_generation/eval/__init__.py similarity index 100% rename from soundscape-generation/eval/__init__.py rename to soundscape_generation/eval/__init__.py diff --git a/soundscape-generation/soundscapes_evaluation.py b/soundscape_generation/eval/evaluation.py similarity index 70% rename from soundscape-generation/soundscapes_evaluation.py rename to soundscape_generation/eval/evaluation.py index ac08a3a..ab469d1 100644 --- a/soundscape-generation/soundscapes_evaluation.py +++ b/soundscape_generation/eval/evaluation.py @@ -1,66 +1,53 @@ -import argparse -import glob import numpy as np -import os import tensorflow as tf -import time -from ERFNet import ERFNet -from cityscapes import CityscapesDataset -from utils import read_image -from evaluation import evaluate +from soundscape_generation.utils.utils import read_image -os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' +def compute_intersection_and_union_in_batch(y_true_labels, y_pred_labels, num_classes): + # y_true_labels: (val_batch_size, img_h, img_w) + # y_pred_labels: (val_batch_size, img_h, img_w) -def main(args): - img_h, img_w = args.img_height, args.img_width - val_batch_size = args.val_batch_size - is_validation_set = args.is_validation_set + batch_intersection, batch_union = [], [] # for each class, store the sum of intersections and unions in the batch - if (is_validation_set): - image_paths = sorted(glob.glob(os.path.join(os.getcwd(), 'test_images', '*.png'))) - else: - image_paths = sorted(glob.glob(os.path.join(os.getcwd(), 'test_images', '*.jpg'))) - - own_test_set_true = [ - [1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0], - [1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0], - [1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1], - [1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1], 
- [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1], - [1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0], - [1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1], - ] - - dataset = CityscapesDataset() - - print('Creating network and loading weights...') - network = ERFNet(dataset.num_classes) - - # Initialize network weights - inp_test = tf.random.normal(shape=(1, img_h, img_w, 3)) - out_test = network(inp_test, is_training=False) - print('Shape of network\'s output:', out_test.shape) - - # Load weights and images from given paths - weights_path = os.path.join(os.getcwd(), args.weights) - network.load_weights(weights_path) - print('Weights from {} loaded correctly.'.format(weights_path)) - # iou_per_class, iou_mean = evaluate(dataset, network, val_batch_size, (img_h, img_w)) - # print("iou_per_class: {}, iou_mean: {}".format(iou_per_class, iou_mean)) - get_total_percision(dataset, network, val_batch_size, (img_h, img_w), is_validation_set, own_test_set_true, - image_paths) - get_total_recall(dataset, network, val_batch_size, (img_h, img_w), is_validation_set, own_test_set_true, - image_paths) + for class_label in range(num_classes - 1): # ignore class 'other' + true_equal_class = tf.cast(tf.equal(y_true_labels, class_label), tf.int32) + pred_equal_class = tf.cast(tf.equal(y_pred_labels, class_label), tf.int32) + + intersection = tf.reduce_sum(tf.multiply(true_equal_class, pred_equal_class)) # TP (true positives) + union = tf.reduce_sum(true_equal_class) + tf.reduce_sum( + pred_equal_class) - intersection # TP + FP + FN = (TP + FP) + (TP + FN) - TP + + 
batch_intersection.append(intersection) + batch_union.append(union) + + return tf.cast(tf.stack(batch_intersection, axis=0), tf.int64), tf.cast(tf.stack(batch_union, axis=0), + tf.int64) # (19,) + + +def evaluate(dataset, network, val_batch_size, image_size): + # Compute IoU on validation set (IoU = Intersection / Union) + + total_intersection = tf.zeros((19), tf.int64) + total_union = tf.zeros((19), tf.int64) + + print('Evaluating on validation set...') + num_val_batches = dataset.num_val_images // val_batch_size + for batch in range(num_val_batches): + x, y_true_labels = dataset.get_validation_batch(batch, val_batch_size, image_size) + + y_pred_logits = network(x, is_training=False) + y_pred_labels = tf.math.argmax(y_pred_logits, axis=-1, output_type=tf.int32) + + batch_intersection, batch_union = compute_intersection_and_union_in_batch(y_true_labels, y_pred_labels, + dataset.num_classes) + total_intersection += batch_intersection + total_union += batch_union + + iou_per_class = tf.divide(total_intersection, total_union) # IoU for each of the 19 classes + iou_mean = tf.reduce_mean(iou_per_class) # Mean IoU over the 19 classes + + return iou_per_class, iou_mean def get_total_percision(dataset, network, val_batch_size, image_size, is_validation_set, own_test_set_true, @@ -85,9 +72,9 @@ def get_total_percision(dataset, network, val_batch_size, image_size, is_validat print('Total Precistion on validation set is {}'.format(total_set_precision)) return total_set_precision else: - print() test_set_true_counter = 0 for image_path in image_paths: + print('-'*20 + image_path + '-'*20) image = read_image(image_path, image_size) x = tf.expand_dims(image, axis=0) y_pred_logits = network(x, is_training=False) # (1, img_h, img_w, num_classes) @@ -98,9 +85,9 @@ def get_total_percision(dataset, network, val_batch_size, image_size, is_validat total_tp_and_fp += tp_and_fp_batch batchprecision = tf.divide(tp_batch, tp_and_fp_batch) test_set_true_counter += 1 - print('Precistion from 
image {}: {}.'.format(image_path, batchprecision)) - print() + print('Precistion: {}'.format(batchprecision)) total_set_precision = tf.divide(total_tp, total_tp_and_fp) + print('-'*20 + 'TOTAL PRECISION' + '-'*20) print('Total Precistion on own test set is {}'.format(total_set_precision)) return total_set_precision @@ -176,9 +163,9 @@ def get_total_recall(dataset, network, val_batch_size, image_size, is_validation print('Total Recall on validation set is {}'.format(total_set_recall)) return total_set_recall else: - print() test_set_true_counter = 0 for image_path in image_paths: + print('-'*20 + image_path + '-'*20) image = read_image(image_path, image_size) x = tf.expand_dims(image, axis=0) y_pred_logits = network(x, is_training=False) # (1, img_h, img_w, num_classes) @@ -189,9 +176,9 @@ def get_total_recall(dataset, network, val_batch_size, image_size, is_validation total_tp_and_fn += tp_and_fn_batch batchrecall = tf.divide(tp_batch, tp_and_fn_batch) test_set_true_counter += 1 - print('Recall from image {}: {}.'.format(image_path, batchrecall)) - print() + print('Recall: {}'.format(batchrecall)) total_set_recall = tf.divide(total_tp, total_tp_and_fn) + print('-'*20 + 'TOTAL RECALL' + '-'*20) print('Total Recall on own test set is {}'.format(total_set_recall)) return total_set_recall @@ -244,18 +231,3 @@ def get_recall_in_batch(y_true, y_pred, num_classes, is_validation_set): tp_batch.append(tp) tp_and_fn_batch.append(tp + fn) return tp_batch, tp_and_fn_batch - - -if __name__ == '__main__': - os.chdir("extended_prototype") - - parser = argparse.ArgumentParser() - parser.add_argument('--is_validation_set', type=bool, default=False, help='Evaluation on validation or own dataset') - parser.add_argument('--img_height', type=int, default=512, help='Image height after resizing') - parser.add_argument('--img_width', type=int, default=1024, help='Image width after resizing') - parser.add_argument('--val_batch_size', type=int, default=1, help='Batch size for validation') - 
parser.add_argument('--weights', type=str, default="pretrained/pretrained.h5",
-                        help='Relative path of network weights')
-    args = parser.parse_args()
-
-    main(args)
diff --git a/soundscape_generation/evaluation.py b/soundscape_generation/evaluation.py
new file mode 100644
index 0000000..b3e6ff4
--- /dev/null
+++ b/soundscape_generation/evaluation.py
@@ -0,0 +1,119 @@
+import argparse
+import glob
+import os
+import tensorflow as tf
+from soundscape_generation.models.ERFNet import ERFNet
+from soundscape_generation.dataset.cityscapes import CityscapesDataset
+from soundscape_generation.eval.evaluation import evaluate, get_total_recall, get_total_percision
+
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
+
+DEFAULT_MODEL = "experiments/Cityscapes/ERFNet-Pretrained/pretrained.h5"
+
+
+def _str2bool(value):
+    """Parse a command-line boolean for argparse.
+
+    ``type=bool`` would turn every non-empty string (including "False") into True.
+    """
+    text = str(value).strip().lower()
+    if text in ('true', 't', 'yes', 'y', '1'):
+        return True
+    if text in ('false', 'f', 'no', 'n', '0'):
+        return False
+    raise argparse.ArgumentTypeError('Expected a boolean value, got {!r}.'.format(value))
+
+
+def _resolve_weights(weights):
+    """Return the relative weights path, falling back to DEFAULT_MODEL when none is given.
+
+    The literal string "None" (the default of `--weights`) counts as "not given".
+    """
+    if weights is None or weights.strip() in ('', 'None'):
+        return DEFAULT_MODEL
+    return weights
+
+
+def _list_test_images(folder, file_type):
+    """Return the sorted image paths of the given type in `folder`, without `*_pred` files."""
+    pattern = os.path.join(os.getcwd(), folder, '*.{}'.format(file_type))
+    # A glob class such as `[!_pred]` only tests ONE character, so it would also drop
+    # e.g. `road.jpg`; filter on the real `_pred` suffix instead.
+    return sorted(path for path in glob.glob(pattern)
+                  if not os.path.splitext(os.path.basename(path))[0].endswith('_pred'))
+
+
+def main(args):
+    """Evaluate the segmentation network and print IoU, precision and recall.
+
+    Args:
+        args: Parsed command-line arguments providing `img_height`, `img_width`,
+            `val_batch_size`, `is_validation_set`, `weights`, `test_images` and
+            `test_images_type`.
+    """
+    img_h, img_w = args.img_height, args.img_width
+    val_batch_size = args.val_batch_size
+    is_validation_set = args.is_validation_set
+
+    image_paths = _list_test_images(args.test_images, args.test_images_type)
+
+    # Ground-truth rows for the own test set (15 rows of 19 flags).
+    # NOTE(review): presumably one row per image in sorted path order and one flag per
+    # class - confirm against get_total_percision / get_total_recall.
+    own_test_set_true = [
+        [1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0],
+        [1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0],
+        [1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1],
+        [1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1],
+        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1],
+        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1],
+        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1],
+        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1],
+        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0],
+        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1],
+        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1],
+        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1],
+        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1],
+        [1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0],
+        [1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1],
+    ]
+
+    dataset = CityscapesDataset()
+
+    print('Creating network and loading weights...')
+    network = ERFNet(dataset.num_classes)
+
+    # Initialize network weights
+    inp_test = tf.random.normal(shape=(1, img_h, img_w, 3))
+    out_test = network(inp_test, is_training=False)
+    print('Shape of network\'s output:', out_test.shape)
+
+    # Load weights from the given path (never eval() a command-line value)
+    args.weights = _resolve_weights(args.weights)
+    weights_path = os.path.join(os.getcwd(), args.weights)
+    network.load_weights(weights_path)
+    print('Weights from {} loaded correctly.'.format(weights_path))
+
+    print('*'*20 + 'IOU' + '*'*20)
+    iou_per_class, iou_mean = evaluate(dataset, network, val_batch_size, (img_h, img_w))
+    print("iou_per_class: {}\niou_mean: {}".format(iou_per_class, iou_mean))
+
+    print('*'*20 + 'PRECISION' + '*'*20)
+    get_total_percision(dataset, network, val_batch_size, (img_h, img_w), is_validation_set, own_test_set_true, image_paths)
+
+    print('*'*20 + 'RECALL' + '*'*20)
+    get_total_recall(dataset, network, val_batch_size, (img_h, img_w), is_validation_set, own_test_set_true, image_paths)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--is_validation_set', type=_str2bool, default=False, help='Evaluation on validation or own dataset')
+    parser.add_argument('--img_height', type=int, default=512, help='Image height after resizing')
+    parser.add_argument('--img_width', type=int, default=1024, help='Image width after resizing')
+    parser.add_argument('--val_batch_size', type=int, default=1, help='Batch size for validation')
+    parser.add_argument('--weights', type=str, default="None", help='Relative path of network weights')
+    parser.add_argument('--test_images', type=str, default="data/test_images/", help='Relative path of the test images')
+    parser.add_argument('--test_images_type', type=str, default="jpg", help='Test image types')
+    args = parser.parse_args()
+
+    main(args)
diff --git
a/soundscape_generation/generation/sound_generation.py b/soundscape_generation/generation/sound_generation.py
new file mode 100644
index 0000000..138c59e
--- /dev/null
+++ b/soundscape_generation/generation/sound_generation.py
@@ -0,0 +1,133 @@
+import numpy as np
+import os
+import scaper
+
+from soundscape_generation.utils.utils import create_folder
+
+
+class SoundGenerator:
+    """Generate soundscapes with Scaper from the sounds detected in each image.
+
+    The class attributes below are the constants used to configure Scaper.
+    """
+    # OUTPUT FOLDER
+    OUTFOLDER = "data/generated_soundscapes/"
+
+    # SCAPER SETTINGS
+    FG_FOLDER = "data/soundbank/foreground"
+    BG_FOLDER = "data/soundbank/background"
+    REF_DB = -3  # Difference between background and foreground DB
+    DURATION = 30.0
+
+    MIN_EVENTS = 4
+    MAX_EVENTS = 7  # 10 Objects with sound - 3 Background = 7
+
+    EVENT_TIME_DIST = 'normal'
+    EVENT_TIME_MEAN = 20
+    EVENT_TIME_STD = 9
+
+    SOURCE_TIME_DIST = 'const'
+    SOURCE_TIME = 0.0
+
+    EVENT_DURATION_DIST = 'uniform'
+    EVENT_DURATION_MIN = 12
+    EVENT_DURATION_MAX = 16
+
+    SNR_DIST = 'uniform'  # the signal-to-noise ratio (in LUFS) compared to the background (DB Difference).
+    SNR_MIN = 3
+    SNR_MAX = 5
+
+    PITCH_DIST = 'uniform'
+    PITCH_MIN = -0.2
+    PITCH_MAX = 0.2
+
+    TIME_STRETCH_DIST = 'uniform'
+    TIME_STRETCH_MIN = 0.5
+    TIME_STRETCH_MAX = 1.0
+
+    SEED = 123  # Random state passed to the Scaper object
+
+    def __init__(self, foreground_sounds, background_sounds, image_names):
+        """Create the Scaper object, store the detections and create the output folder.
+
+        Args:
+            foreground_sounds: Per-image collections of detected foreground sound labels.
+            background_sounds: Per-image collections of detected background sound labels.
+            image_names: Image names, index-aligned with both sound collections.
+        """
+        # Initialisation of Scaper and Object-Detection Container
+        self.sc = scaper.Scaper(self.DURATION, self.FG_FOLDER, self.BG_FOLDER, random_state=self.SEED)
+        self.sc.protected_labels = []
+        self.sc.ref_db = self.REF_DB
+        self.detected_foreground_sounds = foreground_sounds
+        self.detected_background_sounds = background_sounds
+        self.image_names = image_names
+
+        # create the output folder
+        create_folder(os.path.join(os.getcwd(), self.OUTFOLDER))
+
+    def generate_sound(self, n_soundscapes):
+        """Generate the soundscapes (audio and annotation text file) of every image.
+
+        Args:
+            n_soundscapes: Number of soundscapes per image, given either as an int or
+                as a sized collection (e.g. ``range(0, 3)``) whose length is used.
+        """
+        print('*'*20 + 'START GENERATION' + '*'*20)
+        if not self.image_names:
+            return
+
+        n_per_image = n_soundscapes if isinstance(n_soundscapes, int) else len(n_soundscapes)
+
+        # The soundbank is not modified by this method, so it is listed only once.
+        all_foreground_sounds = set(os.listdir(self.FG_FOLDER))
+        all_background_sounds = set(os.listdir(self.BG_FOLDER))
+
+        for i, image_name in enumerate(self.image_names):
+            # splitext only drops the extension; split('.')[0] cut names at their first dot
+            image_name = os.path.splitext(image_name)[0].strip()
+            print('-'*20 + 'GENERATION: {}'.format(image_name) + '-'*20)
+
+            # keep only the detected sounds that are available in the soundbank
+            final_fg_sound = [x for x in self.detected_foreground_sounds[i] if x in all_foreground_sounds]
+            final_bg_sound = [x for x in self.detected_background_sounds[i] if x in all_background_sounds]
+
+            for n in range(n_per_image):
+                # reset the event specifications for foreground and background at the
+                # beginning of each loop to clear all previously added events
+                self.sc.reset_bg_event_spec()
+                self.sc.reset_fg_event_spec()
+
+                # add background
+                self.sc.add_background(label=('choose', final_bg_sound),
+                                       source_file=('choose', []),
+                                       source_time=('normal', 20, 8))
+
+                # add random number of foreground events
+                n_events = np.random.randint(self.MIN_EVENTS, self.MAX_EVENTS + 1)
+                for _ in range(n_events):
+                    self.sc.add_event(label=('choose', final_fg_sound),
+                                      source_file=('choose', []),
+                                      source_time=(self.SOURCE_TIME_DIST, self.SOURCE_TIME),
+                                      event_time=(self.EVENT_TIME_DIST, self.EVENT_TIME_MEAN, self.EVENT_TIME_STD),
+                                      event_duration=(self.EVENT_DURATION_DIST, self.EVENT_DURATION_MIN, self.EVENT_DURATION_MAX),
+                                      snr=(self.SNR_DIST, self.SNR_MIN, self.SNR_MAX),
+                                      pitch_shift=(None),
+                                      time_stretch=(None))
+
+                # define the output files
+                audiofile = os.path.join(self.OUTFOLDER, "{}_soundscape_number_{:d}.wav".format(image_name, n + 1))
+                txtfile = os.path.join(self.OUTFOLDER, "{}_soundscape_number_{:d}.txt".format(image_name, n + 1))
+
+                # generate the soundscape
+                self.sc.generate(audiofile,
+                                 allow_repeated_label=True,
+                                 allow_repeated_source=True,
+                                 reverb=0.1,
+                                 disable_sox_warnings=True,
+                                 no_audio=False,
+                                 txt_path=txtfile,
+                                 peak_normalization=True,
+                                 disable_instantiation_warnings=True)
+
+                print('Generated soundscape: {}'.format(audiofile))
diff --git a/soundscape-generation/loss/__init__.py b/soundscape_generation/loss/__init__.py
similarity index 100%
rename from soundscape-generation/loss/__init__.py
rename to soundscape_generation/loss/__init__.py
diff --git a/soundscape-generation/loss/losses.py b/soundscape_generation/loss/losses.py
similarity index 100%
rename from soundscape-generation/loss/losses.py
rename to soundscape_generation/loss/losses.py
diff --git a/soundscape-generation/models/ERFNet.py b/soundscape_generation/models/ERFNet.py
similarity index 99%
rename from soundscape-generation/models/ERFNet.py
rename to soundscape_generation/models/ERFNet.py
index d6c0320..fb3287c 100644
--- a/soundscape-generation/models/ERFNet.py
+++ b/soundscape_generation/models/ERFNet.py
@@ -114,6 +114,7 @@ def call(self, inp, is_training=True):
 class ERFNet(tf.keras.Model):
     def __init__(self, num_classes):
         super(ERFNet, self).__init__()
+        self.model_name = 'ERFNet'
         self.encoder = Encoder()
         self.decoder = Decoder(num_classes)
 
diff --git
a/soundscape-generation/models/__init__.py b/soundscape_generation/models/__init__.py similarity index 100% rename from soundscape-generation/models/__init__.py rename to soundscape_generation/models/__init__.py diff --git a/soundscape-generation/predict.py b/soundscape_generation/predict.py similarity index 77% rename from soundscape-generation/predict.py rename to soundscape_generation/predict.py index 7274cd5..578666d 100644 --- a/soundscape-generation/predict.py +++ b/soundscape_generation/predict.py @@ -5,25 +5,22 @@ import os import tensorflow as tf import time -from ERFNet import ERFNet from PIL import Image -from cityscapes import CityscapesDataset from operator import itemgetter -from utils import read_image +from soundscape_generation.models.ERFNet import ERFNet +from soundscape_generation.dataset.cityscapes import CityscapesDataset +from soundscape_generation.utils.utils import read_image os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' foreground_sounds_dict, background_sounds_dict = {}, {} +DEFAULT_MODEL = "experiments/Cityscapes/ERFNet-Pretrained/pretrained.h5" def main(args): img_h_orig, img_w_orig = 1024, 2048 # original size of images in Cityscapes dataset img_h, img_w = args.img_height, args.img_width - if not os.path.exists('extended_prototype/test_segmentations'): - os.makedirs('extended_prototype/test_segmentations') - print('test_segmentations directory created.') - dataset = CityscapesDataset() print('Creating network and loading weights...') @@ -35,14 +32,18 @@ def main(args): print('Shape of network\'s output:', out_test.shape) # Load weights and images from given paths + if eval(args.weights) is None: + args.weights = DEFAULT_MODEL weights_path = os.path.join(os.getcwd(), args.weights) - image_paths = sorted(glob.glob(os.path.join(os.getcwd(), 'test_images', '*.png'))) # Specify Image file type + image_paths = sorted(glob.glob(os.path.join(os.getcwd(), args.test_images, '*[!_pred].{}'.format(args.test_images_type)))) # Specify Image file type 
network.load_weights(weights_path) print('Weights from {} loaded correctly.'.format(weights_path)) + print('*'*20 + 'START PREDICTION' + '*'*20) inference_times = [] for image_path in image_paths: + print('-'*20 + image_path + '-'*20) t0 = time.time() image = read_image(image_path, (img_h, img_w)) @@ -57,27 +58,24 @@ def main(args): t1 = time.time() # Save segmentation - save_path = image_path.replace('test_images', 'test_segmentations') + save_path = image_path.replace('.{}'.format(args.test_images_type), '_pred.{}'.format(args.test_images_type)) segmentation = Image.fromarray(y_pred_colors.numpy()) segmentation.save(save_path) - print() - print('Segmentation of image\n {}\nsaved in\n {}.'.format(image_path, save_path)) inference_times.append(t1 - t0) # Print detected_objects - print() _, tail = os.path.split(image_path) foreground_sounds, background_sounds = get_sounds(image_path, save_path, y_pred_labels, dataset) foreground_sounds_dict[tail] = foreground_sounds background_sounds_dict[tail] = background_sounds mean_inference_time = sum(inference_times) / len(inference_times) - print('\nAverage inference time: {:.3f} s'.format(mean_inference_time)) + print('-'*20 + 'PREDICTION STATS' + '-'*20) + print('Average inference time: {:.3f} s'.format(mean_inference_time)) return foreground_sounds_dict, background_sounds_dict def get_sounds(image_path, save_path, y_pred_labels, dataset): - print('Prediction of image\n{}'.format(image_path, save_path)) unique, counts = np.unique(y_pred_labels.numpy(), return_counts=True) detected_objects_dict = dict(zip(unique, counts)) topitems = heapq.nlargest(1, detected_objects_dict.items(), key=itemgetter(1)) @@ -101,13 +99,13 @@ def get_sounds(image_path, save_path, y_pred_labels, dataset): if __name__ == '__main__': - os.chdir("extended_prototype") parser = argparse.ArgumentParser() parser.add_argument('--img_height', type=int, default=512, help='Image height after resizing') parser.add_argument('--img_width', type=int, 
default=1024, help='Image width after resizing') - parser.add_argument('--weights', type=str, default="pretrained/pretrained.h5", - help='Relative path of network weights') + parser.add_argument('--weights', type=str, default="None", help='Relative path of network weights') + parser.add_argument('--test_images', type=str, default="data/test_images/", help='Relative path of the test images') + parser.add_argument('--test_images_type', type=str, default="jpg", help='Test image types') args = parser.parse_args() main(args) diff --git a/soundscape_generation/sound_generation.py b/soundscape_generation/sound_generation.py new file mode 100644 index 0000000..5257f78 --- /dev/null +++ b/soundscape_generation/sound_generation.py @@ -0,0 +1,31 @@ +import argparse +import warnings + +import soundscape_generation.predict as predict +from soundscape_generation.generation.sound_generation import SoundGenerator + +warnings.filterwarnings(action='ignore') + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--img_height', type=int, default=512, help='Image height after resizing') + parser.add_argument('--img_width', type=int, default=1024, help='Image width after resizing') + parser.add_argument('--weights', type=str, default="None", help='Relative path of network weights') + parser.add_argument('--test_images', type=str, default="data/test_images/", help='Relative path of the test images') + parser.add_argument('--test_images_type', type=str, default="jpg", help='Test image types') + args = parser.parse_args() + + # predict the test images to get the objects in fore- and background + foreground_objects, background_objects = predict.main(args) + image_names = list(foreground_objects.keys()) + foreground_objects_list = list(foreground_objects.values()) + background_objects_list = list(background_objects.values()) + + # instantiate the sound generator + soundscape_generator = SoundGenerator(foreground_sounds=foreground_objects_list, + 
background_sounds=background_objects_list, + image_names=image_names) + # generate for each image 3 different soundscapes + number_soundscapes_per_image = range(0, 3) + soundscape_generator.generate_sound(number_soundscapes_per_image) diff --git a/soundscape-generation/train.py b/soundscape_generation/train.py similarity index 64% rename from soundscape-generation/train.py rename to soundscape_generation/train.py index a20100b..fcf34d8 100644 --- a/soundscape-generation/train.py +++ b/soundscape_generation/train.py @@ -1,33 +1,36 @@ import argparse -import math import numpy as np import os -import random import tensorflow as tf import tensorflow_addons as tfa import time -from ERFNet import ERFNet -from cityscapes import CityscapesDataset -from evaluation import evaluate -from losses import weighted_cross_entropy_loss +from soundscape_generation.models.ERFNet import ERFNet +from soundscape_generation.dataset.cityscapes import CityscapesDataset +from soundscape_generation.eval.evaluation import evaluate +from soundscape_generation.loss.losses import weighted_cross_entropy_loss +from soundscape_generation.utils.utils import create_folder_for_experiment, set_gpu_experimental_growth os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' def main(args): - if not os.path.exists('saved_weights'): - os.makedirs('saved_weights') - print('saved_weights directory created.') + set_gpu_experimental_growth() num_epochs = args.num_epochs batch_size = args.batch_size val_batch_size = args.val_batch_size img_h, img_w = args.img_height, args.img_width - dataset = CityscapesDataset() # create dataset - network = ERFNet(dataset.num_classes) # create network + # create dataset and model + dataset = CityscapesDataset() + network = ERFNet(dataset.num_classes) - # Initialize weights of the network + if eval(args.model_to_load) is None: + experiment_path = create_folder_for_experiment(network.model_name, dataset.name) + else: + experiment_path = os.path.join(os.getcwd(), args.model_to_load) + + # 
initialize weights of the network inp_test = tf.random.normal(shape=(batch_size, img_h, img_w, 3)) out_test = network(inp_test, is_training=False) print('Network created. Output shape: {}.'.format(out_test.shape)) @@ -37,14 +40,15 @@ def main(args): total_update_steps = num_epochs * num_batches_per_epoch print('Total update steps:', total_update_steps) - # Define optimizer (with learning rate schedule) + # define optimizer (with learning rate schedule) lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay(initial_learning_rate=8e-4, - decay_steps=total_update_steps, end_learning_rate=0.0, + decay_steps=total_update_steps, + end_learning_rate=0.0, power=0.9) opt = tfa.optimizers.AdamW(learning_rate=lr_schedule, weight_decay=1e-4) - # Manage checkpoints (keep track of network weights, optimizer state, and last epoch id) - ckpt_path = os.path.join(os.getcwd(), 'checkpoints') + # manage checkpoints (keep track of network weights, optimizer state, and last epoch id) + ckpt_path = os.path.join(experiment_path, 'checkpoints') ckpt = tf.train.Checkpoint(network=network, opt=opt, last_saved_epoch=tf.Variable(-1)) ckpt_manager = tf.train.CheckpointManager(ckpt, ckpt_path, max_to_keep=3) @@ -57,54 +61,36 @@ def main(args): initial_epoch = 0 print('No checkpoint restored. 
Training from scratch.') - @tf.function - def train_step(x, y_true_labels): - print('Tracing training step...') - - # Forward pass - with tf.GradientTape() as tape: - y_pred_logits = network(x) # (batch_size, img_h, img_w, num_classes) - loss = weighted_cross_entropy_loss(y_true_labels, y_pred_logits, dataset.class_weights) - - # Backward pass - grads = tape.gradient(loss, network.trainable_variables) - opt.apply_gradients(zip(grads, network.trainable_variables)) - - return loss - - # Training + # training start = time.time() for epoch in range(initial_epoch, num_epochs): dataset.shuffle_training_paths() for batch in range(num_batches_per_epoch): x, y_true_labels = dataset.get_training_batch(batch, batch_size, (img_h, img_w)) - loss = train_step(x, y_true_labels) + loss = train_step(network, dataset, opt, x, y_true_labels) - # Print information about the current batch + # print information about the current batch if (batch + 1) % args.print_every == 0: current_step = int(opt.iterations) current_lr = opt.learning_rate(current_step) elapsed_time = time.time() - start print('[Epoch {}/{}. Batch {}/{}]'.format(epoch + 1, num_epochs, batch + 1, num_batches_per_epoch), end=' ') - print( - 'Training batch loss: {:.2f}. Elapsed time: {:.1f} s. Schedule: (step {}, lr {:.1e}).'.format(loss, - elapsed_time, - current_step, - current_lr)) + print('Training batch loss: {:.2f}. Elapsed time: {:.1f} s. 
Schedule: (step {}, lr {:.1e}).'.format( + loss, elapsed_time, current_step, current_lr)) - # Save training checkpoint after each epoch + # save training checkpoint after each epoch ckpt.last_saved_epoch.assign_add(1) ckpt_save_path = ckpt_manager.save() print('Checkpoint saved at {}.'.format(ckpt_save_path)) - # Save current weights as a h5 file + # save current weights as a h5 file if (epoch + 1) % args.save_weights_every == 0: - h5_save_path = os.path.join(os.getcwd(), 'saved_weights', 'weights-{}.h5'.format(epoch + 1)) + h5_save_path = os.path.join(experiment_path, 'saved_weights', 'weights-{}.h5'.format(epoch + 1)) network.save_weights(h5_save_path) print('Weights saved at {}.'.format(h5_save_path)) - # Compute the IoU score on validation set + # compute the IoU score on validation set if (epoch + 1) % args.evaluate_every == 0: iou_per_class, iou_mean = evaluate(dataset, network, val_batch_size, (img_h, img_w)) print('Mean IoU: {:.4f}'.format(iou_mean)) @@ -112,6 +98,22 @@ def train_step(x, y_true_labels): print('IoU per class: {}'.format(iou_per_class.numpy())) +@tf.function +def train_step(model, dataset, opt, x, y_true_labels): + print('Tracing training step...') + + # forward pass + with tf.GradientTape() as tape: + y_pred_logits = model(x) # (batch_size, img_h, img_w, num_classes) + loss = weighted_cross_entropy_loss(y_true_labels, y_pred_logits, dataset.class_weights) + + # backward pass + grads = tape.gradient(loss, model.trainable_variables) + opt.apply_gradients(zip(grads, model.trainable_variables)) + + return loss + + if __name__ == '__main__': parser = argparse.ArgumentParser() @@ -124,6 +126,7 @@ def train_step(x, y_true_labels): parser.add_argument('--print_every', type=int, default=5, help='Number of batches between batch logs') parser.add_argument('--evaluate_every', type=int, default=1, help='Number of epochs between evaluations') parser.add_argument('--save_weights_every', type=int, default=1, help='Number of epochs between saves') + 
parser.add_argument('--model_to_load', type=str, default="None", help='Path of the pre-trained model that should be loaded')
 
     args = parser.parse_args()
     main(args)
diff --git a/soundscape-generation/utils/__init__.py b/soundscape_generation/utils/__init__.py
similarity index 100%
rename from soundscape-generation/utils/__init__.py
rename to soundscape_generation/utils/__init__.py
diff --git a/soundscape-generation/utils/utils.py b/soundscape_generation/utils/utils.py
similarity index 55%
rename from soundscape-generation/utils/utils.py
rename to soundscape_generation/utils/utils.py
index ad57dc7..77ad5a0 100644
--- a/soundscape-generation/utils/utils.py
+++ b/soundscape_generation/utils/utils.py
@@ -1,4 +1,57 @@
+import os
 import tensorflow as tf
+from datetime import datetime
+
+
+def create_folder(path):
+    """
+    Creates a folder at a given path, including any missing parent folders.
+    :param path: the folder path to create.
+    :return: the path of the folder.
+    """
+    # makedirs (unlike mkdir) also creates missing parents, e.g. "data" for
+    # "data/generated_soundscapes/", and exist_ok tolerates an existing folder
+    os.makedirs(path, exist_ok=True)
+
+    return path
+
+
+def create_folder_for_experiment(model_name, dataset_name):
+    """
+    Creates the folder experiments/<dataset_name>/<model_name>-<timestamp> in the current working directory.
+    :param model_name: the name of the model, used as prefix of the experiment folder.
+    :param dataset_name: the name of the dataset, used as parent folder.
+    :return: the path of the created experiment folder.
+    """
+    # create experiment folder for current dataset
+    experiment_path = create_folder(os.path.join(os.getcwd(), 'experiments'))
+    experiment_path = create_folder(os.path.join(experiment_path, dataset_name))
+
+    # create current experiment folder
+    current_time = datetime.now().strftime("%Y%m%d-%H%M%S")
+    experiment_folder_name = "{0}-{1}".format(model_name, current_time)
+    experiment_path = create_folder(os.path.join(experiment_path, experiment_folder_name))
+    print('Created experiment path at: {}'.format(experiment_path))
+
+    return experiment_path
+
+
+def set_gpu_experimental_growth():
+    """
+    Enables memory growth on every visible GPU and prints the number of physical and logical GPUs.
+    A RuntimeError raised by TensorFlow is printed instead of propagated.
+    """
+    gpus = tf.config.experimental.list_physical_devices('GPU')
+    if gpus:
+        try:
+            # Currently, memory growth needs to be the same across GPUs
+            for gpu in gpus:
+                tf.config.experimental.set_memory_growth(gpu, True)
+            logical_gpus = tf.config.experimental.list_logical_devices('GPU')
+            print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
+        except RuntimeError as e:
+            # Memory growth must be set before GPUs have been initialized
+            print(e)
 
 
 def normalize_image(img):  # map pixel intensities to float32 in [-1, 1]