diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..d811ce4
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,9 @@
+data
+data/*
+data/*/*
+tmp
+tmp/*
+profiler
+profiler/*
+profiler_env/*
+core
diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..555662c
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,4 @@
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.jpg filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.pdf filter=lfs diff=lfs merge=lfs -text
diff --git a/.gitignore b/.gitignore
index c4ff7bf..cf2eb87 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,3 @@
-
 # Created by https://www.toptal.com/developers/gitignore/api/vim,python,pycharm+all,jupyternotebooks
 # Edit at https://www.toptal.com/developers/gitignore?templates=vim,python,pycharm+all,jupyternotebooks
 
@@ -277,3 +276,5 @@ extended_prototype/saved_weights/*
 extended_prototype/soundbank/*
 extended_prototype/soundscapes/*
 extended_prototype/test_segmentations/*
+saved_weights/*
+experiments/*
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..0ad422e
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,21 @@
+FROM tensorflow/tensorflow:2.4.0-gpu
+
+# Refresh the package index and install every system library in ONE layer, so a
+# cached `apt-get update` layer can never be stale relative to the installs.
+RUN apt-get update && apt-get install -y \
+        apt-transport-https libtcmalloc-minimal4 sox \
+        libsndfile1 libsm6 libxext6 libxrender-dev
+
+RUN pip install --upgrade pip
+
+WORKDIR /tf
+
+RUN mkdir /assets
+
+# Copy only the requirements first so this dependency layer is reused when sources change.
+COPY requirements.txt /assets/requirements.txt
+RUN pip install -r /assets/requirements.txt --upgrade --no-cache-dir
+
+COPY . /tf/
+
+RUN ./scripts/install.sh
diff --git a/README.md b/README.md
index 9066f3a..d39f9dc 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,7 @@
 # Soundscape Generation
 
+Generate soundscapes from images.
+
 ## Table of Contents
 
 1. [Installation](#installation)
@@ -17,7 +19,7 @@ Follow the instructions give in the following link:
 
 * [Scaper installation](https://scaper.readthedocs.io/en/latest/installation.html)
 
-### Download Dependencies
+### Install Dependencies
 
 ```bash
 pip install -r requirements.txt
@@ -30,7 +32,7 @@ on [www.cityscapes-dataset.com](https://www.cityscapes-dataset.com/). After the
 script. During the download, it will ask you to provide your email and password for authentification.
 
 ```bash
-./download_data.sh
+./scripts/download_data.sh
 ```
 
 ## Usage
@@ -41,31 +43,35 @@ finetuned on the Cityscapes dataset.
 
 ### Train Object Segmentation Network
 
-To train the network, run the follwing command.
+To train the network, run the following command. The hyperparameters (number of epochs and batch size) can be configured in the `docker-compose.yml` file. To load a pre-trained model, specify its path in the `MODEL_TO_LOAD` variable; if the variable is `None`, the model is trained from scratch.
 
 ```bash
-python train.py --num_epochs 70 --batch_size 8 --evaluate_every 1 --save_weights_every 1
+docker-compose up train_object_detection
 ```
 
-By default, training resumes from the latest saved checkpoint. If the `checkpoints/` directory is missing, the training
-starts from scratch.
-
 ### Test the Segmentation Network
 
-Run the following command to predict the semantic segmentation of every image in the `test_images/` directory (note:
-results are saved in the `test_segmentations/` directory)
+Run the following command to predict the semantic segmentation of every image in the `--test_images` directory (note:
+predictions are saved with the same name and a `_pred.jpg` suffix). Ensure that you specify the correct image file type in `--test_images_type`.
 
 ```bash
-python predict.py
+docker-compose up predict_object_detection
 ```
 
-Ensure that you specify the image's file type in the image path variable in `predict.py`.
+### Evaluate the Segmentation Network
+To evaluate the segmentation network, run the command below.
+
+```bash
+docker-compose up evaluation
+```
 
 ### Generate soundscapes
 
-Run the file soundGeneration.py to generate soundscapes of every image in the `test_images/` directory (note: results
-are saved in the `soundscapes/` directory). Ensure that you specify the image type of the image in the image path
-variable of `predict.py`.
+To generate soundscapes for every image in the `--test_images` directory, run the following command. The generated audio files are saved in `data/generated_soundscapes/`. Ensure that you specify the correct image file type in `--test_images_type`.
+
+```bash
+docker-compose up sound_generation
+```
 
 ## Results
 
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..75815f3
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,71 @@
+version: "3.2"
+services:
+    train_object_detection:  # runs scripts/train_object_detection.sh in the project image
+        build: .
+        volumes:
+            - ${PWD}:/tf/
+        working_dir: /tf
+        command: bash -c "./scripts/train_object_detection.sh"
+        devices:
+            - /dev/nvidia0
+        environment:
+            NVIDIA_VISIBLE_DEVICES: 0
+            EPOCHS: 70  # forwarded as --num_epochs by scripts/train_object_detection.sh
+            BATCH_SIZE: 8  # forwarded as --batch_size by scripts/train_object_detection.sh
+            MODEL_TO_LOAD: None  # literal string "None" (not YAML null); forwarded as --model_to_load
+        deploy:
+            resources:
+                reservations:
+                    devices:
+                        - capabilities: [ gpu ]
+
+    predict_object_detection:  # runs scripts/predict_object_detection.sh in the project image
+        build: .
+        volumes:
+            - ${PWD}:/tf/
+        working_dir: /tf
+        command: bash -c "./scripts/predict_object_detection.sh"
+        devices:
+            - /dev/nvidia0
+        environment:
+            NVIDIA_VISIBLE_DEVICES: 0
+            WEIGHTS_PATH: None  # literal string "None" (not YAML null); forwarded as --weights by the script
+        deploy:
+            resources:
+                reservations:
+                    devices:
+                        - capabilities: [ gpu ]
+
+    sound_generation:  # runs scripts/sound_generation.sh in the project image
+        build: .
+        volumes:
+            - ${PWD}:/tf/
+        working_dir: /tf
+        command: bash -c "./scripts/sound_generation.sh"
+        devices:
+            - /dev/nvidia0
+        environment:
+            NVIDIA_VISIBLE_DEVICES: 0
+            WEIGHTS_PATH: None  # literal string "None" (not YAML null); forwarded as --weights by the script
+        deploy:
+            resources:
+                reservations:
+                    devices:
+                        - capabilities: [ gpu ]
+
+    evaluation:  # runs scripts/evaluation.sh in the project image
+        build: .
+        volumes:
+            - ${PWD}:/tf/
+        working_dir: /tf
+        command: bash -c "./scripts/evaluation.sh"
+        devices:
+            - /dev/nvidia0
+        environment:
+            NVIDIA_VISIBLE_DEVICES: 0
+            WEIGHTS_PATH: None  # literal string "None"; forwarded as --weights, where evaluation.py maps it to DEFAULT_MODEL
+        deploy:
+            resources:
+                reservations:
+                    devices:
+                        - capabilities: [ gpu ]
diff --git a/experiments/Cityscapes/ERFNet-Pretrained/pretrained.h5 b/experiments/Cityscapes/ERFNet-Pretrained/pretrained.h5
new file mode 100644
index 0000000..e2f3ec8
--- /dev/null
+++ b/experiments/Cityscapes/ERFNet-Pretrained/pretrained.h5
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3addb8d58982ca0362c6e9cfccdd19bac0b40f85df54ff6f924da0e261c91fef
+size 8545920
diff --git a/requirements.txt b/requirements.txt
index 17f1f3e..d826c03 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,15 +1,13 @@
-python==3.8.0
-numpy==1.19.5
-pandas==1.2.4
-scikit-learn==0.24.2
-matplotlib==3.4.2
-scipy==1.6.3 
-sox==1.4.0
-ffmpeg==4.3.1
-ffmpeg-python==0.2.0
+numpy
+pandas
+scikit-learn
+matplotlib
+scipy
+Pillow
+sox
+ffmpeg
+ffmpeg-python
 scaper==1.6.5
-tensorflow==2.5.0
-cudatoolkit==11.0.221
-tensorflow-addons==0.13.0
-pillow==7.1.2
-cityscapesscripts==2.2.0
\ No newline at end of file
+tensorflow>=2.5.0
+tensorflow-addons>=0.13.0
+cityscapesscripts==2.2.0
diff --git a/download_data.sh b/scripts/download_data.sh
similarity index 100%
rename from download_data.sh
rename to scripts/download_data.sh
diff --git a/scripts/evaluation.sh b/scripts/evaluation.sh
new file mode 100755
index 0000000..094ac68
--- /dev/null
+++ b/scripts/evaluation.sh
@@ -0,0 +1,4 @@
+#!/usr/bin/env bash
+# Evaluate the segmentation network. WEIGHTS_PATH is quoted so a path containing spaces stays one argument;
+# when unset or empty it falls back to "None", which soundscape_generation.evaluation maps to its default model.
+LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4 python -m soundscape_generation.evaluation --weights "${WEIGHTS_PATH:-None}"
diff --git a/scripts/install.sh b/scripts/install.sh
new file mode 100755
index 0000000..2f828b0
--- /dev/null
+++ b/scripts/install.sh
@@ -0,0 +1,14 @@
+#!/usr/bin/env bash
+# Install the system packages and Python dependencies of the project (assumes root, e.g. a Docker build).
+
+apt-get update
+
+apt-get -y install git
+# ffmpeg replaces libav-tools, which has no installation candidate on Ubuntu 18.04 and later.
+apt-get -y install ffmpeg
+apt-get -y install libsndfile1-dev
+apt-get -y install libsndfile1
+apt-get -y install libcupti-dev
+
+pip install --upgrade pip
+pip install -r requirements.txt
diff --git a/scripts/predict_object_detection.sh b/scripts/predict_object_detection.sh
new file mode 100755
index 0000000..d62dbd6
--- /dev/null
+++ b/scripts/predict_object_detection.sh
@@ -0,0 +1,4 @@
+#!/usr/bin/env bash
+# Run segmentation prediction. WEIGHTS_PATH is quoted so a path containing spaces stays one argument;
+# when unset or empty it falls back to "None", the value docker-compose.yml passes by default.
+LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4 python -m soundscape_generation.predict --weights "${WEIGHTS_PATH:-None}"
diff --git a/scripts/sound_generation.sh b/scripts/sound_generation.sh
new file mode 100755
index 0000000..e73960f
--- /dev/null
+++ b/scripts/sound_generation.sh
@@ -0,0 +1,4 @@
+#!/usr/bin/env bash
+# Generate soundscapes. WEIGHTS_PATH is quoted so a path containing spaces stays one argument;
+# when unset or empty it falls back to "None", the value docker-compose.yml passes by default.
+LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4 python -m soundscape_generation.sound_generation --weights "${WEIGHTS_PATH:-None}"
diff --git a/scripts/train_object_detection.sh b/scripts/train_object_detection.sh
new file mode 100755
index 0000000..ebeb702
--- /dev/null
+++ b/scripts/train_object_detection.sh
@@ -0,0 +1,4 @@
+#!/usr/bin/env bash
+# Train the segmentation network. Each variable is quoted so it stays a single argument; when unset or
+# empty it falls back to the value docker-compose.yml ships with (70 epochs, batch size 8, "None").
+LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4 python -m soundscape_generation.train --num_epochs "${EPOCHS:-70}" --batch_size "${BATCH_SIZE:-8}" --model_to_load "${MODEL_TO_LOAD:-None}"
diff --git a/setup.py b/setup.py
index c3ce3f8..b300934 100644
--- a/setup.py
+++ b/setup.py
@@ -29,8 +29,8 @@ def parse_requirements(filename):
 
 README_MD = open(join(dirname(abspath(__file__)), "README.md")).read()
 
-setup(name='soundscape-generation',
-      version=get_version('soundscape-generation/__init__.py'),
+setup(name='soundscape_generation',
+      version=get_version('soundscape_generation/__init__.py'),
       author='ABIZ Lab',
       author_email='abiz@hslu.ch',
       description='Generate soundscapes based on images.',
diff --git a/soundscape-generation/eval/evaluation.py b/soundscape-generation/eval/evaluation.py
deleted file mode 100644
index 72a32fd..0000000
--- a/soundscape-generation/eval/evaluation.py
+++ /dev/null
@@ -1,47 +0,0 @@
-import tensorflow as tf
-
-
-def compute_intersection_and_union_in_batch(y_true_labels, y_pred_labels, num_classes):
-    # y_true_labels: (val_batch_size, img_h, img_w)
-    # y_pred_labels: (val_batch_size, img_h, img_w)
-
-    batch_intersection, batch_union = [], []  # for each class, store the sum of intersections and unions in the batch
-
-    for class_label in range(num_classes - 1):  # ignore class 'other'
-        true_equal_class = tf.cast(tf.equal(y_true_labels, class_label), tf.int32)
-        pred_equal_class = tf.cast(tf.equal(y_pred_labels, class_label), tf.int32)
-
-        intersection = tf.reduce_sum(tf.multiply(true_equal_class, pred_equal_class))  # TP (true positives)
-        union = tf.reduce_sum(true_equal_class) + tf.reduce_sum(
-            pred_equal_class) - intersection  # TP + FP + FN = (TP + FP) + (TP + FN) - TP
-
-        batch_intersection.append(intersection)
-        batch_union.append(union)
-
-    return tf.cast(tf.stack(batch_intersection, axis=0), tf.int64), tf.cast(tf.stack(batch_union, axis=0),
-                                                                            tf.int64)  # (19,)
-
-
-def evaluate(dataset, network, val_batch_size, image_size):
-    # Compute IoU on validation set (IoU = Intersection / Union)
-
-    total_intersection = tf.zeros((19), tf.int64)
-    total_union = tf.zeros((19), tf.int64)
-
-    print('Evaluating on validation set...')
-    num_val_batches = dataset.num_val_images // val_batch_size
-    for batch in range(num_val_batches):
-        x, y_true_labels = dataset.get_validation_batch(batch, val_batch_size, image_size)
-
-        y_pred_logits = network(x, is_training=False)
-        y_pred_labels = tf.math.argmax(y_pred_logits, axis=-1, output_type=tf.int32)
-
-        batch_intersection, batch_union = compute_intersection_and_union_in_batch(y_true_labels, y_pred_labels,
-                                                                                  dataset.num_classes)
-        total_intersection += batch_intersection
-        total_union += batch_union
-
-    iou_per_class = tf.divide(total_intersection, total_union)  # IoU for each of the 19 classes
-    iou_mean = tf.reduce_mean(iou_per_class)  # Mean IoU over the 19 classes
-
-    return iou_per_class, iou_mean
diff --git a/soundscape-generation/sound_generation.py b/soundscape-generation/sound_generation.py
deleted file mode 100644
index 0791af5..0000000
--- a/soundscape-generation/sound_generation.py
+++ /dev/null
@@ -1,130 +0,0 @@
-import argparse
-import numpy as np
-import os
-import scaper
-
-import predict
-
-""" Constansts for Scaper """
-# OUTPUT FOLDER
-OUTFOLDER = "soundscapes/"
-# SCAPER SETTINGS
-FG_FOLDER = "soundbank/foreground"
-BG_FOLDER = "soundbank/background"
-REF_DB = -3  # Difference between background and foreground DB
-DURATION = 30.0
-
-MIN_EVENTS = 4
-MAX_EVENTS = 7  # 10 Objects with sound - 3 Background = 7
-
-EVENT_TIME_DIST = 'normal'
-EVENT_TIME_MEAN = 20
-EVENT_TIME_STD = 9
-
-SOURCE_TIME_DIST = 'const'
-SOURCE_TIME = 0.0
-
-EVENT_DURATION_DIST = 'uniform'
-EVENT_DURATION_MIN = 12
-EVENT_DURATION_MAX = 16
-
-SNR_DIST = 'uniform'  # the signal-to-noise ratio (in LUFS) compared to the background (DB Difference).
-SNR_MIN = 3
-SNR_MAX = 5
-
-PITCH_DIST = 'uniform'
-PITCH_MIN = -0.2
-PITCH_MAX = 0.2
-
-TIME_STRETCH_DIST = 'uniform'
-TIME_STRETCH_MIN = 0.5
-TIME_STRETCH_MAX = 1.0
-
-SEED = 123  # Generate a random seed for this Scaper object
-
-
-class SoundGenerator:
-
-    def __init__(self, foreground_sounds, background_sounds, image_names):
-        # Initialisation of Scaper and Object-Detection Container
-        self.sc = scaper.Scaper(DURATION, FG_FOLDER, BG_FOLDER, random_state=SEED)
-        self.sc.protected_labels = []
-        self.sc.ref_db = REF_DB
-        self.detected_foreground_sounds = foreground_sounds
-        self.detected_background_sounds = background_sounds
-        self.image_names = image_names
-
-    # Generate 2 soundscapes using a truncated normal distribution of start times
-    def generate_sound(self, n_soundscapes):
-        for i in range(len(self.image_names)):
-            image_name = self.image_names[i]
-            fg_sound = self.detected_foreground_sounds[i]
-            bg_sound = self.detected_background_sounds[i]
-            all_foreground_sounds_list = os.listdir(FG_FOLDER)
-            all_background_sounds_list = os.listdir(BG_FOLDER)
-            final_fg_sound = [x for x in fg_sound if x in all_foreground_sounds_list]
-            final_bg_sound = [x for x in bg_sound if x in all_background_sounds_list]
-
-            for n in range(len(n_soundscapes)):
-
-                print('Generating soundscape: {:d}/{:d}'.format(n + 1, len(n_soundscapes)))
-
-                # reset the event specifications for foreground and background at the
-                # beginning of each loop to clear all previously added events
-                self.sc.reset_bg_event_spec()
-                self.sc.reset_fg_event_spec()
-
-                # add background
-                self.sc.add_background(label=('choose', final_bg_sound),
-                                       source_file=('choose', []),
-                                       source_time=('normal', 20, 8))
-
-                # add random number of foreground events
-                n_events = np.random.randint(MIN_EVENTS, MAX_EVENTS + 1)
-                for _ in range(n_events):
-                    self.sc.add_event(label=('choose', final_fg_sound),
-                                      source_file=('choose', []),
-                                      source_time=(SOURCE_TIME_DIST, SOURCE_TIME),
-                                      event_time=(EVENT_TIME_DIST, EVENT_TIME_MEAN, EVENT_TIME_STD),
-                                      event_duration=(EVENT_DURATION_DIST, EVENT_DURATION_MIN, EVENT_DURATION_MAX),
-                                      snr=(SNR_DIST, SNR_MIN, SNR_MAX),
-                                      pitch_shift=(None),
-                                      time_stretch=(None))
-
-                # generate
-                audiofile = os.path.join(OUTFOLDER, "{}_soundscape_number_{:d}.wav".format(image_name, n + 1))
-                txtfile = os.path.join(OUTFOLDER, "{}_soundscape_number_{:d}.txt".format(image_name, n + 1))
-
-                self.sc.generate(audiofile,
-                                 allow_repeated_label=True,
-                                 allow_repeated_source=True,
-                                 reverb=0.1,
-                                 disable_sox_warnings=True,
-                                 no_audio=False,
-                                 txt_path=txtfile,
-                                 peak_normalization=True,
-                                 disable_instantiation_warnings=True
-                                 )
-
-                print("Path to output folder: {}".format(OUTFOLDER))
-
-
-if __name__ == "__main__":
-    os.chdir("extended_prototype")
-
-    parser = argparse.ArgumentParser()
-
-    parser.add_argument('--img_height', type=int, default=512, help='Image height after resizing')
-    parser.add_argument('--img_width', type=int, default=1024, help='Image width after resizing')
-    parser.add_argument('--weights', type=str, default="pretrained/pretrained.h5",
-                        help='Relative path of network weights')
-
-    args = parser.parse_args()
-    foreground_objects, background_objects = predict.main(args)
-    image_names = list(foreground_objects.keys())
-    foreground_objects_list = list(foreground_objects.values())
-    background_objects_list = list(background_objects.values())
-    soundscape_generator = SoundGenerator(foreground_sounds=foreground_objects_list,
-                                          background_sounds=background_objects_list, image_names=image_names)
-    number_soundscapes_per_image = range(0, 3)
-    soundscape_generator.generate_sound(number_soundscapes_per_image)
diff --git a/soundscape-generation/__init__.py b/soundscape_generation/__init__.py
similarity index 61%
rename from soundscape-generation/__init__.py
rename to soundscape_generation/__init__.py
index f2799a3..b466289 100644
--- a/soundscape-generation/__init__.py
+++ b/soundscape_generation/__init__.py
@@ -1,7 +1,7 @@
 """
-soundscape-generation.
+soundscape_generation.
 Generate soundscapes based on images.
 """
 
-__version__ = '0.1.0'
+__version__ = '0.1.1'
 __author__ = 'ABIZ Lab'
diff --git a/soundscape-generation/dataset/__init__.py b/soundscape_generation/dataset/__init__.py
similarity index 100%
rename from soundscape-generation/dataset/__init__.py
rename to soundscape_generation/dataset/__init__.py
diff --git a/soundscape-generation/dataset/cityscapes.py b/soundscape_generation/dataset/cityscapes.py
similarity index 98%
rename from soundscape-generation/dataset/cityscapes.py
rename to soundscape_generation/dataset/cityscapes.py
index 48a87d1..b3b8a06 100644
--- a/soundscape-generation/dataset/cityscapes.py
+++ b/soundscape_generation/dataset/cityscapes.py
@@ -3,11 +3,12 @@
 import random
 import tensorflow as tf
 
-from utils import read_image, read_segmentation
+from soundscape_generation.utils.utils import read_image, read_segmentation
 
 
 class CityscapesDataset:
     def __init__(self):
+        self.name = 'Cityscapes'
         self.image_paths = sorted(glob.glob(os.path.join(os.getcwd(), 'data', 'images', 'train', '*', '*.png')))
         self.segmentation_paths = sorted(
             glob.glob(os.path.join(os.getcwd(), 'data', 'segmentations', 'train', '*', '*labelIds.png')))
diff --git a/soundscape-generation/eval/__init__.py b/soundscape_generation/eval/__init__.py
similarity index 100%
rename from soundscape-generation/eval/__init__.py
rename to soundscape_generation/eval/__init__.py
diff --git a/soundscape-generation/soundscapes_evaluation.py b/soundscape_generation/eval/evaluation.py
similarity index 70%
rename from soundscape-generation/soundscapes_evaluation.py
rename to soundscape_generation/eval/evaluation.py
index ac08a3a..ab469d1 100644
--- a/soundscape-generation/soundscapes_evaluation.py
+++ b/soundscape_generation/eval/evaluation.py
@@ -1,66 +1,53 @@
-import argparse
-import glob
 import numpy as np
-import os
 import tensorflow as tf
-import time
-from ERFNet import ERFNet
-from cityscapes import CityscapesDataset
-from utils import read_image
 
-from evaluation import evaluate
+from soundscape_generation.utils.utils import read_image
 
-os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
 
+def compute_intersection_and_union_in_batch(y_true_labels, y_pred_labels, num_classes):
+    # y_true_labels: (val_batch_size, img_h, img_w)
+    # y_pred_labels: (val_batch_size, img_h, img_w)
 
-def main(args):
-    img_h, img_w = args.img_height, args.img_width
-    val_batch_size = args.val_batch_size
-    is_validation_set = args.is_validation_set
+    batch_intersection, batch_union = [], []  # for each class, store the sum of intersections and unions in the batch
 
-    if (is_validation_set):
-        image_paths = sorted(glob.glob(os.path.join(os.getcwd(), 'test_images', '*.png')))
-    else:
-        image_paths = sorted(glob.glob(os.path.join(os.getcwd(), 'test_images', '*.jpg')))
-
-    own_test_set_true = [
-        [1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0],
-        [1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0],
-        [1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1],
-        [1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1],
-        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1],
-        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1],
-        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1],
-        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1],
-        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0],
-        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1],
-        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1],
-        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1],
-        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1],
-        [1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0],
-        [1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1],
-    ]
-
-    dataset = CityscapesDataset()
-
-    print('Creating network and loading weights...')
-    network = ERFNet(dataset.num_classes)
-
-    # Initialize network weights
-    inp_test = tf.random.normal(shape=(1, img_h, img_w, 3))
-    out_test = network(inp_test, is_training=False)
-    print('Shape of network\'s output:', out_test.shape)
-
-    # Load weights and images from given paths
-    weights_path = os.path.join(os.getcwd(), args.weights)
-    network.load_weights(weights_path)
-    print('Weights from {} loaded correctly.'.format(weights_path))
-    # iou_per_class, iou_mean = evaluate(dataset, network, val_batch_size, (img_h, img_w))
-    # print("iou_per_class: {}, iou_mean: {}".format(iou_per_class, iou_mean))
-    get_total_percision(dataset, network, val_batch_size, (img_h, img_w), is_validation_set, own_test_set_true,
-                        image_paths)
-    get_total_recall(dataset, network, val_batch_size, (img_h, img_w), is_validation_set, own_test_set_true,
-                     image_paths)
+    for class_label in range(num_classes - 1):  # ignore class 'other'
+        true_equal_class = tf.cast(tf.equal(y_true_labels, class_label), tf.int32)
+        pred_equal_class = tf.cast(tf.equal(y_pred_labels, class_label), tf.int32)
+
+        intersection = tf.reduce_sum(tf.multiply(true_equal_class, pred_equal_class))  # TP (true positives)
+        union = tf.reduce_sum(true_equal_class) + tf.reduce_sum(
+            pred_equal_class) - intersection  # TP + FP + FN = (TP + FP) + (TP + FN) - TP
+
+        batch_intersection.append(intersection)
+        batch_union.append(union)
+
+    return tf.cast(tf.stack(batch_intersection, axis=0), tf.int64), tf.cast(tf.stack(batch_union, axis=0),
+                                                                            tf.int64)  # (19,)
+
+
+def evaluate(dataset, network, val_batch_size, image_size):
+    """Return (iou_per_class, iou_mean) over the validation batches (IoU = Intersection / Union).
+
+    Images beyond the last full batch are not evaluated because the batch count is floored.
+    """
+    n_scored = dataset.num_classes - 1  # compute_intersection_and_union_in_batch skips the last class
+    total_intersection = tf.zeros((n_scored,), tf.int64)
+    total_union = tf.zeros((n_scored,), tf.int64)
+
+    print('Evaluating on validation set...')
+    num_val_batches = dataset.num_val_images // val_batch_size
+    for batch in range(num_val_batches):
+        x, y_true_labels = dataset.get_validation_batch(batch, val_batch_size, image_size)
+        y_pred_logits = network(x, is_training=False)
+        y_pred_labels = tf.math.argmax(y_pred_logits, axis=-1, output_type=tf.int32)
+        batch_intersection, batch_union = compute_intersection_and_union_in_batch(y_true_labels, y_pred_labels,
+                                                                                  dataset.num_classes)
+        total_intersection += batch_intersection
+        total_union += batch_union
+
+    # NOTE(review): a class absent from both truth and prediction gives 0/0 in this division.
+    iou_per_class = tf.divide(total_intersection, total_union)  # IoU for each scored class
+    iou_mean = tf.reduce_mean(iou_per_class)  # Mean IoU over the scored classes
+    return iou_per_class, iou_mean
 
 
 def get_total_percision(dataset, network, val_batch_size, image_size, is_validation_set, own_test_set_true,
@@ -85,9 +72,9 @@ def get_total_percision(dataset, network, val_batch_size, image_size, is_validat
         print('Total Precistion on validation set is {}'.format(total_set_precision))
         return total_set_precision
     else:
-        print()
         test_set_true_counter = 0
         for image_path in image_paths:
+            print('-'*20 + image_path + '-'*20)
             image = read_image(image_path, image_size)
             x = tf.expand_dims(image, axis=0)
             y_pred_logits = network(x, is_training=False)  # (1, img_h, img_w, num_classes)
@@ -98,9 +85,9 @@ def get_total_percision(dataset, network, val_batch_size, image_size, is_validat
             total_tp_and_fp += tp_and_fp_batch
             batchprecision = tf.divide(tp_batch, tp_and_fp_batch)
             test_set_true_counter += 1
-            print('Precistion from image {}: {}.'.format(image_path, batchprecision))
-            print()
+            print('Precistion: {}'.format(batchprecision))
         total_set_precision = tf.divide(total_tp, total_tp_and_fp)
+        print('-'*20 + 'TOTAL PRECISION' + '-'*20)
         print('Total Precistion on own test set is {}'.format(total_set_precision))
         return total_set_precision
 
@@ -176,9 +163,9 @@ def get_total_recall(dataset, network, val_batch_size, image_size, is_validation
         print('Total Recall on validation set is {}'.format(total_set_recall))
         return total_set_recall
     else:
-        print()
         test_set_true_counter = 0
         for image_path in image_paths:
+            print('-'*20 + image_path + '-'*20)
             image = read_image(image_path, image_size)
             x = tf.expand_dims(image, axis=0)
             y_pred_logits = network(x, is_training=False)  # (1, img_h, img_w, num_classes)
@@ -189,9 +176,9 @@ def get_total_recall(dataset, network, val_batch_size, image_size, is_validation
             total_tp_and_fn += tp_and_fn_batch
             batchrecall = tf.divide(tp_batch, tp_and_fn_batch)
             test_set_true_counter += 1
-            print('Recall from image {}: {}.'.format(image_path, batchrecall))
-            print()
+            print('Recall: {}'.format(batchrecall))
         total_set_recall = tf.divide(total_tp, total_tp_and_fn)
+        print('-'*20 + 'TOTAL RECALL' + '-'*20)
         print('Total Recall on own test set is {}'.format(total_set_recall))
         return total_set_recall
 
@@ -244,18 +231,3 @@ def get_recall_in_batch(y_true, y_pred, num_classes, is_validation_set):
     tp_batch.append(tp)
     tp_and_fn_batch.append(tp + fn)
     return tp_batch, tp_and_fn_batch
-
-
-if __name__ == '__main__':
-    os.chdir("extended_prototype")
-
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--is_validation_set', type=bool, default=False, help='Evaluation on validation or own dataset')
-    parser.add_argument('--img_height', type=int, default=512, help='Image height after resizing')
-    parser.add_argument('--img_width', type=int, default=1024, help='Image width after resizing')
-    parser.add_argument('--val_batch_size', type=int, default=1, help='Batch size for validation')
-    parser.add_argument('--weights', type=str, default="pretrained/pretrained.h5",
-                        help='Relative path of network weights')
-    args = parser.parse_args()
-
-    main(args)
diff --git a/soundscape_generation/evaluation.py b/soundscape_generation/evaluation.py
new file mode 100644
index 0000000..b3e6ff4
--- /dev/null
+++ b/soundscape_generation/evaluation.py
@@ -0,0 +1,79 @@
+import argparse
+import glob
+import os
+import tensorflow as tf
+from soundscape_generation.models.ERFNet import ERFNet
+from soundscape_generation.dataset.cityscapes import CityscapesDataset
+from soundscape_generation.eval.evaluation import evaluate, get_total_recall, get_total_percision
+
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
+
+DEFAULT_MODEL = "experiments/Cityscapes/ERFNet-Pretrained/pretrained.h5"
+
+
+def main(args):
+    """Load ERFNet weights, then print IoU (Cityscapes validation batches), precision and recall."""
+    img_h, img_w = args.img_height, args.img_width
+    val_batch_size = args.val_batch_size
+    is_validation_set = args.is_validation_set
+
+    # Skip '*_pred.<ext>' outputs by suffix: a '[!_pred]' glob class would also drop stems ending in _, p, r, e or d.
+    suffix = '.{}'.format(args.test_images_type)
+    image_paths = sorted(path for path in glob.glob(os.path.join(os.getcwd(), args.test_images, '*' + suffix))
+                         if not path.endswith('_pred' + suffix))
+
+    own_test_set_true = [
+        [1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0],
+        [1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0],
+        [1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1],
+        [1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1],
+        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1],
+        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1],
+        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1],
+        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1],
+        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0],
+        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1],
+        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1],
+        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1],
+        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1],
+        [1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0],
+        [1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1],
+    ]
+
+    dataset = CityscapesDataset()
+    print('Creating network and loading weights...')
+    network = ERFNet(dataset.num_classes)
+
+    # Initialize network weights
+    inp_test = tf.random.normal(shape=(1, img_h, img_w, 3))
+    out_test = network(inp_test, is_training=False)
+    print('Shape of network\'s output:', out_test.shape)
+
+    # "None" (the CLI default) selects DEFAULT_MODEL; compared as text because eval() fails on a real path.
+    if args.weights is None or args.weights == 'None':
+        args.weights = DEFAULT_MODEL
+    weights_path = os.path.join(os.getcwd(), args.weights)
+    network.load_weights(weights_path)
+    print('Weights from {} loaded correctly.'.format(weights_path))
+
+    print('*'*20 + 'IOU' + '*'*20)
+    iou_per_class, iou_mean = evaluate(dataset, network, val_batch_size, (img_h, img_w))
+    print("iou_per_class: {}\niou_mean: {}".format(iou_per_class, iou_mean))
+    print('*'*20 + 'PRECISION' + '*'*20)
+    get_total_percision(dataset, network, val_batch_size, (img_h, img_w), is_validation_set, own_test_set_true, image_paths)
+    print('*'*20 + 'RECALL' + '*'*20)
+    get_total_recall(dataset, network, val_batch_size, (img_h, img_w), is_validation_set, own_test_set_true, image_paths)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    # type=bool would turn every non-empty value (even "False") into True, so the text is parsed explicitly.
+    parser.add_argument('--is_validation_set', type=lambda s: s.strip().lower() in ('true', '1', 'yes'), default=False, help='Evaluation on validation or own dataset')
+    parser.add_argument('--img_height', type=int, default=512, help='Image height after resizing')
+    parser.add_argument('--img_width', type=int, default=1024, help='Image width after resizing')
+    parser.add_argument('--val_batch_size', type=int, default=1, help='Batch size for validation')
+    parser.add_argument('--weights', type=str, default="None", help='Relative path of network weights')
+    parser.add_argument('--test_images', type=str, default="data/test_images/", help='Relative path of the test images')
+    parser.add_argument('--test_images_type', type=str, default="jpg", help='Test image types')
+    args = parser.parse_args()
+    main(args)
diff --git a/soundscape_generation/generation/sound_generation.py b/soundscape_generation/generation/sound_generation.py
new file mode 100644
index 0000000..138c59e
--- /dev/null
+++ b/soundscape_generation/generation/sound_generation.py
@@ -0,0 +1,112 @@
+import numpy as np
+import os
+import scaper
+
+from soundscape_generation.utils.utils import create_folder
+
+
+class SoundGenerator:
+    """Build soundscapes with Scaper from the sound labels detected for each image.
+
+    Per image, the detected foreground/background labels are reduced to those found in
+    the soundbank folders; every soundscape is written to OUTFOLDER as .wav plus .txt.
+    """
+    # OUTPUT FOLDER
+    OUTFOLDER = "data/generated_soundscapes/"
+
+    # SCAPER SETTINGS
+    FG_FOLDER = "data/soundbank/foreground"
+    BG_FOLDER = "data/soundbank/background"
+    REF_DB = -3  # Difference between background and foreground DB
+    DURATION = 30.0
+
+    MIN_EVENTS = 4
+    MAX_EVENTS = 7  # 10 Objects with sound - 3 Background = 7
+
+    EVENT_TIME_DIST = 'normal'
+    EVENT_TIME_MEAN = 20
+    EVENT_TIME_STD = 9
+    SOURCE_TIME_DIST = 'const'
+    SOURCE_TIME = 0.0
+    EVENT_DURATION_DIST = 'uniform'
+    EVENT_DURATION_MIN = 12
+    EVENT_DURATION_MAX = 16
+
+    SNR_DIST = 'uniform'  # the signal-to-noise ratio (in LUFS) compared to the background (DB Difference).
+    SNR_MIN = 3
+    SNR_MAX = 5
+
+    # NOTE: the pitch and time-stretch settings are defined but not used by generate_sound
+    PITCH_DIST = 'uniform'
+    PITCH_MIN = -0.2
+    PITCH_MAX = 0.2
+    TIME_STRETCH_DIST = 'uniform'
+    TIME_STRETCH_MIN = 0.5
+    TIME_STRETCH_MAX = 1.0
+
+    SEED = 123  # fixed seed, passed to Scaper as random_state
+
+    def __init__(self, foreground_sounds, background_sounds, image_names):
+        """Create the Scaper object, keep the detections and create the output folder.
+
+        :param foreground_sounds: per image, the detected foreground labels.
+        :param background_sounds: per image, the detected background labels.
+        :param image_names: the image file names, in the same order as both label lists.
+        """
+        self.sc = scaper.Scaper(self.DURATION, self.FG_FOLDER, self.BG_FOLDER, random_state=self.SEED)
+        self.sc.protected_labels = []
+        self.sc.ref_db = self.REF_DB
+        self.detected_foreground_sounds = foreground_sounds
+        self.detected_background_sounds = background_sounds
+        self.image_names = image_names
+        create_folder(os.path.join(os.getcwd(), self.OUTFOLDER))
+
+    def generate_sound(self, n_soundscapes):
+        """Generate the soundscapes of every image and print the path of each audio file.
+
+        :param n_soundscapes: soundscapes per image, as an int or as a sized iterable such as range(3).
+        """
+        print('*'*20 + 'START GENERATION' + '*'*20)
+        if not self.image_names:
+            return
+        n_per_image = n_soundscapes if isinstance(n_soundscapes, int) else len(n_soundscapes)
+        # the soundbank folders are the same for every image, so they are listed only once
+        available_fg = set(os.listdir(self.FG_FOLDER))
+        available_bg = set(os.listdir(self.BG_FOLDER))
+
+        for i, file_name in enumerate(self.image_names):
+            # strip only the extension: split('.')[0] would truncate names that contain dots
+            image_name = os.path.splitext(file_name)[0].strip()
+            print('-'*20 + 'GENERATION: {}'.format(image_name) + '-'*20)
+            # keep only the detected labels that exist in the soundbank
+            final_fg_sound = [x for x in self.detected_foreground_sounds[i] if x in available_fg]
+            final_bg_sound = [x for x in self.detected_background_sounds[i] if x in available_bg]
+
+            for n in range(n_per_image):
+                # clear the background and foreground events added for the previous soundscape
+                self.sc.reset_bg_event_spec()
+                self.sc.reset_fg_event_spec()
+                # add background
+                self.sc.add_background(label=('choose', final_bg_sound),
+                                       source_file=('choose', []),
+                                       source_time=('normal', 20, 8))
+                # add a random number of foreground events (drawn from NumPy's global RNG, which is not seeded here)
+                n_events = np.random.randint(self.MIN_EVENTS, self.MAX_EVENTS + 1)
+                for _ in range(n_events):
+                    self.sc.add_event(label=('choose', final_fg_sound),
+                                      source_file=('choose', []),
+                                      source_time=(self.SOURCE_TIME_DIST, self.SOURCE_TIME),
+                                      event_time=(self.EVENT_TIME_DIST, self.EVENT_TIME_MEAN, self.EVENT_TIME_STD),
+                                      event_duration=(self.EVENT_DURATION_DIST, self.EVENT_DURATION_MIN, self.EVENT_DURATION_MAX),
+                                      snr=(self.SNR_DIST, self.SNR_MIN, self.SNR_MAX),
+                                      pitch_shift=None,
+                                      time_stretch=None)
+                # define the output files (numbered from 1)
+                audiofile = os.path.join(self.OUTFOLDER, "{}_soundscape_number_{:d}.wav".format(image_name, n + 1))
+                txtfile = os.path.join(self.OUTFOLDER, "{}_soundscape_number_{:d}.txt".format(image_name, n + 1))
+                # generate the soundscape
+                self.sc.generate(audiofile, txt_path=txtfile, no_audio=False, reverb=0.1,
+                                 allow_repeated_label=True, allow_repeated_source=True,
+                                 peak_normalization=True, disable_sox_warnings=True,
+                                 disable_instantiation_warnings=True)
+                print('Generated soundscape: {}'.format(audiofile))
diff --git a/soundscape-generation/loss/__init__.py b/soundscape_generation/loss/__init__.py
similarity index 100%
rename from soundscape-generation/loss/__init__.py
rename to soundscape_generation/loss/__init__.py
diff --git a/soundscape-generation/loss/losses.py b/soundscape_generation/loss/losses.py
similarity index 100%
rename from soundscape-generation/loss/losses.py
rename to soundscape_generation/loss/losses.py
diff --git a/soundscape-generation/models/ERFNet.py b/soundscape_generation/models/ERFNet.py
similarity index 99%
rename from soundscape-generation/models/ERFNet.py
rename to soundscape_generation/models/ERFNet.py
index d6c0320..fb3287c 100644
--- a/soundscape-generation/models/ERFNet.py
+++ b/soundscape_generation/models/ERFNet.py
@@ -114,6 +114,7 @@ def call(self, inp, is_training=True):
 class ERFNet(tf.keras.Model):
     def __init__(self, num_classes):
         super(ERFNet, self).__init__()
+        self.model_name = 'ERFNet'
         self.encoder = Encoder()
         self.decoder = Decoder(num_classes)
 
diff --git a/soundscape-generation/models/__init__.py b/soundscape_generation/models/__init__.py
similarity index 100%
rename from soundscape-generation/models/__init__.py
rename to soundscape_generation/models/__init__.py
diff --git a/soundscape-generation/predict.py b/soundscape_generation/predict.py
similarity index 77%
rename from soundscape-generation/predict.py
rename to soundscape_generation/predict.py
index 7274cd5..578666d 100644
--- a/soundscape-generation/predict.py
+++ b/soundscape_generation/predict.py
@@ -5,25 +5,22 @@
 import os
 import tensorflow as tf
 import time
-from ERFNet import ERFNet
 from PIL import Image
-from cityscapes import CityscapesDataset
 from operator import itemgetter
-from utils import read_image
+from soundscape_generation.models.ERFNet import ERFNet
+from soundscape_generation.dataset.cityscapes import CityscapesDataset
+from soundscape_generation.utils.utils import read_image
 
 os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
 
 foreground_sounds_dict, background_sounds_dict = {}, {}
+DEFAULT_MODEL = "experiments/Cityscapes/ERFNet-Pretrained/pretrained.h5"
 
 
 def main(args):
     img_h_orig, img_w_orig = 1024, 2048  # original size of images in Cityscapes dataset
     img_h, img_w = args.img_height, args.img_width
 
-    if not os.path.exists('extended_prototype/test_segmentations'):
-        os.makedirs('extended_prototype/test_segmentations')
-        print('test_segmentations directory created.')
-
     dataset = CityscapesDataset()
 
     print('Creating network and loading weights...')
@@ -35,14 +32,18 @@ def main(args):
     print('Shape of network\'s output:', out_test.shape)
 
     # Load weights and images from given paths
+    args.weights = DEFAULT_MODEL if args.weights in (None, "None") else args.weights  # "None" is the CLI default; eval() raised on real paths
+    pred_suffix = '_pred.{}'.format(args.test_images_type)  # suffix of the segmentations saved below; they must not be read back as inputs
     weights_path = os.path.join(os.getcwd(), args.weights)
-    image_paths = sorted(glob.glob(os.path.join(os.getcwd(), 'test_images', '*.png')))  # Specify Image file type
+    image_paths = sorted(p for p in glob.glob(os.path.join(os.getcwd(), args.test_images, '*.{}'.format(args.test_images_type))) if not p.endswith(pred_suffix))  # "[!_pred]" only excluded one character
 
     network.load_weights(weights_path)
     print('Weights from {} loaded correctly.'.format(weights_path))
+    print('*'*20 + 'START PREDICTION' + '*'*20)
 
     inference_times = []
     for image_path in image_paths:
+        print('-'*20 + image_path + '-'*20)
         t0 = time.time()
 
         image = read_image(image_path, (img_h, img_w))
@@ -57,27 +58,24 @@ def main(args):
         t1 = time.time()
 
         # Save segmentation
-        save_path = image_path.replace('test_images', 'test_segmentations')
+        save_path = image_path.replace('.{}'.format(args.test_images_type), '_pred.{}'.format(args.test_images_type))
         segmentation = Image.fromarray(y_pred_colors.numpy())
         segmentation.save(save_path)
 
-        print()
-        print('Segmentation of image\n {}\nsaved in\n {}.'.format(image_path, save_path))
         inference_times.append(t1 - t0)
 
         # Print detected_objects
-        print()
         _, tail = os.path.split(image_path)
         foreground_sounds, background_sounds = get_sounds(image_path, save_path, y_pred_labels, dataset)
         foreground_sounds_dict[tail] = foreground_sounds
         background_sounds_dict[tail] = background_sounds
     mean_inference_time = sum(inference_times) / len(inference_times)
-    print('\nAverage inference time: {:.3f} s'.format(mean_inference_time))
+    print('-'*20 + 'PREDICTION STATS' + '-'*20)
+    print('Average inference time: {:.3f} s'.format(mean_inference_time))
     return foreground_sounds_dict, background_sounds_dict
 
 
 def get_sounds(image_path, save_path, y_pred_labels, dataset):
-    print('Prediction of image\n{}'.format(image_path, save_path))
     unique, counts = np.unique(y_pred_labels.numpy(), return_counts=True)
     detected_objects_dict = dict(zip(unique, counts))
     topitems = heapq.nlargest(1, detected_objects_dict.items(), key=itemgetter(1))
@@ -101,13 +99,13 @@ def get_sounds(image_path, save_path, y_pred_labels, dataset):
 
 
 if __name__ == '__main__':
-    os.chdir("extended_prototype")
     parser = argparse.ArgumentParser()
 
     parser.add_argument('--img_height', type=int, default=512, help='Image height after resizing')
     parser.add_argument('--img_width', type=int, default=1024, help='Image width after resizing')
-    parser.add_argument('--weights', type=str, default="pretrained/pretrained.h5",
-                        help='Relative path of network weights')
+    parser.add_argument('--weights', type=str, default="None", help='Relative path of network weights')
+    parser.add_argument('--test_images', type=str, default="data/test_images/", help='Relative path of the test images')
+    parser.add_argument('--test_images_type', type=str, default="jpg", help='Test image types')
 
     args = parser.parse_args()
     main(args)
diff --git a/soundscape_generation/sound_generation.py b/soundscape_generation/sound_generation.py
new file mode 100644
index 0000000..5257f78
--- /dev/null
+++ b/soundscape_generation/sound_generation.py
@@ -0,0 +1,31 @@
+import argparse
+import warnings
+
+import soundscape_generation.predict as predict
+from soundscape_generation.generation.sound_generation import SoundGenerator
+
+warnings.filterwarnings(action='ignore')
+
+
+if __name__ == "__main__":
+    # command-line options; the parsed namespace is handed to predict.main as-is
+    cli = argparse.ArgumentParser()
+    for flag, kind, fallback, text in (
+            ('--img_height', int, 512, 'Image height after resizing'),
+            ('--img_width', int, 1024, 'Image width after resizing'),
+            ('--weights', str, "None", 'Relative path of network weights'),
+            ('--test_images', str, "data/test_images/", 'Relative path of the test images'),
+            ('--test_images_type', str, "jpg", 'Test image types'),
+    ):
+        cli.add_argument(flag, type=kind, default=fallback, help=text)
+    args = cli.parse_args()
+
+    # predict the test images to get, per image name, the objects in fore- and background
+    fg_by_image, bg_by_image = predict.main(args)
+
+    # instantiate the sound generator with the names and detections in the dictionaries' order
+    generator = SoundGenerator(foreground_sounds=list(fg_by_image.values()),
+                               background_sounds=list(bg_by_image.values()),
+                               image_names=list(fg_by_image))
+    # generate for each image 3 different soundscapes
+    generator.generate_sound(range(3))
diff --git a/soundscape-generation/train.py b/soundscape_generation/train.py
similarity index 64%
rename from soundscape-generation/train.py
rename to soundscape_generation/train.py
index a20100b..fcf34d8 100644
--- a/soundscape-generation/train.py
+++ b/soundscape_generation/train.py
@@ -1,33 +1,36 @@
 import argparse
-import math
 import numpy as np
 import os
-import random
 import tensorflow as tf
 import tensorflow_addons as tfa
 import time
-from ERFNet import ERFNet
-from cityscapes import CityscapesDataset
-from evaluation import evaluate
-from losses import weighted_cross_entropy_loss
+from soundscape_generation.models.ERFNet import ERFNet
+from soundscape_generation.dataset.cityscapes import CityscapesDataset
+from soundscape_generation.eval.evaluation import evaluate
+from soundscape_generation.loss.losses import weighted_cross_entropy_loss
+from soundscape_generation.utils.utils import create_folder_for_experiment, set_gpu_experimental_growth
 
 os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
 
 
 def main(args):
-    if not os.path.exists('saved_weights'):
-        os.makedirs('saved_weights')
-        print('saved_weights directory created.')
+    set_gpu_experimental_growth()
 
     num_epochs = args.num_epochs
     batch_size = args.batch_size
     val_batch_size = args.val_batch_size
     img_h, img_w = args.img_height, args.img_width
 
-    dataset = CityscapesDataset()  # create dataset
-    network = ERFNet(dataset.num_classes)  # create network
+    # create dataset and model
+    dataset = CityscapesDataset()
+    network = ERFNet(dataset.num_classes)
 
-    # Initialize weights of the network
+    # reuse the given experiment folder, else start a new one (None/"None" = not given; eval(None) raised TypeError)
+    experiment_path = (os.path.join(os.getcwd(), args.model_to_load) if args.model_to_load not in (None, "None")
+                       else create_folder_for_experiment(network.model_name, dataset.name))
+    os.makedirs(os.path.join(experiment_path, 'saved_weights'), exist_ok=True)  # weights-*.h5 are saved here; nothing else creates it
+
+    # initialize weights of the network
     inp_test = tf.random.normal(shape=(batch_size, img_h, img_w, 3))
     out_test = network(inp_test, is_training=False)
     print('Network created. Output shape: {}.'.format(out_test.shape))
@@ -37,14 +40,15 @@ def main(args):
     total_update_steps = num_epochs * num_batches_per_epoch
     print('Total update steps:', total_update_steps)
 
-    # Define optimizer (with learning rate schedule)
+    # define optimizer (with learning rate schedule)
     lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay(initial_learning_rate=8e-4,
-                                                                decay_steps=total_update_steps, end_learning_rate=0.0,
+                                                                decay_steps=total_update_steps,
+                                                                end_learning_rate=0.0,
                                                                 power=0.9)
     opt = tfa.optimizers.AdamW(learning_rate=lr_schedule, weight_decay=1e-4)
 
-    # Manage checkpoints (keep track of network weights, optimizer state, and last epoch id)
-    ckpt_path = os.path.join(os.getcwd(), 'checkpoints')
+    # manage checkpoints (keep track of network weights, optimizer state, and last epoch id)
+    ckpt_path = os.path.join(experiment_path, 'checkpoints')
     ckpt = tf.train.Checkpoint(network=network, opt=opt, last_saved_epoch=tf.Variable(-1))
     ckpt_manager = tf.train.CheckpointManager(ckpt, ckpt_path, max_to_keep=3)
 
@@ -57,54 +61,36 @@ def main(args):
         initial_epoch = 0
         print('No checkpoint restored. Training from scratch.')
 
-    @tf.function
-    def train_step(x, y_true_labels):
-        print('Tracing training step...')
-
-        # Forward pass
-        with tf.GradientTape() as tape:
-            y_pred_logits = network(x)  # (batch_size, img_h, img_w, num_classes)
-            loss = weighted_cross_entropy_loss(y_true_labels, y_pred_logits, dataset.class_weights)
-
-        # Backward pass
-        grads = tape.gradient(loss, network.trainable_variables)
-        opt.apply_gradients(zip(grads, network.trainable_variables))
-
-        return loss
-
-    # Training
+    # training
     start = time.time()
     for epoch in range(initial_epoch, num_epochs):
         dataset.shuffle_training_paths()
         for batch in range(num_batches_per_epoch):
             x, y_true_labels = dataset.get_training_batch(batch, batch_size, (img_h, img_w))
-            loss = train_step(x, y_true_labels)
+            loss = train_step(network, dataset, opt, x, y_true_labels)
 
-            # Print information about the current batch
+            # print information about the current batch
             if (batch + 1) % args.print_every == 0:
                 current_step = int(opt.iterations)
                 current_lr = opt.learning_rate(current_step)
                 elapsed_time = time.time() - start
                 print('[Epoch {}/{}. Batch {}/{}]'.format(epoch + 1, num_epochs, batch + 1, num_batches_per_epoch),
                       end=' ')
-                print(
-                    'Training batch loss: {:.2f}. Elapsed time: {:.1f} s. Schedule: (step {}, lr {:.1e}).'.format(loss,
-                                                                                                                  elapsed_time,
-                                                                                                                  current_step,
-                                                                                                                  current_lr))
+                print('Training batch loss: {:.2f}. Elapsed time: {:.1f} s. Schedule: (step {}, lr {:.1e}).'.format(
+                    loss, elapsed_time, current_step, current_lr))
 
-        # Save training checkpoint after each epoch
+        # save training checkpoint after each epoch
         ckpt.last_saved_epoch.assign_add(1)
         ckpt_save_path = ckpt_manager.save()
         print('Checkpoint saved at {}.'.format(ckpt_save_path))
 
-        # Save current weights as a h5 file
+        # save current weights as a h5 file
         if (epoch + 1) % args.save_weights_every == 0:
-            h5_save_path = os.path.join(os.getcwd(), 'saved_weights', 'weights-{}.h5'.format(epoch + 1))
+            h5_save_path = os.path.join(experiment_path, 'saved_weights', 'weights-{}.h5'.format(epoch + 1))
             network.save_weights(h5_save_path)
             print('Weights saved at {}.'.format(h5_save_path))
 
-        # Compute the IoU score on validation set
+        # compute the IoU score on validation set
         if (epoch + 1) % args.evaluate_every == 0:
             iou_per_class, iou_mean = evaluate(dataset, network, val_batch_size, (img_h, img_w))
             print('Mean IoU: {:.4f}'.format(iou_mean))
@@ -112,6 +98,22 @@ def train_step(x, y_true_labels):
                 print('IoU per class: {}'.format(iou_per_class.numpy()))
 
 
+@tf.function
+def train_step(model, dataset, opt, x, y_true_labels):
+    """Run one training step on a batch and return the batch loss."""
+    # Python-level print: shown while the function is traced, not on every call
+    print('Tracing training step...')
+
+    # forward pass, recorded so the loss can be differentiated
+    with tf.GradientTape() as recorder:
+        batch_loss = weighted_cross_entropy_loss(y_true_labels, model(x), dataset.class_weights)
+
+    # backward pass: the optimizer applies the gradients to the trainable variables
+    variables = model.trainable_variables
+    opt.apply_gradients(zip(recorder.gradient(batch_loss, variables), variables))
+    return batch_loss
+
+
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
 
@@ -124,6 +126,7 @@ def train_step(x, y_true_labels):
     parser.add_argument('--print_every', type=int, default=5, help='Number of batches between batch logs')
     parser.add_argument('--evaluate_every', type=int, default=1, help='Number of epochs between evaluations')
     parser.add_argument('--save_weights_every', type=int, default=1, help='Number of epochs between saves')
+    parser.add_argument('--model_to_load', type=str, default=None, help='Path of the pre-trained model that should be loaded')
 
     args = parser.parse_args()
     main(args)
diff --git a/soundscape-generation/utils/__init__.py b/soundscape_generation/utils/__init__.py
similarity index 100%
rename from soundscape-generation/utils/__init__.py
rename to soundscape_generation/utils/__init__.py
diff --git a/soundscape-generation/utils/utils.py b/soundscape_generation/utils/utils.py
similarity index 55%
rename from soundscape-generation/utils/utils.py
rename to soundscape_generation/utils/utils.py
index ad57dc7..77ad5a0 100644
--- a/soundscape-generation/utils/utils.py
+++ b/soundscape_generation/utils/utils.py
@@ -1,4 +1,46 @@
+import os
 import tensorflow as tf
+from datetime import datetime
+
+
+def create_folder(path):
+    """
+    Creates a folder, including missing parent folders, at a given path.
+    :param path: the folder path to create.
+    :return: the given path (unchanged), whether or not it had to be created.
+    """
+    if not os.path.exists(path):
+        # makedirs handles nested paths; exist_ok covers a folder created concurrently
+        os.makedirs(path, exist_ok=True)
+    return path
+
+
+def create_folder_for_experiment(model_name, dataset_name):
+    """Create and return experiments/<dataset_name>/<model_name>-<timestamp> below the working directory."""
+    # create experiment folder for current dataset, one level at a time
+    experiment_path = os.getcwd()
+    for level in ('experiments', dataset_name):
+        experiment_path = create_folder(os.path.join(experiment_path, level))
+
+    # create current experiment folder, named after the model and the current time
+    run_name = "{0}-{1}".format(model_name, datetime.now().strftime("%Y%m%d-%H%M%S"))
+    experiment_path = create_folder(os.path.join(experiment_path, run_name))
+    print('Created experiment path at: {}'.format(experiment_path))
+    return experiment_path
+
+
+def set_gpu_experimental_growth():
+    """Enable memory growth on every physical GPU TensorFlow lists; do nothing when there is none."""
+    physical = tf.config.experimental.list_physical_devices('GPU')
+    if not physical:
+        return
+    try:
+        for device in physical:  # Currently, memory growth needs to be the same across GPUs
+            tf.config.experimental.set_memory_growth(device, True)
+        logical = tf.config.experimental.list_logical_devices('GPU')
+        print(len(physical), "Physical GPUs,", len(logical), "Logical GPUs")
+    except RuntimeError as err:  # Memory growth must be set before GPUs have been initialized
+        print(err)
 
 
 def normalize_image(img):  # map pixel intensities to float32 in [-1, 1]