diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..932f948
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,9 @@
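+# Python build artifacts, experiment outputs, core dumps, and editor settings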
+*.pyc
+__pycache__/
+*.egg-info/
+outputs/
+trained_models/*.ckpt
+monoscene_output/*
+core*
+images
+.vscode
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..261eeb9
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..20a532a
--- /dev/null
+++ b/README.md
@@ -0,0 +1,308 @@
+# MonoScene: Monocular 3D Semantic Scene Completion
+
+**MonoScene: Monocular 3D Semantic Scene Completion** \[[arXiv + supp](https://arxiv.org/abs/2112.00726)\] | \[[Project page](https://cv-rits.github.io/MonoScene/)\] \
+[Anh-Quan Cao](https://anhquancao.github.io),
+[Raoul de Charette](https://team.inria.fr/rits/membres/raoul-de-charette/)
+Inria, Paris, France
+
+If you find this work useful, please cite our [paper](https://arxiv.org/abs/2112.00726):
+```
+@misc{cao2021monoscene,
+ title={MonoScene: Monocular 3D Semantic Scene Completion},
+ author={Anh-Quan Cao and Raoul de Charette},
+ year={2021},
+ eprint={2112.00726},
+ archivePrefix={arXiv},
+ primaryClass={cs.CV}
+}
+```
+
+# Teaser
+
+| SemanticKITTI | KITTI-360 (Trained on SemanticKITTI) |
+|:------------:|:------:|
+| *(demo GIF)* | *(demo GIF)* |
+
+**NYUv2** *(demo GIF)*
+
+
+# Table of Contents
+- [Preparing MonoScene](#preparing-monoscene)
+  - [Installation](#installation)
+  - [Datasets](#datasets)
+  - [Pretrained models](#pretrained-models)
+- [Running MonoScene](#running-monoscene)
+  - [Training](#training)
+  - [Evaluating](#evaluating)
+- [Inference & Visualization](#inference--visualization)
+  - [Inference](#inference)
+  - [Visualization](#visualization)
+- [License](#license)
+
+
+# Preparing MonoScene
+
+## Installation
+
+1. Create the conda environment:
+
+```
+$ conda create -y -n monoscene python=3.7
+$ conda activate monoscene
+```
+
+2. This code was implemented with Python 3.7, PyTorch 1.7.1, and CUDA 10.2. Please install [PyTorch](https://pytorch.org/):
+
+```
+$ conda install pytorch==1.7.1 torchvision==0.8.2 torchaudio==0.7.2 cudatoolkit=10.2 -c pytorch
+```
+
+3. Install the additional dependencies:
+
+```
+$ cd MonoScene/
+$ pip install -r requirements.txt
+```
+
+4. Install tbb:
+
+```
+$ conda install tbb
+```
+
+5. Finally, install MonoScene:
+
+```
+$ pip install -e ./
+```
+
+
+## Datasets
+
+### SemanticKITTI
+
+1. Download the Full Semantic Scene Completion dataset (v1.1) from the [SemanticKITTI website](http://www.semantic-kitti.org/dataset.html).
+
+2. Create a folder to store the SemanticKITTI preprocessed data at `/path/to/kitti/preprocess/folder`.
+
+3. Store the paths in environment variables for faster access (**Note: the 'dataset' folder is inside /path/to/semantic_kitti**):
+
+```
+$ export KITTI_PREPROCESS=/path/to/kitti/preprocess/folder
+$ export KITTI_ROOT=/path/to/semantic_kitti
+```
+
+4. Preprocess the data to generate labels at a lower scale, which are used to compute the ground truth relation matrices:
+
+```
+$ cd MonoScene/
+$ python monoscene/data/semantic_kitti/preprocess.py kitti_root=$KITTI_ROOT kitti_preprocess_root=$KITTI_PREPROCESS
+```
+
+### NYUv2
+
+1. Download the NYUv2 dataset using [this script](https://github.com/shurans/sscnet/blob/master/download_data.sh).
+
+2. Create a folder to store the NYUv2 preprocessed data at `/path/to/NYU/preprocess/folder`.
+
+3. Store the paths in environment variables for faster access:
+
+```
+$ export NYU_PREPROCESS=/path/to/NYU/preprocess/folder
+$ export NYU_ROOT=/path/to/NYU/depthbin
+```
+
+4. Preprocess the data to generate labels at a lower scale, which are used to compute the ground truth relation matrices:
+
+```
+$ cd MonoScene/
+$ python monoscene/data/NYU/preprocess.py NYU_root=$NYU_ROOT NYU_preprocess_root=$NYU_PREPROCESS
+```
+
+### KITTI-360
+
+1. We only perform inference on KITTI-360. You can download either the **Perspective Images for Train & Val (128G)** or the **Perspective Images for Test (1.5G)** at [http://www.cvlibs.net/datasets/kitti-360/download.php](http://www.cvlibs.net/datasets/kitti-360/download.php).
+
+2. Create a folder to store the KITTI-360 data at `/path/to/KITTI-360/folder`.
+
+3. Store the path in an environment variable for faster access:
+
+```
+$ export KITTI_360_ROOT=/path/to/KITTI-360
+```
+
+## Pretrained models
+
+Download the MonoScene pretrained models [on SemanticKITTI](https://ln5.sync.com/dl/af5cebf20/wzyre4yr-2tbj4s2a-rk9pqtk7-6rnujurg) and [on NYUv2](https://ln5.sync.com/dl/f4646c5b0/24anmiua-rp5pnie8-jwbrwax5-smxn5fqi), then put them in the folder `/path/to/MonoScene/trained_models`.
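+
+For a quick sanity check of a downloaded checkpoint, the snippet below prints its top-level keys. It only assumes a standard PyTorch Lightning `.ckpt` file; the filename is a placeholder for whichever model you downloaded.
+
+```
+import torch
+
+# Placeholder filename -- point this at the checkpoint you downloaded.
+ckpt = torch.load("trained_models/monoscene_kitti.ckpt", map_location="cpu")
+print(list(ckpt.keys()))        # typically includes 'state_dict', 'epoch', ...
+print(len(ckpt["state_dict"]))  # number of saved parameter tensors
+```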
+
+
+# Running MonoScene
+
+## Training
+
+To train MonoScene on SemanticKITTI or NYUv2, follow the steps below.
+
+### SemanticKITTI
+
+1. Create a folder to store the training logs at **/path/to/kitti/logdir**.
+
+2. Store the path in an environment variable:
+
+```
+$ export KITTI_LOG=/path/to/kitti/logdir
+```
+
+3. Train MonoScene on SemanticKITTI using 4 GPUs with a batch size of 4 (1 item per GPU):
+
+```
+$ cd MonoScene/
+$ python monoscene/scripts/train_monoscene.py \
+    dataset=kitti \
+    enable_log=true \
+    kitti_root=$KITTI_ROOT \
+    kitti_preprocess_root=$KITTI_PREPROCESS \
+    kitti_logdir=$KITTI_LOG \
+    n_gpus=4 batch_size=4
+```
+
+### NYUv2
+
+1. Create a folder to store the training logs at **/path/to/NYU/logdir**.
+
+2. Store the path in an environment variable:
+
+```
+$ export NYU_LOG=/path/to/NYU/logdir
+```
+
+3. Train MonoScene on NYUv2 using 2 GPUs with a batch size of 4 (2 items per GPU):
+
+```
+$ cd MonoScene/
+$ python monoscene/scripts/train_monoscene.py \
+    dataset=NYU \
+    NYU_root=$NYU_ROOT \
+    NYU_preprocess_root=$NYU_PREPROCESS \
+    logdir=$NYU_LOG \
+    n_gpus=2 batch_size=4
+```
+
+
+## Evaluating
+
+### SemanticKITTI
+
+To evaluate MonoScene on the SemanticKITTI validation set, type:
+
+```
+$ cd MonoScene/
+$ python monoscene/scripts/eval_monoscene.py \
+    dataset=kitti \
+    kitti_root=$KITTI_ROOT \
+    kitti_preprocess_root=$KITTI_PREPROCESS \
+    n_gpus=1 batch_size=1
+```
+
+### NYUv2
+
+To evaluate MonoScene on the NYUv2 test set, type:
+
+```
+$ cd MonoScene/
+$ python monoscene/scripts/eval_monoscene.py \
+    dataset=NYU \
+    NYU_root=$NYU_ROOT \
+    NYU_preprocess_root=$NYU_PREPROCESS \
+    n_gpus=1 batch_size=1
+```
+
+# Inference & Visualization
+
+## Inference
+
+Create a folder **/path/to/monoscene/output** to store the MonoScene outputs and store its path in an environment variable:
+
+```
+$ export MONOSCENE_OUTPUT=/path/to/monoscene/output
+```
+
+### NYUv2
+
+To generate the predictions on the NYUv2 test set, type:
+
+```
+$ cd MonoScene/
+$ python monoscene/scripts/generate_output.py \
+    +output_path=$MONOSCENE_OUTPUT \
+    dataset=NYU \
+    NYU_root=$NYU_ROOT \
+    NYU_preprocess_root=$NYU_PREPROCESS \
+    n_gpus=1 batch_size=1
+```
+
+### Semantic KITTI
+
+To generate the predictions on the Semantic KITTI validation set, type:
+
+```
+$ cd MonoScene/
+$ python monoscene/scripts/generate_output.py \
+    +output_path=$MONOSCENE_OUTPUT \
+    dataset=kitti \
+    kitti_root=$KITTI_ROOT \
+    kitti_preprocess_root=$KITTI_PREPROCESS \
+    n_gpus=1 batch_size=1
+```
+
+### KITTI-360
+
+Here we use the sequence **2013_05_28_drive_0009_sync**; you can use other sequences. To generate the predictions on KITTI-360, type:
+
+```
+$ cd MonoScene/
+$ python monoscene/scripts/generate_output.py \
+    +output_path=/path/to/monoscene/output \
+    dataset=kitti_360 \
+    +kitti_360_root=$KITTI_360_ROOT \
+    +kitti_360_sequence=2013_05_28_drive_0009_sync \
+    n_gpus=1 batch_size=1
+```
+
+## Visualization
+
+We use mayavi to visualize the predictions. Please install mayavi following the [official installation instructions](https://docs.enthought.com/mayavi/mayavi/installation.html). Then, use the following commands to visualize the outputs on the respective datasets.
+
+### NYUv2
+
+```
+$ cd MonoScene/
+$ python monoscene/scripts/visualization/NYU_vis_pred.py +file=/path/to/output/file.pkl
+```
+
+### Semantic KITTI
+
+```
+$ cd MonoScene/
+$ python monoscene/scripts/visualization/kitti_vis_pred.py +file=/path/to/output/file.pkl +dataset=kitti
+```
+
+### KITTI-360
+
+```
+$ cd MonoScene/
+$ python monoscene/scripts/visualization/kitti_vis_pred.py +file=/path/to/output/file.pkl +dataset=kitti_360
+```
+
+
+# License
+MonoScene is released under the [Apache 2.0 license](./LICENSE).
diff --git a/monoscene/config/monoscene.yaml b/monoscene/config/monoscene.yaml new file mode 100644 index 0000000..f2eafe5 --- /dev/null +++ b/monoscene/config/monoscene.yaml @@ -0,0 +1,39 @@ +#dataset: "NYU" # "kitti", "kitti_360" +dataset: "kitti_360" + +n_relations: 4 + +enable_log: false +kitti_root: '/path/to/semantic_kitti' +kitti_preprocess_root: '/path/to/kitti/preprocess/folder' +kitti_logdir: '/path/to/semantic_kitti/logdir' + +NYU_root: '/path/to/NYU/depthbin' +NYU_preprocess_root: '/path/to/NYU/preprocess/folder' +logdir: '/path/to/NYU/logdir' + + +fp_loss: true +frustum_size: 8 +batch_size: 1 +n_gpus: 1 +num_workers_per_gpu: 3 +exp_prefix: "exp" +run: 1 +lr: 1e-4 +weight_decay: 1e-4 + +context_prior: true + +relation_loss: true +CE_ssc_loss: true +sem_scal_loss: true +geo_scal_loss: true + +project_1_2: true +project_1_4: true +project_1_8: true + + + + diff --git a/monoscene/data/NYU/collate.py b/monoscene/data/NYU/collate.py new file mode 100644 index 0000000..e8db095 --- /dev/null +++ b/monoscene/data/NYU/collate.py @@ -0,0 +1,50 @@ +import torch + + +def collate_fn(batch): + data = {} + imgs = [] + targets = [] + names = [] + cam_poses = [] + + vox_origins = [] + cam_ks = [] + + CP_mega_matrices = [] + + data["projected_pix_1"] = [] + data["fov_mask_1"] = [] + data["frustums_masks"] = [] + data["frustums_class_dists"] = [] + + for idx, input_dict in enumerate(batch): + CP_mega_matrices.append(torch.from_numpy(input_dict["CP_mega_matrix"])) + for key in data: + if key in input_dict: + data[key].append(torch.from_numpy(input_dict[key])) + + cam_ks.append(torch.from_numpy(input_dict["cam_k"]).double()) + cam_poses.append(torch.from_numpy(input_dict["cam_pose"]).float()) + vox_origins.append(torch.from_numpy(input_dict["voxel_origin"]).double()) + + names.append(input_dict["name"]) + + img = input_dict["img"] + imgs.append(img) + + target = torch.from_numpy(input_dict["target"]) + targets.append(target) + + ret_data = { + "CP_mega_matrices": CP_mega_matrices, + "cam_pose": torch.stack(cam_poses), + "cam_k": torch.stack(cam_ks), + "vox_origin": torch.stack(vox_origins), + "name": names, + "img": torch.stack(imgs), + "target": torch.stack(targets), + } + for key in data: + ret_data[key] = data[key] + return ret_data diff --git a/monoscene/data/NYU/nyu_dataset.py b/monoscene/data/NYU/nyu_dataset.py new file mode 100644 index 0000000..950226e --- /dev/null +++ b/monoscene/data/NYU/nyu_dataset.py @@ -0,0 +1,132 @@ +import torch +import os +import glob +from torch.utils.data import Dataset +import numpy as np +from PIL import Image +from torchvision import transforms +from monoscene.data.utils.helpers import ( + vox2pix, + compute_local_frustums, + compute_CP_mega_matrix, +) +import pickle +import torch.nn.functional as F + + +class NYUDataset(Dataset): + def __init__( + self, + split, + root, + preprocess_root, + n_relations=4, + color_jitter=None, + frustum_size=4, + fliplr=0.0, + ): + self.n_relations = n_relations + self.frustum_size = frustum_size + self.n_classes = 12 + self.root = os.path.join(root, "NYU" + split) + self.preprocess_root = preprocess_root + self.base_dir = os.path.join(preprocess_root, "base", "NYU" + split) + self.fliplr = fliplr + + self.voxel_size = 0.08 # 0.08m + self.scene_size = (4.8, 4.8, 2.88) # (4.8m, 4.8m, 2.88m) + self.img_W = 640 + self.img_H = 480 + self.cam_k = np.array([[518.8579, 0, 320], [0, 518.8579, 240], [0, 0, 1]]) + + self.color_jitter = ( + transforms.ColorJitter(*color_jitter) if color_jitter else None + ) + + self.scan_names = 
glob.glob(os.path.join(self.root, "*.bin")) + + self.normalize_rgb = transforms.Compose( + [ + transforms.ToTensor(), + transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] + ), + ] + ) + + def __getitem__(self, index): + file_path = self.scan_names[index] + filename = os.path.basename(file_path) + name = filename[:-4] + + os.makedirs(self.base_dir, exist_ok=True) + filepath = os.path.join(self.base_dir, name + ".pkl") + + with open(filepath, "rb") as handle: + data = pickle.load(handle) + + cam_pose = data["cam_pose"] + T_world_2_cam = np.linalg.inv(cam_pose) + vox_origin = data["voxel_origin"] + data["cam_k"] = self.cam_k + target = data[ + "target_1_4" + ] # Following SSC literature, the output resolution on NYUv2 is set to 1:4 + data["target"] = target + target_1_4 = data["target_1_16"] + + CP_mega_matrix = compute_CP_mega_matrix( + target_1_4, is_binary=self.n_relations == 2 + ) + data["CP_mega_matrix"] = CP_mega_matrix + + # compute the 3D-2D mapping + projected_pix, fov_mask, sensor_distance = vox2pix( + T_world_2_cam, + self.cam_k, + vox_origin, + self.voxel_size, + self.img_W, + self.img_H, + self.scene_size, + ) + data["projected_pix_1"] = projected_pix + data["fov_mask_1"] = fov_mask + + # compute the masks, each indicates voxels inside a frustum + frustums_masks, frustums_class_dists = compute_local_frustums( + projected_pix, + sensor_distance, + target, + self.img_W, + self.img_H, + dataset="NYU", + n_classes=12, + size=self.frustum_size, + ) + data["frustums_masks"] = frustums_masks + data["frustums_class_dists"] = frustums_class_dists + + rgb_path = os.path.join(self.root, name + "_color.jpg") + img = Image.open(rgb_path).convert("RGB") + + # Image augmentation + if self.color_jitter is not None: + img = self.color_jitter(img) + + # PIL to numpy + img = np.array(img, dtype=np.float32, copy=False) / 255.0 + + # randomly fliplr the image + if np.random.rand() < self.fliplr: + img = np.ascontiguousarray(np.fliplr(img)) + data["projected_pix_1"][:, 0] = ( + img.shape[1] - 1 - data["projected_pix_1"][:, 0] + ) + + data["img"] = self.normalize_rgb(img) # (3, img_H, img_W) + + return data + + def __len__(self): + return len(self.scan_names) diff --git a/monoscene/data/NYU/nyu_dm.py b/monoscene/data/NYU/nyu_dm.py new file mode 100644 index 0000000..50d12d1 --- /dev/null +++ b/monoscene/data/NYU/nyu_dm.py @@ -0,0 +1,78 @@ +from torch.utils.data.dataloader import DataLoader +from monoscene.data.NYU.nyu_dataset import NYUDataset +from monoscene.data.NYU.collate import collate_fn +import pytorch_lightning as pl +from monoscene.data.utils.torch_util import worker_init_fn + + +class NYUDataModule(pl.LightningDataModule): + def __init__( + self, + root, + preprocess_root, + n_relations=4, + batch_size=4, + frustum_size=4, + num_workers=6, + ): + super().__init__() + self.n_relations = n_relations + self.preprocess_root = preprocess_root + self.root = root + self.batch_size = batch_size + self.num_workers = num_workers + self.frustum_size = frustum_size + + def setup(self): + self.train_ds = NYUDataset( + split="train", + preprocess_root=self.preprocess_root, + n_relations=self.n_relations, + root=self.root, + fliplr=0.5, + frustum_size=self.frustum_size, + color_jitter=(0.4, 0.4, 0.4), + ) + self.test_ds = NYUDataset( + split="test", + preprocess_root=self.preprocess_root, + n_relations=self.n_relations, + root=self.root, + frustum_size=self.frustum_size, + fliplr=0.0, + color_jitter=None, + ) + + def train_dataloader(self): + return DataLoader( + self.train_ds, + 
batch_size=self.batch_size, + drop_last=True, + num_workers=self.num_workers, + shuffle=True, + pin_memory=True, + worker_init_fn=worker_init_fn, + collate_fn=collate_fn, + ) + + def val_dataloader(self): + return DataLoader( + self.test_ds, + batch_size=self.batch_size, + num_workers=self.num_workers, + drop_last=False, + shuffle=False, + pin_memory=True, + collate_fn=collate_fn, + ) + + def test_dataloader(self): + return DataLoader( + self.test_ds, + batch_size=self.batch_size, + num_workers=self.num_workers, + drop_last=False, + shuffle=False, + pin_memory=True, + collate_fn=collate_fn, + ) diff --git a/monoscene/data/NYU/params.py b/monoscene/data/NYU/params.py new file mode 100644 index 0000000..332e19e --- /dev/null +++ b/monoscene/data/NYU/params.py @@ -0,0 +1,54 @@ +import torch +import numpy as np + +NYU_class_names = [ + "empty", + "ceiling", + "floor", + "wall", + "window", + "chair", + "bed", + "sofa", + "table", + "tvs", + "furn", + "objs", +] +class_weights = torch.FloatTensor([0.05, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]) + +class_freq_1_4 = np.array( + [ + 43744234, + 80205, + 1070052, + 905632, + 116952, + 180994, + 436852, + 279714, + 254611, + 28247, + 1805949, + 850724, + ] +) +class_freq_1_8 = np.array( + [ + 5176253, + 17277, + 220105, + 183849, + 21827, + 33520, + 67022, + 44248, + 46615, + 4419, + 290218, + 142573, + ] +) +class_freq_1_16 = np.array( + [587620, 3820, 46836, 36256, 4241, 5978, 10939, 8000, 8224, 781, 49778, 25864] +) diff --git a/monoscene/data/NYU/preprocess.py b/monoscene/data/NYU/preprocess.py new file mode 100644 index 0000000..83f7aa2 --- /dev/null +++ b/monoscene/data/NYU/preprocess.py @@ -0,0 +1,182 @@ +import numpy as np +from tqdm import tqdm +import numpy.matlib +import os +import glob +import pickle +import hydra +from omegaconf import DictConfig + + +seg_class_map = [ + 0, + 1, + 2, + 3, + 4, + 11, + 5, + 6, + 7, + 8, + 8, + 10, + 10, + 10, + 11, + 11, + 9, + 8, + 11, + 11, + 11, + 11, + 11, + 11, + 11, + 11, + 11, + 10, + 10, + 11, + 8, + 10, + 11, + 9, + 11, + 11, + 11, +] + + +def _rle2voxel(rle, voxel_size=(240, 144, 240), rle_filename=""): + r"""Read voxel label data from file (RLE compression), and convert it to fully occupancy labeled voxels. + code taken from https://github.com/waterljwant/SSC/blob/master/dataloaders/dataloader.py#L172 + In the data loader of pytorch, only single thread is allowed. + For multi-threads version and more details, see 'readRLE.py'. 
+ output: seg_label: 3D numpy array, size 240 x 144 x 240 + """ + seg_label = np.zeros( + int(voxel_size[0] * voxel_size[1] * voxel_size[2]), dtype=np.uint8 + ) # segmentation label + vox_idx = 0 + for idx in range(int(rle.shape[0] / 2)): + check_val = rle[idx * 2] + check_iter = rle[idx * 2 + 1] + if check_val >= 37 and check_val != 255: # 37 classes to 12 classes + print("RLE {} check_val: {}".format(rle_filename, check_val)) + seg_label_val = ( + seg_class_map[check_val] if check_val != 255 else 255 + ) # 37 classes to 12 classes + seg_label[vox_idx : vox_idx + check_iter] = np.matlib.repmat( + seg_label_val, 1, check_iter + ) + vox_idx = vox_idx + check_iter + seg_label = seg_label.reshape(voxel_size) # 3D array, size 240 x 144 x 240 + return seg_label + + +def _read_rle(rle_filename): # 0.0005s + """Read RLE compression data + code taken from https://github.com/waterljwant/SSC/blob/master/dataloaders/dataloader.py#L153 + Return: + vox_origin, + cam_pose, + vox_rle, voxel label data from file + Shape: + vox_rle, (240, 144, 240) + """ + fid = open(rle_filename, "rb") + vox_origin = np.fromfile( + fid, np.float32, 3 + ).T # Read voxel origin in world coordinates + cam_pose = np.fromfile(fid, np.float32, 16).reshape((4, 4)) # Read camera pose + vox_rle = ( + np.fromfile(fid, np.uint32).reshape((-1, 1)).T + ) # Read voxel label data from file + vox_rle = np.squeeze(vox_rle) # 2d array: (1 x N), to 1d array: (N , ) + fid.close() + return vox_origin, cam_pose, vox_rle + + +def _downsample_label(label, voxel_size=(240, 144, 240), downscale=4): + r"""downsample the labeled data, + code taken from https://github.com/waterljwant/SSC/blob/master/dataloaders/dataloader.py#L262 + Shape: + label, (240, 144, 240) + label_downscale, if downsample==4, then (60, 36, 60) + """ + if downscale == 1: + return label + ds = downscale + small_size = ( + voxel_size[0] // ds, + voxel_size[1] // ds, + voxel_size[2] // ds, + ) # small size + label_downscale = np.zeros(small_size, dtype=np.uint8) + empty_t = 0.95 * ds * ds * ds # threshold + s01 = small_size[0] * small_size[1] + label_i = np.zeros((ds, ds, ds), dtype=np.int32) + + for i in range(small_size[0] * small_size[1] * small_size[2]): + z = int(i / s01) + y = int((i - z * s01) / small_size[0]) + x = int(i - z * s01 - y * small_size[0]) + + label_i[:, :, :] = label[ + x * ds : (x + 1) * ds, y * ds : (y + 1) * ds, z * ds : (z + 1) * ds + ] + label_bin = label_i.flatten() + + zero_count_0 = np.array(np.where(label_bin == 0)).size + zero_count_255 = np.array(np.where(label_bin == 255)).size + + zero_count = zero_count_0 + zero_count_255 + if zero_count > empty_t: + label_downscale[x, y, z] = 0 if zero_count_0 > zero_count_255 else 255 + else: + label_i_s = label_bin[ + np.where(np.logical_and(label_bin > 0, label_bin < 255)) + ] + label_downscale[x, y, z] = np.argmax(np.bincount(label_i_s)) + return label_downscale + + +@hydra.main(config_name="../../config/monoscene.yaml") +def main(config: DictConfig): + scene_size = (240, 144, 240) + for split in ["train", "test"]: + root = os.path.join(config.NYU_root, "NYU" + split) + base_dir = os.path.join(config.NYU_preprocess_root, "base", "NYU" + split) + os.makedirs(base_dir, exist_ok=True) + + scans = glob.glob(os.path.join(root, "*.bin")) + for scan in tqdm(scans): + filename = os.path.basename(scan) + name = filename[:-4] + filepath = os.path.join(base_dir, name + ".pkl") + if os.path.exists(filepath): + continue + + vox_origin, cam_pose, rle = _read_rle(scan) + + target_1_1 = _rle2voxel(rle, scene_size, scan) + 
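+            # Downsample the full-resolution 240 x 144 x 240 label grid to the 1/4 and
+            # 1/16 scales used as training targets (majority vote over occupied voxels).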
target_1_4 = _downsample_label(target_1_1, scene_size, 4) + target_1_16 = _downsample_label(target_1_1, scene_size, 16) + + data = { + "cam_pose": cam_pose, + "voxel_origin": vox_origin, + "name": name, + "target_1_4": target_1_4, + "target_1_16": target_1_16, + } + + with open(filepath, "wb") as handle: + pickle.dump(data, handle) + print("wrote to", filepath) + + +if __name__ == "__main__": + main() diff --git a/monoscene/data/kitti_360/collate.py b/monoscene/data/kitti_360/collate.py new file mode 100644 index 0000000..59ea29d --- /dev/null +++ b/monoscene/data/kitti_360/collate.py @@ -0,0 +1,47 @@ +import torch + + +def collate_fn(batch): + data = {} + imgs = [] + frame_ids = [] + img_paths = [] + sequences = [] + + cam_ks = [] + T_velo_2_cams = [] + + scale_3ds = batch[0]["scale_3ds"] + for scale_3d in scale_3ds: + data["projected_pix_{}".format(scale_3d)] = [] + data["fov_mask_{}".format(scale_3d)] = [] + + for _, input_dict in enumerate(batch): + if "img_path" in input_dict: + img_paths.append(input_dict["img_path"]) + + for key in data: + data[key].append(torch.from_numpy(input_dict[key])) + + cam_ks.append(torch.from_numpy(input_dict["cam_k"]).float()) + T_velo_2_cams.append(torch.from_numpy(input_dict["T_velo_2_cam"]).float()) + + sequences.append(input_dict["sequence"]) + + img = input_dict["img"] + imgs.append(img) + + frame_ids.append(input_dict["frame_id"]) + + ret_data = { + "sequence": sequences, + "frame_id": frame_ids, + "cam_k": cam_ks, + "T_velo_2_cam": T_velo_2_cams, + "img": torch.stack(imgs), + "img_path": img_paths, + } + for key in data: + ret_data[key] = data[key] + + return ret_data diff --git a/monoscene/data/kitti_360/kitti_360_dataset.py b/monoscene/data/kitti_360/kitti_360_dataset.py new file mode 100644 index 0000000..566c2ec --- /dev/null +++ b/monoscene/data/kitti_360/kitti_360_dataset.py @@ -0,0 +1,125 @@ +import torch +import os +import glob +from torch.utils.data import Dataset +import numpy as np +from monoscene.data.utils.helpers import vox2pix +from PIL import Image +from torchvision import transforms + + +class Kitti360Dataset(Dataset): + def __init__(self, root, sequences, n_scans): + """ + Paramters + -------- + root: str + Path to KITTI-360 dataset i.e. contain sequences such as 2013_05_28_drive_0009_sync + sequence: str + KITTI-360 sequence e.g. 
2013_05_28_drive_0009_sync + n_scans: int + Only use the first n_scans since KITTI-360 sequence is very long + """ + self.root = root + self.img_H = 376 + self.img_W = 1408 + self.project_scale = 2 + self.output_scale = 1 + self.voxel_size = 0.2 + self.vox_origin = np.array([0, -25.6, -2]) + self.scene_size = (51.2, 51.2, 6.4) + self.T_velo_2_cam = self.get_velo2cam() + self.cam_k = self.get_cam_k() + self.scans = [] + for sequence in sequences: + glob_path = os.path.join( + self.root, "data_2d_raw", sequence, "image_00/data_rect", "*.png" + ) + for img_path in glob.glob(glob_path): + self.scans.append({"img_path": img_path, "sequence": sequence}) + self.scans = self.scans[:n_scans] + self.normalize_rgb = transforms.Compose( + [ + transforms.ToTensor(), + transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] + ), + ] + ) + + def __len__(self): + return len(self.scans) + + def get_cam_k(self): + cam_k = np.array( + [ + 552.554261, + 0.000000, + 682.049453, + 0.000000, + 0.000000, + 552.554261, + 238.769549, + 0.000000, + 0.000000, + 0.000000, + 1.000000, + 0.000000, + ] + ).reshape(3, 4) + return cam_k[:3, :3] + + def get_velo2cam(self): + cam2velo = np.array( + [ + 0.04307104361, + -0.08829286498, + 0.995162929, + 0.8043914418, + -0.999004371, + 0.007784614041, + 0.04392796942, + 0.2993489574, + -0.01162548558, + -0.9960641394, + -0.08786966659, + -0.1770225824, + ] + ).reshape(3, 4) + cam2velo = np.concatenate( + [cam2velo, np.array([0, 0, 0, 1]).reshape(1, 4)], axis=0 + ) + return np.linalg.inv(cam2velo) + + def __getitem__(self, index): + data = {"T_velo_2_cam": self.T_velo_2_cam, "cam_k": self.cam_k} + scan = self.scans[index] + img_path = scan["img_path"] + sequence = scan["sequence"] + filename = os.path.basename(img_path) + frame_id = os.path.splitext(filename)[0] + data["frame_id"] = frame_id + data["img_path"] = img_path + data["sequence"] = sequence + + img = Image.open(img_path).convert("RGB") + img = np.array(img, dtype=np.float32, copy=False) / 255.0 + img = self.normalize_rgb(img) + data["img"] = img + + scale_3ds = [self.project_scale, self.output_scale] + data["scale_3ds"] = scale_3ds + T_cam_2_velo = np.linalg.inv(self.T_velo_2_cam) + for scale_3d in scale_3ds: + projected_pix, fov_mask, _ = vox2pix( + T_cam_2_velo, + self.cam_k, + self.vox_origin, + self.voxel_size * scale_3d, + self.img_W, + self.img_H, + self.scene_size, + ) + data["projected_pix_{}".format(scale_3d)] = projected_pix + data["fov_mask_{}".format(scale_3d)] = fov_mask + return data diff --git a/monoscene/data/kitti_360/kitti_360_dm.py b/monoscene/data/kitti_360/kitti_360_dm.py new file mode 100644 index 0000000..0743de9 --- /dev/null +++ b/monoscene/data/kitti_360/kitti_360_dm.py @@ -0,0 +1,32 @@ +from torch.utils.data.dataloader import DataLoader +from monoscene.data.kitti_360.kitti_360_dataset import Kitti360Dataset +import pytorch_lightning as pl +from monoscene.data.kitti_360.collate import collate_fn +from monoscene.data.utils.torch_util import worker_init_fn + + +class Kitti360DataModule(pl.LightningDataModule): + def __init__(self, root, sequences, n_scans, batch_size=4, num_workers=3): + super().__init__() + self.root = root + self.batch_size = batch_size + self.num_workers = num_workers + self.sequences = sequences + self.n_scans = n_scans + + def setup(self, stage=None): + self.ds = Kitti360Dataset( + root=self.root, sequences=self.sequences, n_scans=self.n_scans + ) + + def dataloader(self): + return DataLoader( + self.ds, + batch_size=self.batch_size, + drop_last=False, 
+ num_workers=self.num_workers, + shuffle=False, + pin_memory=True, + worker_init_fn=worker_init_fn, + collate_fn=collate_fn, + ) diff --git a/monoscene/data/semantic_kitti/collate.py b/monoscene/data/semantic_kitti/collate.py new file mode 100644 index 0000000..eecb8e7 --- /dev/null +++ b/monoscene/data/semantic_kitti/collate.py @@ -0,0 +1,59 @@ +import torch + + +def collate_fn(batch): + data = {} + imgs = [] + CP_mega_matrices = [] + targets = [] + frame_ids = [] + sequences = [] + + cam_ks = [] + T_velo_2_cams = [] + frustums_masks = [] + frustums_class_dists = [] + + scale_3ds = batch[0]["scale_3ds"] + for scale_3d in scale_3ds: + data["projected_pix_{}".format(scale_3d)] = [] + data["fov_mask_{}".format(scale_3d)] = [] + + for idx, input_dict in enumerate(batch): + cam_ks.append(torch.from_numpy(input_dict["cam_k"]).double()) + T_velo_2_cams.append(torch.from_numpy(input_dict["T_velo_2_cam"]).float()) + + if "frustums_masks" in input_dict: + frustums_masks.append(torch.from_numpy(input_dict["frustums_masks"])) + frustums_class_dists.append( + torch.from_numpy(input_dict["frustums_class_dists"]).float() + ) + + for key in data: + data[key].append(torch.from_numpy(input_dict[key])) + + img = input_dict["img"] + imgs.append(img) + + frame_ids.append(input_dict["frame_id"]) + sequences.append(input_dict["sequence"]) + + ret_data = { + "frame_id": frame_ids, + "sequence": sequences, + "frustums_class_dists": frustums_class_dists, + "frustums_masks": frustums_masks, + "cam_k": cam_ks, + "T_velo_2_cam": T_velo_2_cams, + "img": torch.stack(imgs), + } + if "target" in input_dict: + target = torch.from_numpy(input_dict["target"]) + targets.append(target) + CP_mega_matrices.append(torch.from_numpy(input_dict["CP_mega_matrix"])) + ret_data["CP_mega_matrices"] = CP_mega_matrices + ret_data["target"] = torch.stack(targets) + + for key in data: + ret_data[key] = data[key] + return ret_data diff --git a/monoscene/data/semantic_kitti/io_data.py b/monoscene/data/semantic_kitti/io_data.py new file mode 100644 index 0000000..5cf5ffd --- /dev/null +++ b/monoscene/data/semantic_kitti/io_data.py @@ -0,0 +1,239 @@ +""" +Most of the code in this file is taken from https://github.com/cv-rits/LMSCNet/blob/main/LMSCNet/data/io_data.py +""" + +import numpy as np +import yaml +import imageio + + +def unpack(compressed): + ''' given a bit encoded voxel grid, make a normal voxel grid out of it. ''' + uncompressed = np.zeros(compressed.shape[0] * 8, dtype=np.uint8) + uncompressed[::8] = compressed[:] >> 7 & 1 + uncompressed[1::8] = compressed[:] >> 6 & 1 + uncompressed[2::8] = compressed[:] >> 5 & 1 + uncompressed[3::8] = compressed[:] >> 4 & 1 + uncompressed[4::8] = compressed[:] >> 3 & 1 + uncompressed[5::8] = compressed[:] >> 2 & 1 + uncompressed[6::8] = compressed[:] >> 1 & 1 + uncompressed[7::8] = compressed[:] & 1 + + return uncompressed + + +def img_normalize(img, mean, std): + img = img.astype(np.float32) / 255.0 + img = img - mean + img = img / std + + return img + + +def pack(array): + """ convert a boolean array into a bitwise array. """ + array = array.reshape((-1)) + + #compressing bit flags. + # yapf: disable + compressed = array[::8] << 7 | array[1::8] << 6 | array[2::8] << 5 | array[3::8] << 4 | array[4::8] << 3 | array[5::8] << 2 | array[6::8] << 1 | array[7::8] + # yapf: enable + + return np.array(compressed, dtype=np.uint8) + + +def get_grid_coords(dims, resolution): + ''' + :param dims: the dimensions of the grid [x, y, z] (i.e. 
[256, 256, 32]) + :return coords_grid: is the center coords of voxels in the grid + ''' + + # The sensor in centered in X (we go to dims/2 + 1 for the histogramdd) + g_xx = np.arange(-dims[0]/2, dims[0]/2 + 1) + # The sensor is in Y=0 (we go to dims + 1 for the histogramdd) + g_yy = np.arange(0, dims[1] + 1) + # The sensor is in Z=1.73. I observed that the ground was to voxel levels above the grid bottom, so Z pose is at 10 + # if bottom voxel is 0. If we want the sensor to be at (0, 0, 0), then the bottom in z is -10, top is 22 + # (we go to 22 + 1 for the histogramdd) + # ATTENTION.. Is 11 for old grids.. 10 for new grids (v1.1) (https://github.com/PRBonn/semantic-kitti-api/issues/49) + sensor_pose = 10 + g_zz = np.arange(0 - sensor_pose, dims[2] - sensor_pose + 1) + + # Obtaining the grid with coords... + xx, yy, zz = np.meshgrid(g_xx[:-1], g_yy[:-1], g_zz[:-1]) + coords_grid = np.array([xx.flatten(), yy.flatten(), zz.flatten()]).T + coords_grid = coords_grid.astype(np.float) + + coords_grid = (coords_grid * resolution) + resolution/2 + + temp = np.copy(coords_grid) + temp[:, 0] = coords_grid[:, 1] + temp[:, 1] = coords_grid[:, 0] + coords_grid = np.copy(temp) + + return coords_grid, g_xx, g_yy, g_zz + + +def _get_remap_lut(config_path): + ''' + remap_lut to remap classes of semantic kitti for training... + :return: + ''' + + dataset_config = yaml.safe_load(open(config_path, 'r')) + # make lookup table for mapping + maxkey = max(dataset_config['learning_map'].keys()) + + # +100 hack making lut bigger just in case there are unknown labels + remap_lut = np.zeros((maxkey + 100), dtype=np.int32) + remap_lut[list(dataset_config['learning_map'].keys())] = list(dataset_config['learning_map'].values()) + + # in completion we have to distinguish empty and invalid voxels. + # Important: For voxels 0 corresponds to "empty" and not "unlabeled". + remap_lut[remap_lut == 0] = 255 # map 0 to 'invalid' + remap_lut[0] = 0 # only 'empty' stays 'empty'. + + return remap_lut + + +def get_inv_map(): + ''' + remap_lut to remap classes of semantic kitti for training... + :return: + ''' + config_path = "./semantic-kitti.yaml" + dataset_config = yaml.safe_load(open(config_path, 'r')) + # make lookup table for mapping + + inv_map = np.zeros(20, dtype=np.int32) + inv_map[list(dataset_config['learning_map_inv'].keys())] = list(dataset_config['learning_map_inv'].values()) + + return inv_map + +def _read_SemKITTI(path, dtype, do_unpack): + bin = np.fromfile(path, dtype=dtype) # Flattened array + if do_unpack: + bin = unpack(bin) + return bin + + +def _read_label_SemKITTI(path): + label = _read_SemKITTI(path, dtype=np.uint16, do_unpack=False).astype(np.float32) + return label + + +def _read_invalid_SemKITTI(path): + invalid = _read_SemKITTI(path, dtype=np.uint8, do_unpack=True) + return invalid + + +def _read_occluded_SemKITTI(path): + occluded = _read_SemKITTI(path, dtype=np.uint8, do_unpack=True) + return occluded + + +def _read_occupancy_SemKITTI(path): + occupancy = _read_SemKITTI(path, dtype=np.uint8, do_unpack=True).astype(np.float32) + return occupancy + + +def _read_rgb_SemKITTI(path): + rgb = np.asarray(imageio.imread(path)) + return rgb + + +def _read_pointcloud_SemKITTI(path): + 'Return pointcloud semantic kitti with remissions (x, y, z, intensity)' + pointcloud = _read_SemKITTI(path, dtype=np.float32, do_unpack=False) + pointcloud = pointcloud.reshape((-1, 4)) + return pointcloud + + +def _read_calib_SemKITTI(calib_path): + """ + :param calib_path: Path to a calibration text file. 
+ :return: dict with calibration matrices. + """ + calib_all = {} + with open(calib_path, 'r') as f: + for line in f.readlines(): + if line == '\n': + break + key, value = line.split(':', 1) + calib_all[key] = np.array([float(x) for x in value.split()]) + + # reshape matrices + calib_out = {} + calib_out['P2'] = calib_all['P2'].reshape(3, 4) # 3x4 projection matrix for left camera + calib_out['Tr'] = np.identity(4) # 4x4 matrix + calib_out['Tr'][:3, :4] = calib_all['Tr'].reshape(3, 4) + return calib_out + + +def get_remap_lut(path): + ''' + remap_lut to remap classes of semantic kitti for training... + :return: + ''' + + dataset_config = yaml.safe_load(open(path, 'r')) + + # make lookup table for mapping + maxkey = max(dataset_config['learning_map'].keys()) + + # +100 hack making lut bigger just in case there are unknown labels + remap_lut = np.zeros((maxkey + 100), dtype=np.int32) + remap_lut[list(dataset_config['learning_map'].keys())] = list(dataset_config['learning_map'].values()) + + # in completion we have to distinguish empty and invalid voxels. + # Important: For voxels 0 corresponds to "empty" and not "unlabeled". + remap_lut[remap_lut == 0] = 255 # map 0 to 'invalid' + remap_lut[0] = 0 # only 'empty' stays 'empty'. + + return remap_lut + + +def data_augmentation_3Dflips(flip, data): + # The .copy() is done to avoid negative strides of the numpy array caused by the way numpy manages the data + # into memory. This gives errors when trying to pass the array to torch sensors.. Solution seen in: + # https://discuss.pytorch.org/t/torch-from-numpy-not-support-negative-strides/3663 + # Dims -> {XZY} + # Flipping around the X axis... + if np.isclose(flip, 1): + data = np.flip(data, axis=0).copy() + + # Flipping around the Y axis... + if np.isclose(flip, 2): + data = np.flip(data, 2).copy() + + # Flipping around the X and the Y axis... 
+ if np.isclose(flip, 3): + data = np.flip(np.flip(data, axis=0), axis=2).copy() + + return data + + +def get_cmap_semanticKITTI20(): + colors = np.array([ + # [0 , 0 , 0, 255], + [100, 150, 245, 255], + [100, 230, 245, 255], + [30, 60, 150, 255], + [80, 30, 180, 255], + [100, 80, 250, 255], + [255, 30, 30, 255], + [255, 40, 200, 255], + [150, 30, 90, 255], + [255, 0, 255, 255], + [255, 150, 255, 255], + [75, 0, 75, 255], + [175, 0, 75, 255], + [255, 200, 0, 255], + [255, 120, 50, 255], + [0, 175, 0, 255], + [135, 60, 0, 255], + [150, 240, 80, 255], + [255, 240, 150, 255], + [255, 0, 0, 255]]).astype(np.uint8) + + return colors diff --git a/monoscene/data/semantic_kitti/kitti_dataset.py b/monoscene/data/semantic_kitti/kitti_dataset.py new file mode 100644 index 0000000..980b052 --- /dev/null +++ b/monoscene/data/semantic_kitti/kitti_dataset.py @@ -0,0 +1,200 @@ +import torch +import os +import glob +from torch.utils.data import Dataset +import numpy as np +from PIL import Image +from torchvision import transforms +from monoscene.data.utils.helpers import ( + vox2pix, + compute_local_frustums, + compute_CP_mega_matrix, +) + + +class KittiDataset(Dataset): + def __init__( + self, + split, + root, + preprocess_root, + project_scale=2, + frustum_size=4, + color_jitter=None, + fliplr=0.0, + ): + super().__init__() + self.root = root + self.label_root = os.path.join(preprocess_root, "labels") + self.n_classes = 20 + splits = { + "train": ["00", "01", "02", "03", "04", "05", "06", "07", "09", "10"], + "val": ["08"], + "test": ["11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21"], + } + self.split = split + self.sequences = splits[split] + self.frustum_size = frustum_size + self.project_scale = project_scale + self.output_scale = int(self.project_scale / 2) + self.scene_size = (51.2, 51.2, 6.4) + self.vox_origin = np.array([0, -25.6, -2]) + self.fliplr = fliplr + + self.voxel_size = 0.2 # 0.2m + self.img_W = 1220 + self.img_H = 370 + + self.color_jitter = ( + transforms.ColorJitter(*color_jitter) if color_jitter else None + ) + self.scans = [] + for sequence in self.sequences: + calib = self.read_calib( + os.path.join(self.root, "dataset", "sequences", sequence, "calib.txt") + ) + P = calib["P2"] + T_velo_2_cam = calib["Tr"] + proj_matrix = P @ T_velo_2_cam + + glob_path = os.path.join( + self.root, "dataset", "sequences", sequence, "voxels", "*.bin" + ) + for voxel_path in glob.glob(glob_path): + self.scans.append( + { + "sequence": sequence, + "P": P, + "T_velo_2_cam": T_velo_2_cam, + "proj_matrix": proj_matrix, + "voxel_path": voxel_path, + } + ) + + self.normalize_rgb = transforms.Compose( + [ + transforms.ToTensor(), + transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] + ), + ] + ) + + def __getitem__(self, index): + scan = self.scans[index] + voxel_path = scan["voxel_path"] + sequence = scan["sequence"] + P = scan["P"] + T_velo_2_cam = scan["T_velo_2_cam"] + proj_matrix = scan["proj_matrix"] + + filename = os.path.basename(voxel_path) + frame_id = os.path.splitext(filename)[0] + + rgb_path = os.path.join( + self.root, "dataset", "sequences", sequence, "image_2", frame_id + ".png" + ) + + data = { + "frame_id": frame_id, + "sequence": sequence, + "P": P, + "T_velo_2_cam": T_velo_2_cam, + "proj_matrix": proj_matrix, + } + scale_3ds = [self.output_scale, self.project_scale] + data["scale_3ds"] = scale_3ds + cam_k = P[0:3, 0:3] + data["cam_k"] = cam_k + for scale_3d in scale_3ds: + + # compute the 3D-2D mapping + projected_pix, fov_mask, sensor_distance = 
vox2pix( + T_velo_2_cam, + cam_k, + self.vox_origin, + self.voxel_size * scale_3d, + self.img_W, + self.img_H, + self.scene_size, + ) + + data["projected_pix_{}".format(scale_3d)] = projected_pix + data["sensor_distance_{}".format(scale_3d)] = sensor_distance + data["fov_mask_{}".format(scale_3d)] = fov_mask + + target_1_path = os.path.join(self.label_root, sequence, frame_id + "_1_1.npy") + target = np.load(target_1_path) + data["target"] = target + target_8_path = os.path.join(self.label_root, sequence, frame_id + "_1_8.npy") + target_1_8 = np.load(target_8_path) + CP_mega_matrix = compute_CP_mega_matrix(target_1_8) + data["CP_mega_matrix"] = CP_mega_matrix + + # Compute the masks, each indicate the voxels of a local frustum + if self.split != "test": + projected_pix_output = data["projected_pix_{}".format(self.output_scale)] + sensor_distance_output = data[ + "sensor_distance_{}".format(self.output_scale) + ] + frustums_masks, frustums_class_dists = compute_local_frustums( + projected_pix_output, + sensor_distance_output, + target, + self.img_W, + self.img_H, + dataset="kitti", + n_classes=20, + size=self.frustum_size, + ) + else: + frustums_masks = None + frustums_class_dists = None + data["frustums_masks"] = frustums_masks + data["frustums_class_dists"] = frustums_class_dists + + img = Image.open(rgb_path).convert("RGB") + + # Image augmentation + if self.color_jitter is not None: + img = self.color_jitter(img) + + # PIL to numpy + img = np.array(img, dtype=np.float32, copy=False) / 255.0 + img = img[:370, :1220, :] # crop image + + # Fliplr the image + if np.random.rand() < self.fliplr: + img = np.ascontiguousarray(np.fliplr(img)) + for scale in scale_3ds: + key = "projected_pix_" + str(scale) + data[key][:, 0] = img.shape[1] - 1 - data[key][:, 0] + + data["img"] = self.normalize_rgb(img) + return data + + def __len__(self): + return len(self.scans) + + @staticmethod + def read_calib(calib_path): + """ + Modify from https://github.com/utiasSTARS/pykitti/blob/d3e1bb81676e831886726cc5ed79ce1f049aef2c/pykitti/utils.py#L68 + :param calib_path: Path to a calibration text file. + :return: dict with calibration matrices. 
+ """ + calib_all = {} + with open(calib_path, "r") as f: + for line in f.readlines(): + if line == "\n": + break + key, value = line.split(":", 1) + calib_all[key] = np.array([float(x) for x in value.split()]) + + # reshape matrices + calib_out = {} + # 3x4 projection matrix for left camera + calib_out["P2"] = calib_all["P2"].reshape(3, 4) + calib_out["Tr"] = np.identity(4) # 4x4 matrix + calib_out["Tr"][:3, :4] = calib_all["Tr"].reshape(3, 4) + return calib_out diff --git a/monoscene/data/semantic_kitti/kitti_dm.py b/monoscene/data/semantic_kitti/kitti_dm.py new file mode 100644 index 0000000..e9dcb4d --- /dev/null +++ b/monoscene/data/semantic_kitti/kitti_dm.py @@ -0,0 +1,91 @@ +from torch.utils.data.dataloader import DataLoader +from monoscene.data.semantic_kitti.kitti_dataset import KittiDataset +import pytorch_lightning as pl +from monoscene.data.semantic_kitti.collate import collate_fn +from monoscene.data.utils.torch_util import worker_init_fn + + +class KittiDataModule(pl.LightningDataModule): + def __init__( + self, + root, + preprocess_root, + project_scale=2, + frustum_size=4, + batch_size=4, + num_workers=6, + ): + super().__init__() + self.root = root + self.preprocess_root = preprocess_root + self.project_scale = project_scale + self.batch_size = batch_size + self.num_workers = num_workers + self.frustum_size = frustum_size + + def setup(self, stage=None): + self.train_ds = KittiDataset( + split="train", + root=self.root, + preprocess_root=self.preprocess_root, + project_scale=self.project_scale, + frustum_size=self.frustum_size, + fliplr=0.5, + color_jitter=(0.4, 0.4, 0.4), + ) + + self.val_ds = KittiDataset( + split="val", + root=self.root, + preprocess_root=self.preprocess_root, + project_scale=self.project_scale, + frustum_size=self.frustum_size, + fliplr=0, + color_jitter=None, + ) + + self.test_ds = KittiDataset( + split="test", + root=self.root, + preprocess_root=self.preprocess_root, + project_scale=self.project_scale, + frustum_size=self.frustum_size, + fliplr=0, + color_jitter=None, + ) + + def train_dataloader(self): + return DataLoader( + self.train_ds, + batch_size=self.batch_size, + drop_last=True, + num_workers=self.num_workers, + shuffle=True, + pin_memory=True, + worker_init_fn=worker_init_fn, + collate_fn=collate_fn, + ) + + def val_dataloader(self): + return DataLoader( + self.val_ds, + batch_size=self.batch_size, + drop_last=False, + num_workers=self.num_workers, + shuffle=False, + pin_memory=True, + worker_init_fn=worker_init_fn, + collate_fn=collate_fn, + ) + + def test_dataloader(self): + return DataLoader( + self.test_ds, + batch_size=self.batch_size, + drop_last=False, + num_workers=self.num_workers, + shuffle=False, + pin_memory=True, + worker_init_fn=worker_init_fn, + collate_fn=collate_fn, + ) diff --git a/monoscene/data/semantic_kitti/params.py b/monoscene/data/semantic_kitti/params.py new file mode 100644 index 0000000..c58df35 --- /dev/null +++ b/monoscene/data/semantic_kitti/params.py @@ -0,0 +1,48 @@ +import numpy as np + +semantic_kitti_class_frequencies = np.array( + [ + 5.41773033e09, + 1.57835390e07, + 1.25136000e05, + 1.18809000e05, + 6.46799000e05, + 8.21951000e05, + 2.62978000e05, + 2.83696000e05, + 2.04750000e05, + 6.16887030e07, + 4.50296100e06, + 4.48836500e07, + 2.26992300e06, + 5.68402180e07, + 1.57196520e07, + 1.58442623e08, + 2.06162300e06, + 3.69705220e07, + 1.15198800e06, + 3.34146000e05, + ] +) +kitti_class_names = [ + "empty", + "car", + "bicycle", + "motorcycle", + "truck", + "other-vehicle", + "person", + "bicyclist", + 
"motorcyclist", + "road", + "parking", + "sidewalk", + "other-ground", + "building", + "fence", + "vegetation", + "trunk", + "terrain", + "pole", + "traffic-sign", +] diff --git a/monoscene/data/semantic_kitti/preprocess.py b/monoscene/data/semantic_kitti/preprocess.py new file mode 100644 index 0000000..31bf76c --- /dev/null +++ b/monoscene/data/semantic_kitti/preprocess.py @@ -0,0 +1,102 @@ +""" +Code partly taken from https://github.com/cv-rits/LMSCNet/blob/main/LMSCNet/data/labels_downscale.py +""" +import numpy as np +from tqdm import tqdm +import numpy.matlib +import os +import glob +import hydra +from omegaconf import DictConfig +import monoscene.data.semantic_kitti.io_data as SemanticKittiIO +from hydra.utils import get_original_cwd +from monoscene.data.NYU.preprocess import _downsample_label + + +def majority_pooling(grid, k_size=2): + result = np.zeros( + (grid.shape[0] // k_size, grid.shape[1] // k_size, grid.shape[2] // k_size) + ) + for xx in range(0, int(np.floor(grid.shape[0] / k_size))): + for yy in range(0, int(np.floor(grid.shape[1] / k_size))): + for zz in range(0, int(np.floor(grid.shape[2] / k_size))): + + sub_m = grid[ + (xx * k_size) : (xx * k_size) + k_size, + (yy * k_size) : (yy * k_size) + k_size, + (zz * k_size) : (zz * k_size) + k_size, + ] + unique, counts = np.unique(sub_m, return_counts=True) + if True in ((unique != 0) & (unique != 255)): + # Remove counts with 0 and 255 + counts = counts[((unique != 0) & (unique != 255))] + unique = unique[((unique != 0) & (unique != 255))] + else: + if True in (unique == 0): + counts = counts[(unique != 255)] + unique = unique[(unique != 255)] + value = unique[np.argmax(counts)] + result[xx, yy, zz] = value + return result + + +@hydra.main(config_name="../../config/monoscene.yaml") +def main(config: DictConfig): + scene_size = (256, 256, 32) + sequences = ["00", "01", "02", "03", "04", "05", "06", "07", "08", "09", "10"] + remap_lut = SemanticKittiIO.get_remap_lut( + os.path.join( + get_original_cwd(), + "monoscene", + "data", + "semantic_kitti", + "semantic-kitti.yaml", + ) + ) + + for sequence in sequences: + sequence_path = os.path.join( + config.kitti_root, "dataset", "sequences", sequence + ) + label_paths = sorted( + glob.glob(os.path.join(sequence_path, "voxels", "*.label")) + ) + invalid_paths = sorted( + glob.glob(os.path.join(sequence_path, "voxels", "*.invalid")) + ) + out_dir = os.path.join(config.kitti_preprocess_root, "labels", sequence) + os.makedirs(out_dir, exist_ok=True) + + downscaling = {"1_1": 1, "1_8": 8} + + for i in tqdm(range(len(label_paths))): + + frame_id, extension = os.path.splitext(os.path.basename(label_paths[i])) + + LABEL = SemanticKittiIO._read_label_SemKITTI(label_paths[i]) + INVALID = SemanticKittiIO._read_invalid_SemKITTI(invalid_paths[i]) + LABEL = remap_lut[LABEL.astype(np.uint16)].astype( + np.float32 + ) # Remap 20 classes semanticKITTI SSC + LABEL[ + np.isclose(INVALID, 1) + ] = 255 # Setting to unknown all voxels marked on invalid mask... + LABEL = LABEL.reshape([256, 256, 32]) + + for scale in downscaling: + filename = frame_id + "_" + scale + ".npy" + label_filename = os.path.join(out_dir, filename) + # If files have not been created... 
+ if not os.path.exists(label_filename): + if scale == "1_8": + LABEL_ds = _downsample_label( + LABEL, (256, 256, 32), downscaling[scale] + ) + else: + LABEL_ds = LABEL + np.save(label_filename, LABEL_ds) + print("wrote to", label_filename) + + +if __name__ == "__main__": + main() diff --git a/monoscene/data/semantic_kitti/semantic-kitti.yaml b/monoscene/data/semantic_kitti/semantic-kitti.yaml new file mode 100644 index 0000000..d6749c3 --- /dev/null +++ b/monoscene/data/semantic_kitti/semantic-kitti.yaml @@ -0,0 +1,213 @@ +# This file is covered by the LICENSE file in the root of this project. +nbr_classes: 20 +grid_dims: [256, 32, 256] # (W, H, D) +labels: + 0 : "unlabeled" + 1 : "outlier" + 10: "car" + 11: "bicycle" + 13: "bus" + 15: "motorcycle" + 16: "on-rails" + 18: "truck" + 20: "other-vehicle" + 30: "person" + 31: "bicyclist" + 32: "motorcyclist" + 40: "road" + 44: "parking" + 48: "sidewalk" + 49: "other-ground" + 50: "building" + 51: "fence" + 52: "other-structure" + 60: "lane-marking" + 70: "vegetation" + 71: "trunk" + 72: "terrain" + 80: "pole" + 81: "traffic-sign" + 99: "other-object" + 252: "moving-car" + 253: "moving-bicyclist" + 254: "moving-person" + 255: "moving-motorcyclist" + 256: "moving-on-rails" + 257: "moving-bus" + 258: "moving-truck" + 259: "moving-other-vehicle" +color_map: # bgr + 0 : [0, 0, 0] + 1 : [0, 0, 255] + 10: [245, 150, 100] + 11: [245, 230, 100] + 13: [250, 80, 100] + 15: [150, 60, 30] + 16: [255, 0, 0] + 18: [180, 30, 80] + 20: [255, 0, 0] + 30: [30, 30, 255] + 31: [200, 40, 255] + 32: [90, 30, 150] + 40: [255, 0, 255] + 44: [255, 150, 255] + 48: [75, 0, 75] + 49: [75, 0, 175] + 50: [0, 200, 255] + 51: [50, 120, 255] + 52: [0, 150, 255] + 60: [170, 255, 150] + 70: [0, 175, 0] + 71: [0, 60, 135] + 72: [80, 240, 150] + 80: [150, 240, 255] + 81: [0, 0, 255] + 99: [255, 255, 50] + 252: [245, 150, 100] + 256: [255, 0, 0] + 253: [200, 40, 255] + 254: [30, 30, 255] + 255: [90, 30, 150] + 257: [250, 80, 100] + 258: [180, 30, 80] + 259: [255, 0, 0] +content: # as a ratio with the total number of points + 0: 0.018889854628292943 + 1: 0.0002937197336781505 + 10: 0.040818519255974316 + 11: 0.00016609538710764618 + 13: 2.7879693665067774e-05 + 15: 0.00039838616015114444 + 16: 0.0 + 18: 0.0020633612104619787 + 20: 0.0016218197275284021 + 30: 0.00017698551338515307 + 31: 1.1065903904919655e-08 + 32: 5.532951952459828e-09 + 40: 0.1987493871255525 + 44: 0.014717169549888214 + 48: 0.14392298360372 + 49: 0.0039048553037472045 + 50: 0.1326861944777486 + 51: 0.0723592229456223 + 52: 0.002395131480328884 + 60: 4.7084144280367186e-05 + 70: 0.26681502148037506 + 71: 0.006035012012626033 + 72: 0.07814222006271769 + 80: 0.002855498193863172 + 81: 0.0006155958086189918 + 99: 0.009923127583046915 + 252: 0.001789309418528068 + 253: 0.00012709999297008662 + 254: 0.00016059776092534436 + 255: 3.745553104802113e-05 + 256: 0.0 + 257: 0.00011351574470342043 + 258: 0.00010157861367183268 + 259: 4.3840131989471124e-05 +# classes that are indistinguishable from single scan or inconsistent in +# ground truth are mapped to their closest equivalent +learning_map: + 0 : 0 # "unlabeled" + 1 : 0 # "outlier" mapped to "unlabeled" --------------------------mapped + 10: 1 # "car" + 11: 2 # "bicycle" + 13: 5 # "bus" mapped to "other-vehicle" --------------------------mapped + 15: 3 # "motorcycle" + 16: 5 # "on-rails" mapped to "other-vehicle" ---------------------mapped + 18: 4 # "truck" + 20: 5 # "other-vehicle" + 30: 6 # "person" + 31: 7 # "bicyclist" + 32: 8 # "motorcyclist" + 40: 9 # "road" + 
44: 10 # "parking" + 48: 11 # "sidewalk" + 49: 12 # "other-ground" + 50: 13 # "building" + 51: 14 # "fence" + 52: 0 # "other-structure" mapped to "unlabeled" ------------------mapped + 60: 9 # "lane-marking" to "road" ---------------------------------mapped + 70: 15 # "vegetation" + 71: 16 # "trunk" + 72: 17 # "terrain" + 80: 18 # "pole" + 81: 19 # "traffic-sign" + 99: 0 # "other-object" to "unlabeled" ----------------------------mapped + 252: 1 # "moving-car" to "car" ------------------------------------mapped + 253: 7 # "moving-bicyclist" to "bicyclist" ------------------------mapped + 254: 6 # "moving-person" to "person" ------------------------------mapped + 255: 8 # "moving-motorcyclist" to "motorcyclist" ------------------mapped + 256: 5 # "moving-on-rails" mapped to "other-vehicle" --------------mapped + 257: 5 # "moving-bus" mapped to "other-vehicle" -------------------mapped + 258: 4 # "moving-truck" to "truck" --------------------------------mapped + 259: 5 # "moving-other"-vehicle to "other-vehicle" ----------------mapped +learning_map_inv: # inverse of previous map + 0: 0 # "unlabeled", and others ignored + 1: 10 # "car" + 2: 11 # "bicycle" + 3: 15 # "motorcycle" + 4: 18 # "truck" + 5: 20 # "other-vehicle" + 6: 30 # "person" + 7: 31 # "bicyclist" + 8: 32 # "motorcyclist" + 9: 40 # "road" + 10: 44 # "parking" + 11: 48 # "sidewalk" + 12: 49 # "other-ground" + 13: 50 # "building" + 14: 51 # "fence" + 15: 70 # "vegetation" + 16: 71 # "trunk" + 17: 72 # "terrain" + 18: 80 # "pole" + 19: 81 # "traffic-sign" +learning_ignore: # Ignore classes + 0: True # "unlabeled", and others ignored + 1: False # "car" + 2: False # "bicycle" + 3: False # "motorcycle" + 4: False # "truck" + 5: False # "other-vehicle" + 6: False # "person" + 7: False # "bicyclist" + 8: False # "motorcyclist" + 9: False # "road" + 10: False # "parking" + 11: False # "sidewalk" + 12: False # "other-ground" + 13: False # "building" + 14: False # "fence" + 15: False # "vegetation" + 16: False # "trunk" + 17: False # "terrain" + 18: False # "pole" + 19: False # "traffic-sign" +split: # sequence numbers + train: + - 0 + - 1 + - 2 + - 3 + - 4 + - 5 + - 6 + - 7 + - 9 + - 10 + valid: + - 8 + test: + - 11 + - 12 + - 13 + - 14 + - 15 + - 16 + - 17 + - 18 + - 19 + - 20 + - 21 diff --git a/monoscene/data/utils/fusion.py b/monoscene/data/utils/fusion.py new file mode 100644 index 0000000..aecd5cb --- /dev/null +++ b/monoscene/data/utils/fusion.py @@ -0,0 +1,507 @@ +""" +Most of the code is taken from https://github.com/andyzeng/tsdf-fusion-python/blob/master/fusion.py + +@inproceedings{zeng20163dmatch, + title={3DMatch: Learning Local Geometric Descriptors from RGB-D Reconstructions}, + author={Zeng, Andy and Song, Shuran and Nie{\ss}ner, Matthias and Fisher, Matthew and Xiao, Jianxiong and Funkhouser, Thomas}, + booktitle={CVPR}, + year={2017} +} +""" + +import numpy as np + +from numba import njit, prange +from skimage import measure + +FUSION_GPU_MODE = 0 + + +class TSDFVolume: + """Volumetric TSDF Fusion of RGB-D Images.""" + + def __init__(self, vol_bnds, voxel_size, use_gpu=True): + """Constructor. + + Args: + vol_bnds (ndarray): An ndarray of shape (3, 2). Specifies the + xyz bounds (min/max) in meters. + voxel_size (float): The volume discretization in meters. + """ + vol_bnds = np.asarray(vol_bnds) + assert vol_bnds.shape == (3, 2), "[!] `vol_bnds` should be of shape (3, 2)." 
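As a side note on the semantic-kitti.yaml shown above: the preprocessing collapses raw label ids into the 20 training classes through a lookup table built from learning_map. A self-contained sketch of that remapping follows; the yaml path is an assumption, and get_remap_lut in io_data.py builds an equivalent table from the same file:

import numpy as np
import yaml

# Hedged sketch of the learning_map remapping; the yaml path is a placeholder.
with open("semantic-kitti.yaml", "r") as f:
    cfg = yaml.safe_load(f)

learning_map = cfg["learning_map"]               # raw id -> train id in [0, 19]
lut = np.zeros(max(learning_map) + 1, dtype=np.uint8)
for raw_id, train_id in learning_map.items():
    lut[raw_id] = train_id

raw = np.array([10, 252, 40, 60, 99])            # car, moving-car, road, lane-marking, other-object
print(lut[raw])                                  # [1 1 9 9 0]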
+ + # Define voxel volume parameters + self._vol_bnds = vol_bnds + self._voxel_size = float(voxel_size) + self._trunc_margin = 5 * self._voxel_size # truncation on SDF + # self._trunc_margin = 10 # truncation on SDF + self._color_const = 256 * 256 + + # Adjust volume bounds and ensure C-order contiguous + self._vol_dim = ( + np.ceil((self._vol_bnds[:, 1] - self._vol_bnds[:, 0]) / self._voxel_size) + .copy(order="C") + .astype(int) + ) + self._vol_bnds[:, 1] = self._vol_bnds[:, 0] + self._vol_dim * self._voxel_size + self._vol_origin = self._vol_bnds[:, 0].copy(order="C").astype(np.float32) + + print( + "Voxel volume size: {} x {} x {} - # points: {:,}".format( + self._vol_dim[0], + self._vol_dim[1], + self._vol_dim[2], + self._vol_dim[0] * self._vol_dim[1] * self._vol_dim[2], + ) + ) + + # Initialize pointers to voxel volume in CPU memory + self._tsdf_vol_cpu = np.zeros(self._vol_dim).astype(np.float32) + # for computing the cumulative moving average of observations per voxel + self._weight_vol_cpu = np.zeros(self._vol_dim).astype(np.float32) + self._color_vol_cpu = np.zeros(self._vol_dim).astype(np.float32) + + self.gpu_mode = use_gpu and FUSION_GPU_MODE + + # Copy voxel volumes to GPU + if self.gpu_mode: + self._tsdf_vol_gpu = cuda.mem_alloc(self._tsdf_vol_cpu.nbytes) + cuda.memcpy_htod(self._tsdf_vol_gpu, self._tsdf_vol_cpu) + self._weight_vol_gpu = cuda.mem_alloc(self._weight_vol_cpu.nbytes) + cuda.memcpy_htod(self._weight_vol_gpu, self._weight_vol_cpu) + self._color_vol_gpu = cuda.mem_alloc(self._color_vol_cpu.nbytes) + cuda.memcpy_htod(self._color_vol_gpu, self._color_vol_cpu) + + # Cuda kernel function (C++) + self._cuda_src_mod = SourceModule( + """ + __global__ void integrate(float * tsdf_vol, + float * weight_vol, + float * color_vol, + float * vol_dim, + float * vol_origin, + float * cam_intr, + float * cam_pose, + float * other_params, + float * color_im, + float * depth_im) { + // Get voxel index + int gpu_loop_idx = (int) other_params[0]; + int max_threads_per_block = blockDim.x; + int block_idx = blockIdx.z*gridDim.y*gridDim.x+blockIdx.y*gridDim.x+blockIdx.x; + int voxel_idx = gpu_loop_idx*gridDim.x*gridDim.y*gridDim.z*max_threads_per_block+block_idx*max_threads_per_block+threadIdx.x; + int vol_dim_x = (int) vol_dim[0]; + int vol_dim_y = (int) vol_dim[1]; + int vol_dim_z = (int) vol_dim[2]; + if (voxel_idx > vol_dim_x*vol_dim_y*vol_dim_z) + return; + // Get voxel grid coordinates (note: be careful when casting) + float voxel_x = floorf(((float)voxel_idx)/((float)(vol_dim_y*vol_dim_z))); + float voxel_y = floorf(((float)(voxel_idx-((int)voxel_x)*vol_dim_y*vol_dim_z))/((float)vol_dim_z)); + float voxel_z = (float)(voxel_idx-((int)voxel_x)*vol_dim_y*vol_dim_z-((int)voxel_y)*vol_dim_z); + // Voxel grid coordinates to world coordinates + float voxel_size = other_params[1]; + float pt_x = vol_origin[0]+voxel_x*voxel_size; + float pt_y = vol_origin[1]+voxel_y*voxel_size; + float pt_z = vol_origin[2]+voxel_z*voxel_size; + // World coordinates to camera coordinates + float tmp_pt_x = pt_x-cam_pose[0*4+3]; + float tmp_pt_y = pt_y-cam_pose[1*4+3]; + float tmp_pt_z = pt_z-cam_pose[2*4+3]; + float cam_pt_x = cam_pose[0*4+0]*tmp_pt_x+cam_pose[1*4+0]*tmp_pt_y+cam_pose[2*4+0]*tmp_pt_z; + float cam_pt_y = cam_pose[0*4+1]*tmp_pt_x+cam_pose[1*4+1]*tmp_pt_y+cam_pose[2*4+1]*tmp_pt_z; + float cam_pt_z = cam_pose[0*4+2]*tmp_pt_x+cam_pose[1*4+2]*tmp_pt_y+cam_pose[2*4+2]*tmp_pt_z; + // Camera coordinates to image pixels + int pixel_x = (int) 
roundf(cam_intr[0*3+0]*(cam_pt_x/cam_pt_z)+cam_intr[0*3+2]); + int pixel_y = (int) roundf(cam_intr[1*3+1]*(cam_pt_y/cam_pt_z)+cam_intr[1*3+2]); + // Skip if outside view frustum + int im_h = (int) other_params[2]; + int im_w = (int) other_params[3]; + if (pixel_x < 0 || pixel_x >= im_w || pixel_y < 0 || pixel_y >= im_h || cam_pt_z<0) + return; + // Skip invalid depth + float depth_value = depth_im[pixel_y*im_w+pixel_x]; + if (depth_value == 0) + return; + // Integrate TSDF + float trunc_margin = other_params[4]; + float depth_diff = depth_value-cam_pt_z; + if (depth_diff < -trunc_margin) + return; + float dist = fmin(1.0f,depth_diff/trunc_margin); + float w_old = weight_vol[voxel_idx]; + float obs_weight = other_params[5]; + float w_new = w_old + obs_weight; + weight_vol[voxel_idx] = w_new; + tsdf_vol[voxel_idx] = (tsdf_vol[voxel_idx]*w_old+obs_weight*dist)/w_new; + // Integrate color + float old_color = color_vol[voxel_idx]; + float old_b = floorf(old_color/(256*256)); + float old_g = floorf((old_color-old_b*256*256)/256); + float old_r = old_color-old_b*256*256-old_g*256; + float new_color = color_im[pixel_y*im_w+pixel_x]; + float new_b = floorf(new_color/(256*256)); + float new_g = floorf((new_color-new_b*256*256)/256); + float new_r = new_color-new_b*256*256-new_g*256; + new_b = fmin(roundf((old_b*w_old+obs_weight*new_b)/w_new),255.0f); + new_g = fmin(roundf((old_g*w_old+obs_weight*new_g)/w_new),255.0f); + new_r = fmin(roundf((old_r*w_old+obs_weight*new_r)/w_new),255.0f); + color_vol[voxel_idx] = new_b*256*256+new_g*256+new_r; + }""" + ) + + self._cuda_integrate = self._cuda_src_mod.get_function("integrate") + + # Determine block/grid size on GPU + gpu_dev = cuda.Device(0) + self._max_gpu_threads_per_block = gpu_dev.MAX_THREADS_PER_BLOCK + n_blocks = int( + np.ceil( + float(np.prod(self._vol_dim)) + / float(self._max_gpu_threads_per_block) + ) + ) + grid_dim_x = min(gpu_dev.MAX_GRID_DIM_X, int(np.floor(np.cbrt(n_blocks)))) + grid_dim_y = min( + gpu_dev.MAX_GRID_DIM_Y, int(np.floor(np.sqrt(n_blocks / grid_dim_x))) + ) + grid_dim_z = min( + gpu_dev.MAX_GRID_DIM_Z, + int(np.ceil(float(n_blocks) / float(grid_dim_x * grid_dim_y))), + ) + self._max_gpu_grid_dim = np.array( + [grid_dim_x, grid_dim_y, grid_dim_z] + ).astype(int) + self._n_gpu_loops = int( + np.ceil( + float(np.prod(self._vol_dim)) + / float( + np.prod(self._max_gpu_grid_dim) + * self._max_gpu_threads_per_block + ) + ) + ) + + else: + # Get voxel grid coordinates + xv, yv, zv = np.meshgrid( + range(self._vol_dim[0]), + range(self._vol_dim[1]), + range(self._vol_dim[2]), + indexing="ij", + ) + self.vox_coords = ( + np.concatenate( + [xv.reshape(1, -1), yv.reshape(1, -1), zv.reshape(1, -1)], axis=0 + ) + .astype(int) + .T + ) + + @staticmethod + @njit(parallel=True) + def vox2world(vol_origin, vox_coords, vox_size, offsets=(0.5, 0.5, 0.5)): + """Convert voxel grid coordinates to world coordinates.""" + vol_origin = vol_origin.astype(np.float32) + vox_coords = vox_coords.astype(np.float32) + # print(np.min(vox_coords)) + cam_pts = np.empty_like(vox_coords, dtype=np.float32) + + for i in prange(vox_coords.shape[0]): + for j in range(3): + cam_pts[i, j] = ( + vol_origin[j] + + (vox_size * vox_coords[i, j]) + + vox_size * offsets[j] + ) + return cam_pts + + @staticmethod + @njit(parallel=True) + def cam2pix(cam_pts, intr): + """Convert camera coordinates to pixel coordinates.""" + intr = intr.astype(np.float32) + fx, fy = intr[0, 0], intr[1, 1] + cx, cy = intr[0, 2], intr[1, 2] + pix = np.empty((cam_pts.shape[0], 2), dtype=np.int64) 
+ for i in prange(cam_pts.shape[0]): + pix[i, 0] = int(np.round((cam_pts[i, 0] * fx / cam_pts[i, 2]) + cx)) + pix[i, 1] = int(np.round((cam_pts[i, 1] * fy / cam_pts[i, 2]) + cy)) + return pix + + @staticmethod + @njit(parallel=True) + def integrate_tsdf(tsdf_vol, dist, w_old, obs_weight): + """Integrate the TSDF volume.""" + tsdf_vol_int = np.empty_like(tsdf_vol, dtype=np.float32) + # print(tsdf_vol.shape) + w_new = np.empty_like(w_old, dtype=np.float32) + for i in prange(len(tsdf_vol)): + w_new[i] = w_old[i] + obs_weight + tsdf_vol_int[i] = (w_old[i] * tsdf_vol[i] + obs_weight * dist[i]) / w_new[i] + return tsdf_vol_int, w_new + + def integrate(self, color_im, depth_im, cam_intr, cam_pose, obs_weight=1.0): + """Integrate an RGB-D frame into the TSDF volume. + + Args: + color_im (ndarray): An RGB image of shape (H, W, 3). + depth_im (ndarray): A depth image of shape (H, W). + cam_intr (ndarray): The camera intrinsics matrix of shape (3, 3). + cam_pose (ndarray): The camera pose (i.e. extrinsics) of shape (4, 4). + obs_weight (float): The weight to assign for the current observation. A higher + value + """ + im_h, im_w = depth_im.shape + + # Fold RGB color image into a single channel image + color_im = color_im.astype(np.float32) + color_im = np.floor( + color_im[..., 2] * self._color_const + + color_im[..., 1] * 256 + + color_im[..., 0] + ) + + if self.gpu_mode: # GPU mode: integrate voxel volume (calls CUDA kernel) + for gpu_loop_idx in range(self._n_gpu_loops): + self._cuda_integrate( + self._tsdf_vol_gpu, + self._weight_vol_gpu, + self._color_vol_gpu, + cuda.InOut(self._vol_dim.astype(np.float32)), + cuda.InOut(self._vol_origin.astype(np.float32)), + cuda.InOut(cam_intr.reshape(-1).astype(np.float32)), + cuda.InOut(cam_pose.reshape(-1).astype(np.float32)), + cuda.InOut( + np.asarray( + [ + gpu_loop_idx, + self._voxel_size, + im_h, + im_w, + self._trunc_margin, + obs_weight, + ], + np.float32, + ) + ), + cuda.InOut(color_im.reshape(-1).astype(np.float32)), + cuda.InOut(depth_im.reshape(-1).astype(np.float32)), + block=(self._max_gpu_threads_per_block, 1, 1), + grid=( + int(self._max_gpu_grid_dim[0]), + int(self._max_gpu_grid_dim[1]), + int(self._max_gpu_grid_dim[2]), + ), + ) + else: # CPU mode: integrate voxel volume (vectorized implementation) + # Convert voxel grid coordinates to pixel coordinates + cam_pts = self.vox2world( + self._vol_origin, self.vox_coords, self._voxel_size + ) + cam_pts = rigid_transform(cam_pts, np.linalg.inv(cam_pose)) + pix_z = cam_pts[:, 2] + pix = self.cam2pix(cam_pts, cam_intr) + pix_x, pix_y = pix[:, 0], pix[:, 1] + + # Eliminate pixels outside view frustum + valid_pix = np.logical_and( + pix_x >= 0, + np.logical_and( + pix_x < im_w, + np.logical_and(pix_y >= 0, np.logical_and(pix_y < im_h, pix_z > 0)), + ), + ) + depth_val = np.zeros(pix_x.shape) + depth_val[valid_pix] = depth_im[pix_y[valid_pix], pix_x[valid_pix]] + + # Integrate TSDF + depth_diff = depth_val - pix_z + + valid_pts = np.logical_and(depth_val > 0, depth_diff >= -10) + dist = depth_diff + + valid_vox_x = self.vox_coords[valid_pts, 0] + valid_vox_y = self.vox_coords[valid_pts, 1] + valid_vox_z = self.vox_coords[valid_pts, 2] + w_old = self._weight_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z] + tsdf_vals = self._tsdf_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z] + valid_dist = dist[valid_pts] + tsdf_vol_new, w_new = self.integrate_tsdf( + tsdf_vals, valid_dist, w_old, obs_weight + ) + self._weight_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z] = w_new + self._tsdf_vol_cpu[valid_vox_x, 
valid_vox_y, valid_vox_z] = tsdf_vol_new + + # Integrate color + old_color = self._color_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z] + old_b = np.floor(old_color / self._color_const) + old_g = np.floor((old_color - old_b * self._color_const) / 256) + old_r = old_color - old_b * self._color_const - old_g * 256 + new_color = color_im[pix_y[valid_pts], pix_x[valid_pts]] + new_b = np.floor(new_color / self._color_const) + new_g = np.floor((new_color - new_b * self._color_const) / 256) + new_r = new_color - new_b * self._color_const - new_g * 256 + new_b = np.minimum( + 255.0, np.round((w_old * old_b + obs_weight * new_b) / w_new) + ) + new_g = np.minimum( + 255.0, np.round((w_old * old_g + obs_weight * new_g) / w_new) + ) + new_r = np.minimum( + 255.0, np.round((w_old * old_r + obs_weight * new_r) / w_new) + ) + self._color_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z] = ( + new_b * self._color_const + new_g * 256 + new_r + ) + + def get_volume(self): + if self.gpu_mode: + cuda.memcpy_dtoh(self._tsdf_vol_cpu, self._tsdf_vol_gpu) + cuda.memcpy_dtoh(self._color_vol_cpu, self._color_vol_gpu) + return self._tsdf_vol_cpu, self._color_vol_cpu + + def get_point_cloud(self): + """Extract a point cloud from the voxel volume.""" + tsdf_vol, color_vol = self.get_volume() + + # Marching cubes + verts = measure.marching_cubes_lewiner(tsdf_vol, level=0)[0] + verts_ind = np.round(verts).astype(int) + verts = verts * self._voxel_size + self._vol_origin + + # Get vertex colors + rgb_vals = color_vol[verts_ind[:, 0], verts_ind[:, 1], verts_ind[:, 2]] + colors_b = np.floor(rgb_vals / self._color_const) + colors_g = np.floor((rgb_vals - colors_b * self._color_const) / 256) + colors_r = rgb_vals - colors_b * self._color_const - colors_g * 256 + colors = np.floor(np.asarray([colors_r, colors_g, colors_b])).T + colors = colors.astype(np.uint8) + + pc = np.hstack([verts, colors]) + return pc + + def get_mesh(self): + """Compute a mesh from the voxel volume using marching cubes.""" + tsdf_vol, color_vol = self.get_volume() + + # Marching cubes + verts, faces, norms, vals = measure.marching_cubes_lewiner(tsdf_vol, level=0) + verts_ind = np.round(verts).astype(int) + verts = ( + verts * self._voxel_size + self._vol_origin + ) # voxel grid coordinates to world coordinates + + # Get vertex colors + rgb_vals = color_vol[verts_ind[:, 0], verts_ind[:, 1], verts_ind[:, 2]] + colors_b = np.floor(rgb_vals / self._color_const) + colors_g = np.floor((rgb_vals - colors_b * self._color_const) / 256) + colors_r = rgb_vals - colors_b * self._color_const - colors_g * 256 + colors = np.floor(np.asarray([colors_r, colors_g, colors_b])).T + colors = colors.astype(np.uint8) + return verts, faces, norms, colors + + +def rigid_transform(xyz, transform): + """Applies a rigid transform to an (N, 3) pointcloud.""" + xyz_h = np.hstack([xyz, np.ones((len(xyz), 1), dtype=np.float32)]) + xyz_t_h = np.dot(transform, xyz_h.T).T + return xyz_t_h[:, :3] + + +def get_view_frustum(depth_im, cam_intr, cam_pose): + """Get corners of 3D camera view frustum of depth image""" + im_h = depth_im.shape[0] + im_w = depth_im.shape[1] + max_depth = np.max(depth_im) + view_frust_pts = np.array( + [ + (np.array([0, 0, 0, im_w, im_w]) - cam_intr[0, 2]) + * np.array([0, max_depth, max_depth, max_depth, max_depth]) + / cam_intr[0, 0], + (np.array([0, 0, im_h, 0, im_h]) - cam_intr[1, 2]) + * np.array([0, max_depth, max_depth, max_depth, max_depth]) + / cam_intr[1, 1], + np.array([0, max_depth, max_depth, max_depth, max_depth]), + ] + ) + view_frust_pts = 
rigid_transform(view_frust_pts.T, cam_pose).T + return view_frust_pts + + +def meshwrite(filename, verts, faces, norms, colors): + """Save a 3D mesh to a polygon .ply file.""" + # Write header + ply_file = open(filename, "w") + ply_file.write("ply\n") + ply_file.write("format ascii 1.0\n") + ply_file.write("element vertex %d\n" % (verts.shape[0])) + ply_file.write("property float x\n") + ply_file.write("property float y\n") + ply_file.write("property float z\n") + ply_file.write("property float nx\n") + ply_file.write("property float ny\n") + ply_file.write("property float nz\n") + ply_file.write("property uchar red\n") + ply_file.write("property uchar green\n") + ply_file.write("property uchar blue\n") + ply_file.write("element face %d\n" % (faces.shape[0])) + ply_file.write("property list uchar int vertex_index\n") + ply_file.write("end_header\n") + + # Write vertex list + for i in range(verts.shape[0]): + ply_file.write( + "%f %f %f %f %f %f %d %d %d\n" + % ( + verts[i, 0], + verts[i, 1], + verts[i, 2], + norms[i, 0], + norms[i, 1], + norms[i, 2], + colors[i, 0], + colors[i, 1], + colors[i, 2], + ) + ) + + # Write face list + for i in range(faces.shape[0]): + ply_file.write("3 %d %d %d\n" % (faces[i, 0], faces[i, 1], faces[i, 2])) + + ply_file.close() + + +def pcwrite(filename, xyzrgb): + """Save a point cloud to a polygon .ply file.""" + xyz = xyzrgb[:, :3] + rgb = xyzrgb[:, 3:].astype(np.uint8) + + # Write header + ply_file = open(filename, "w") + ply_file.write("ply\n") + ply_file.write("format ascii 1.0\n") + ply_file.write("element vertex %d\n" % (xyz.shape[0])) + ply_file.write("property float x\n") + ply_file.write("property float y\n") + ply_file.write("property float z\n") + ply_file.write("property uchar red\n") + ply_file.write("property uchar green\n") + ply_file.write("property uchar blue\n") + ply_file.write("end_header\n") + + # Write vertex list + for i in range(xyz.shape[0]): + ply_file.write( + "%f %f %f %d %d %d\n" + % ( + xyz[i, 0], + xyz[i, 1], + xyz[i, 2], + rgb[i, 0], + rgb[i, 1], + rgb[i, 2], + ) + ) diff --git a/monoscene/data/utils/helpers.py b/monoscene/data/utils/helpers.py new file mode 100644 index 0000000..8da3264 --- /dev/null +++ b/monoscene/data/utils/helpers.py @@ -0,0 +1,185 @@ +import numpy as np +import monoscene.data.utils.fusion as fusion +import torch + + +def compute_CP_mega_matrix(target, is_binary=False): + """ + Parameters + --------- + target: (H, W, D) + contains voxels semantic labels + + is_binary: bool + if True, return binary voxels relations else return 4-way relations + """ + label = target.reshape(-1) + label_row = label + N = label.shape[0] + super_voxel_size = [i//2 for i in target.shape] + if is_binary: + matrix = np.zeros((2, N, super_voxel_size[0] * super_voxel_size[1] * super_voxel_size[2]), dtype=np.uint8) + else: + matrix = np.zeros((4, N, super_voxel_size[0] * super_voxel_size[1] * super_voxel_size[2]), dtype=np.uint8) + + for xx in range(super_voxel_size[0]): + for yy in range(super_voxel_size[1]): + for zz in range(super_voxel_size[2]): + col_idx = xx * (super_voxel_size[1] * super_voxel_size[2]) + yy * super_voxel_size[2] + zz + label_col_megas = np.array([ + target[xx * 2, yy * 2, zz * 2], + target[xx * 2 + 1, yy * 2, zz * 2], + target[xx * 2, yy * 2 + 1, zz * 2], + target[xx * 2, yy * 2, zz * 2 + 1], + target[xx * 2 + 1, yy * 2 + 1, zz * 2], + target[xx * 2 + 1, yy * 2, zz * 2 + 1], + target[xx * 2, yy * 2 + 1, zz * 2 + 1], + target[xx * 2 + 1, yy * 2 + 1, zz * 2 + 1], + ]) + label_col_megas = 
label_col_megas[label_col_megas != 255] + for label_col_mega in label_col_megas: + label_col = np.ones(N) * label_col_mega + if not is_binary: + matrix[0, (label_row != 255) & (label_col == label_row) & (label_col != 0), col_idx] = 1.0 # non non same + matrix[1, (label_row != 255) & (label_col != label_row) & (label_col != 0) & (label_row != 0), col_idx] = 1.0 # non non diff + matrix[2, (label_row != 255) & (label_row == label_col) & (label_col == 0), col_idx] = 1.0 # empty empty + matrix[3, (label_row != 255) & (label_row != label_col) & ((label_row == 0) | (label_col == 0)), col_idx] = 1.0 # nonempty empty + else: + matrix[0, (label_row != 255) & (label_col != label_row), col_idx] = 1.0 # diff + matrix[1, (label_row != 255) & (label_col == label_row), col_idx] = 1.0 # same + return matrix + + +def vox2pix(cam_E, cam_k, + vox_origin, voxel_size, + img_W, img_H, + scene_size): + """ + compute the 2D projection of voxels centroids + + Parameters: + ---------- + cam_E: 4x4 + =camera pose in case of NYUv2 dataset + =Transformation from camera to lidar coordinate in case of SemKITTI + cam_k: 3x3 + camera intrinsics + vox_origin: (3,) + world(NYU)/lidar(SemKITTI) cooridnates of the voxel at index (0, 0, 0) + img_W: int + image width + img_H: int + image height + scene_size: (3,) + scene size in meter: (51.2, 51.2, 6.4) for SemKITTI and (4.8, 4.8, 2.88) for NYUv2 + + Returns + ------- + projected_pix: (N, 2) + Projected 2D positions of voxels + fov_mask: (N,) + Voxels mask indice voxels inside image's FOV + sensor_distance: (N,) + Voxels'distance to the sensor in meter + """ + # Compute the x, y, z bounding of the scene in meter + vol_bnds = np.zeros((3,2)) + vol_bnds[:,0] = vox_origin + vol_bnds[:,1] = vox_origin + np.array(scene_size) + + # Compute the voxels centroids in lidar cooridnates + vol_dim = np.ceil((vol_bnds[:,1]- vol_bnds[:,0])/ voxel_size).copy(order='C').astype(int) + xv, yv, zv = np.meshgrid( + range(vol_dim[0]), + range(vol_dim[1]), + range(vol_dim[2]), + indexing='ij' + ) + vox_coords = np.concatenate([ + xv.reshape(1,-1), + yv.reshape(1,-1), + zv.reshape(1,-1) + ], axis=0).astype(int).T + + # Project voxels'centroid from lidar coordinates to camera coordinates + cam_pts = fusion.TSDFVolume.vox2world(vox_origin, vox_coords, voxel_size) + cam_pts = fusion.rigid_transform(cam_pts, cam_E) + + # Project camera coordinates to pixel positions + projected_pix = fusion.TSDFVolume.cam2pix(cam_pts, cam_k) + pix_x, pix_y = projected_pix[:, 0], projected_pix[:, 1] + + # Eliminate pixels outside view frustum + sensor_distance = cam_pts[:, 2] + fov_mask = np.logical_and(pix_x >= 0, + np.logical_and(pix_x < img_W, + np.logical_and(pix_y >= 0, + np.logical_and(pix_y < img_H, + sensor_distance > 0)))) + + + return projected_pix, fov_mask, sensor_distance + + +def compute_local_frustum(pix_x, pix_y, min_x, max_x, min_y, max_y, sensor_distance): + valid_pix = np.logical_and(pix_x >= min_x, + np.logical_and(pix_x < max_x, + np.logical_and(pix_y >= min_y, + np.logical_and(pix_y < max_y, + sensor_distance > 0)))) + return valid_pix + +def compute_local_frustums(projected_pix, sensor_distance, target, img_W, img_H, dataset, n_classes, size=4): + """ + Compute the local frustums mask and their class frequencies + + Parameters: + ---------- + projected_pix: (N, 2) + 2D projected pix of all voxels + sensor_distance: (N,) + Distance of the camera sensor to voxels + target: (H, W, D) + Voxelized sematic labels + img_W: int + Image width + img_H: int + Image height + dataset: str + ="NYU" or "kitti" (for 
both SemKITTI and KITTI-360) + n_classes: int + Number of classes (12 for NYU and 20 for SemKITTI) + size: int + determine the number of local frustums i.e. size * size + + Returns + ------- + frustums_masks: (n_frustums, N) + List of frustums_masks, each indicates the belonging voxels + frustums_class_dists: (n_frustums, n_classes) + Contains the class frequencies in each frustum + """ + H, W, D = target.shape + ranges = [(i * 1.0/size, (i * 1.0 + 1)/size) for i in range(size)] + local_frustum_masks = [] + local_frustum_class_dists = [] + pix_x, pix_y = projected_pix[:, 0], projected_pix[:, 1] + for y in ranges: + for x in ranges: + start_x = x[0] * img_W + end_x = x[1] * img_W + start_y = y[0] * img_H + end_y = y[1] * img_H + local_frustum = compute_local_frustum(pix_x, pix_y, start_x, end_x, start_y, end_y, sensor_distance) + if dataset == "NYU": + mask = (target != 255) & np.moveaxis(local_frustum.reshape(60, 60, 36), [0, 1, 2], [0, 2, 1]) + elif dataset == "kitti": + mask = (target != 255) & local_frustum.reshape(H, W, D) + + local_frustum_masks.append(mask) + classes, cnts = np.unique(target[mask], return_counts=True) + class_counts = np.zeros(n_classes) + class_counts[classes.astype(int)] = cnts + local_frustum_class_dists.append(class_counts) + frustums_masks, frustums_class_dists = np.array(local_frustum_masks), np.array(local_frustum_class_dists) + return frustums_masks, frustums_class_dists diff --git a/monoscene/data/utils/torch_util.py b/monoscene/data/utils/torch_util.py new file mode 100644 index 0000000..a7d7ffd --- /dev/null +++ b/monoscene/data/utils/torch_util.py @@ -0,0 +1,15 @@ +import numpy as np +import torch + + +def worker_init_fn(worker_id): + """The function is designed for pytorch multi-process dataloader. + Note that we use the pytorch random generator to generate a base_seed. + Please try to be consistent. 
+ + References: + https://pytorch.org/docs/stable/notes/faq.html#dataloader-workers-random-seed + + """ + base_seed = torch.IntTensor(1).random_().item() + np.random.seed(base_seed + worker_id) diff --git a/monoscene/loss/CRP_loss.py b/monoscene/loss/CRP_loss.py new file mode 100644 index 0000000..b1e2797 --- /dev/null +++ b/monoscene/loss/CRP_loss.py @@ -0,0 +1,24 @@ +import torch + + +def compute_super_CP_multilabel_loss(pred_logits, CP_mega_matrices): + logits = [] + labels = [] + bs, n_relations, _, _ = pred_logits.shape + for i in range(bs): + pred_logit = pred_logits[i, :, :, :].permute( + 0, 2, 1 + ) # n_relations, N, n_mega_voxels + CP_mega_matrix = CP_mega_matrices[i] # n_relations, N, n_mega_voxels + logits.append(pred_logit.reshape(n_relations, -1)) + labels.append(CP_mega_matrix.reshape(n_relations, -1)) + + logits = torch.cat(logits, dim=1).T # M, 4 + labels = torch.cat(labels, dim=1).T # M, 4 + + cnt_neg = (labels == 0).sum(0) + cnt_pos = labels.sum(0) + pos_weight = cnt_neg / cnt_pos + criterion = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight) + loss_bce = criterion(logits, labels.float()) + return loss_bce diff --git a/monoscene/loss/sscMetrics.py b/monoscene/loss/sscMetrics.py new file mode 100644 index 0000000..f77be70 --- /dev/null +++ b/monoscene/loss/sscMetrics.py @@ -0,0 +1,204 @@ +""" +Part of the code is taken from https://github.com/waterljwant/SSC/blob/master/sscMetrics.py +""" +import numpy as np +from sklearn.metrics import accuracy_score, precision_recall_fscore_support + + +def get_iou(iou_sum, cnt_class): + _C = iou_sum.shape[0] # 12 + iou = np.zeros(_C, dtype=np.float32) # iou for each class + for idx in range(_C): + iou[idx] = iou_sum[idx] / cnt_class[idx] if cnt_class[idx] else 0 + + mean_iou = np.sum(iou[1:]) / np.count_nonzero(cnt_class[1:]) + return iou, mean_iou + + +def get_accuracy(predict, target, weight=None): # 0.05s + _bs = predict.shape[0] # batch size + _C = predict.shape[1] # _C = 12 + target = np.int32(target) + target = target.reshape(_bs, -1) # (_bs, 60*36*60) 129600 + predict = predict.reshape(_bs, _C, -1) # (_bs, _C, 60*36*60) + predict = np.argmax( + predict, axis=1 + ) # one-hot: _bs x _C x 60*36*60 --> label: _bs x 60*36*60. 
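A quick shape check for compute_super_CP_multilabel_loss defined above; the sizes are toy values, and the real N and number of mega voxels come from the 3D feature grid:

import torch
from monoscene.loss.CRP_loss import compute_super_CP_multilabel_loss

# Hedged sketch with toy sizes; pred_logits is (bs, n_relations, n_mega_voxels, N)
# and each CP mega matrix is (n_relations, N, n_mega_voxels), matching the code above.
bs, n_relations, n_mega_voxels, N = 1, 4, 8, 64
pred_logits = torch.randn(bs, n_relations, n_mega_voxels, N)
CP_mega_matrices = [
    torch.randint(0, 2, (n_relations, N, n_mega_voxels)) for _ in range(bs)
]
loss = compute_super_CP_multilabel_loss(pred_logits, CP_mega_matrices)
print(loss.item())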
+ + correct = predict == target # (_bs, 129600) + if weight: # 0.04s, add class weights + weight_k = np.ones(target.shape) + for i in range(_bs): + for n in range(target.shape[1]): + idx = 0 if target[i, n] == 255 else target[i, n] + weight_k[i, n] = weight[idx] + correct = correct * weight_k + acc = correct.sum() / correct.size + return acc + + +class SSCMetrics: + def __init__(self, n_classes): + self.n_classes = n_classes + self.reset() + + def hist_info(self, n_cl, pred, gt): + assert pred.shape == gt.shape + k = (gt >= 0) & (gt < n_cl) # exclude 255 + labeled = np.sum(k) + correct = np.sum((pred[k] == gt[k])) + + return ( + np.bincount( + n_cl * gt[k].astype(int) + pred[k].astype(int), minlength=n_cl ** 2 + ).reshape(n_cl, n_cl), + correct, + labeled, + ) + + @staticmethod + def compute_score(hist, correct, labeled): + iu = np.diag(hist) / (hist.sum(1) + hist.sum(0) - np.diag(hist)) + mean_IU = np.nanmean(iu) + mean_IU_no_back = np.nanmean(iu[1:]) + freq = hist.sum(1) / hist.sum() + freq_IU = (iu[freq > 0] * freq[freq > 0]).sum() + mean_pixel_acc = correct / labeled if labeled != 0 else 0 + + return iu, mean_IU, mean_IU_no_back, mean_pixel_acc + + def add_batch(self, y_pred, y_true, nonempty=None, nonsurface=None): + self.count += 1 + mask = y_true != 255 + if nonempty is not None: + mask = mask & nonempty + if nonsurface is not None: + mask = mask & nonsurface + tp, fp, fn = self.get_score_completion(y_pred, y_true, mask) + + self.completion_tp += tp + self.completion_fp += fp + self.completion_fn += fn + + mask = y_true != 255 + if nonempty is not None: + mask = mask & nonempty + tp_sum, fp_sum, fn_sum = self.get_score_semantic_and_completion( + y_pred, y_true, mask + ) + self.tps += tp_sum + self.fps += fp_sum + self.fns += fn_sum + + def get_stats(self): + if self.completion_tp != 0: + precision = self.completion_tp / (self.completion_tp + self.completion_fp) + recall = self.completion_tp / (self.completion_tp + self.completion_fn) + iou = self.completion_tp / ( + self.completion_tp + self.completion_fp + self.completion_fn + ) + else: + precision, recall, iou = 0, 0, 0 + iou_ssc = self.tps / (self.tps + self.fps + self.fns + 1e-5) + return { + "precision": precision, + "recall": recall, + "iou": iou, + "iou_ssc": iou_ssc, + "iou_ssc_mean": np.mean(iou_ssc[1:]), + } + + def reset(self): + + self.completion_tp = 0 + self.completion_fp = 0 + self.completion_fn = 0 + self.tps = np.zeros(self.n_classes) + self.fps = np.zeros(self.n_classes) + self.fns = np.zeros(self.n_classes) + + self.hist_ssc = np.zeros((self.n_classes, self.n_classes)) + self.labeled_ssc = 0 + self.correct_ssc = 0 + + self.precision = 0 + self.recall = 0 + self.iou = 0 + self.count = 1e-8 + self.iou_ssc = np.zeros(self.n_classes, dtype=np.float32) + self.cnt_class = np.zeros(self.n_classes, dtype=np.float32) + + def get_score_completion(self, predict, target, nonempty=None): + predict = np.copy(predict) + target = np.copy(target) + + """for scene completion, treat the task as two-classes problem, just empty or occupancy""" + _bs = predict.shape[0] # batch size + # ---- ignore + predict[target == 255] = 0 + target[target == 255] = 0 + # ---- flatten + target = target.reshape(_bs, -1) # (_bs, 129600) + predict = predict.reshape(_bs, -1) # (_bs, _C, 129600), 60*36*60=129600 + # ---- treat all non-empty object class as one category, set them to label 1 + b_pred = np.zeros(predict.shape) + b_true = np.zeros(target.shape) + b_pred[predict > 0] = 1 + b_true[target > 0] = 1 + p, r, iou = 0.0, 0.0, 0.0 + tp_sum, fp_sum, 
fn_sum = 0, 0, 0 + for idx in range(_bs): + y_true = b_true[idx, :] # GT + y_pred = b_pred[idx, :] + if nonempty is not None: + nonempty_idx = nonempty[idx, :].reshape(-1) + y_true = y_true[nonempty_idx == 1] + y_pred = y_pred[nonempty_idx == 1] + + tp = np.array(np.where(np.logical_and(y_true == 1, y_pred == 1))).size + fp = np.array(np.where(np.logical_and(y_true != 1, y_pred == 1))).size + fn = np.array(np.where(np.logical_and(y_true == 1, y_pred != 1))).size + tp_sum += tp + fp_sum += fp + fn_sum += fn + return tp_sum, fp_sum, fn_sum + + def get_score_semantic_and_completion(self, predict, target, nonempty=None): + target = np.copy(target) + predict = np.copy(predict) + _bs = predict.shape[0] # batch size + _C = self.n_classes # _C = 12 + # ---- ignore + predict[target == 255] = 0 + target[target == 255] = 0 + # ---- flatten + target = target.reshape(_bs, -1) # (_bs, 129600) + predict = predict.reshape(_bs, -1) # (_bs, 129600), 60*36*60=129600 + + cnt_class = np.zeros(_C, dtype=np.int32) # count for each class + iou_sum = np.zeros(_C, dtype=np.float32) # sum of iou for each class + tp_sum = np.zeros(_C, dtype=np.int32) # tp + fp_sum = np.zeros(_C, dtype=np.int32) # fp + fn_sum = np.zeros(_C, dtype=np.int32) # fn + + for idx in range(_bs): + y_true = target[idx, :] # GT + y_pred = predict[idx, :] + if nonempty is not None: + nonempty_idx = nonempty[idx, :].reshape(-1) + y_pred = y_pred[ + np.where(np.logical_and(nonempty_idx == 1, y_true != 255)) + ] + y_true = y_true[ + np.where(np.logical_and(nonempty_idx == 1, y_true != 255)) + ] + for j in range(_C): # for each class + tp = np.array(np.where(np.logical_and(y_true == j, y_pred == j))).size + fp = np.array(np.where(np.logical_and(y_true != j, y_pred == j))).size + fn = np.array(np.where(np.logical_and(y_true == j, y_pred != j))).size + + tp_sum[j] += tp + fp_sum[j] += fp + fn_sum[j] += fn + + return tp_sum, fp_sum, fn_sum diff --git a/monoscene/loss/ssc_loss.py b/monoscene/loss/ssc_loss.py new file mode 100644 index 0000000..941d2ee --- /dev/null +++ b/monoscene/loss/ssc_loss.py @@ -0,0 +1,99 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + + +def KL_sep(p, target): + """ + KL divergence on nonzeros classes + """ + nonzeros = target != 0 + nonzero_p = p[nonzeros] + kl_term = F.kl_div(torch.log(nonzero_p), target[nonzeros], reduction="sum") + return kl_term + + +def geo_scal_loss(pred, ssc_target): + + # Get softmax probabilities + pred = F.softmax(pred, dim=1) + + # Compute empty and nonempty probabilities + empty_probs = pred[:, 0, :, :, :] + nonempty_probs = 1 - empty_probs + + # Remove unknown voxels + mask = ssc_target != 255 + nonempty_target = ssc_target != 0 + nonempty_target = nonempty_target[mask].float() + nonempty_probs = nonempty_probs[mask] + empty_probs = empty_probs[mask] + + intersection = (nonempty_target * nonempty_probs).sum() + precision = intersection / nonempty_probs.sum() + recall = intersection / nonempty_target.sum() + spec = ((1 - nonempty_target) * (empty_probs)).sum() / (1 - nonempty_target).sum() + return ( + F.binary_cross_entropy(precision, torch.ones_like(precision)) + + F.binary_cross_entropy(recall, torch.ones_like(recall)) + + F.binary_cross_entropy(spec, torch.ones_like(spec)) + ) + + +def sem_scal_loss(pred, ssc_target): + # Get softmax probabilities + pred = F.softmax(pred, dim=1) + loss = 0 + count = 0 + mask = ssc_target != 255 + n_classes = pred.shape[1] + for i in range(0, n_classes): + + # Get probability of class i + p = pred[:, i, :, :, :] + + # Remove unknown 
voxels + target_ori = ssc_target + p = p[mask] + target = ssc_target[mask] + + completion_target = torch.ones_like(target) + completion_target[target != i] = 0 + completion_target_ori = torch.ones_like(target_ori).float() + completion_target_ori[target_ori != i] = 0 + if torch.sum(completion_target) > 0: + count += 1.0 + nominator = torch.sum(p * completion_target) + loss_class = 0 + if torch.sum(p) > 0: + precision = nominator / (torch.sum(p)) + loss_precision = F.binary_cross_entropy( + precision, torch.ones_like(precision) + ) + loss_class += loss_precision + if torch.sum(completion_target) > 0: + recall = nominator / (torch.sum(completion_target)) + loss_recall = F.binary_cross_entropy(recall, torch.ones_like(recall)) + loss_class += loss_recall + if torch.sum(1 - completion_target) > 0: + specificity = torch.sum((1 - p) * (1 - completion_target)) / ( + torch.sum(1 - completion_target) + ) + loss_specificity = F.binary_cross_entropy( + specificity, torch.ones_like(specificity) + ) + loss_class += loss_specificity + loss += loss_class + return loss / count + + +def CE_ssc_loss(pred, target, class_weights): + """ + :param: prediction: the predicted tensor, must be [BS, C, H, W, D] + """ + criterion = nn.CrossEntropyLoss( + weight=class_weights, ignore_index=255, reduction="mean" + ) + loss = criterion(pred, target.long()) + + return loss diff --git a/monoscene/models/CRP3D.py b/monoscene/models/CRP3D.py new file mode 100644 index 0000000..682a00c --- /dev/null +++ b/monoscene/models/CRP3D.py @@ -0,0 +1,97 @@ +import torch +import torch.nn as nn +from monoscene.models.modules import ( + Process, + ASPP, +) + + +class CPMegaVoxels(nn.Module): + def __init__(self, feature, size, n_relations=4, bn_momentum=0.0003): + super().__init__() + self.size = size + self.n_relations = n_relations + print("n_relations", self.n_relations) + self.flatten_size = size[0] * size[1] * size[2] + self.feature = feature + self.context_feature = feature * 2 + self.flatten_context_size = (size[0] // 2) * (size[1] // 2) * (size[2] // 2) + padding = ((size[0] + 1) % 2, (size[1] + 1) % 2, (size[2] + 1) % 2) + + self.mega_context = nn.Sequential( + nn.Conv3d( + feature, self.context_feature, stride=2, padding=padding, kernel_size=3 + ), + ) + self.flatten_context_size = (size[0] // 2) * (size[1] // 2) * (size[2] // 2) + + self.context_prior_logits = nn.ModuleList( + [ + nn.Sequential( + nn.Conv3d( + self.feature, + self.flatten_context_size, + padding=0, + kernel_size=1, + ), + ) + for i in range(n_relations) + ] + ) + self.aspp = ASPP(feature, [1, 2, 3]) + + self.resize = nn.Sequential( + nn.Conv3d( + self.context_feature * self.n_relations + feature, + feature, + kernel_size=1, + padding=0, + bias=False, + ), + Process(feature, nn.BatchNorm3d, bn_momentum, dilations=[1]), + ) + + def forward(self, input): + ret = {} + bs = input.shape[0] + + x_agg = self.aspp(input) + + # get the mega context + x_mega_context_raw = self.mega_context(x_agg) + x_mega_context = x_mega_context_raw.reshape(bs, self.context_feature, -1) + x_mega_context = x_mega_context.permute(0, 2, 1) + + # get context prior map + x_context_prior_logits = [] + x_context_rels = [] + for rel in range(self.n_relations): + + # Compute the relation matrices + x_context_prior_logit = self.context_prior_logits[rel](x_agg) + x_context_prior_logit = x_context_prior_logit.reshape( + bs, self.flatten_context_size, self.flatten_size + ) + x_context_prior_logits.append(x_context_prior_logit.unsqueeze(1)) + + x_context_prior_logit = x_context_prior_logit.permute(0, 
2, 1) + x_context_prior = torch.sigmoid(x_context_prior_logit) + + # Multiply the relation matrices with the mega context to gather context features + x_context_rel = torch.bmm(x_context_prior, x_mega_context) # bs, N, f + x_context_rels.append(x_context_rel) + + x_context = torch.cat(x_context_rels, dim=2) + x_context = x_context.permute(0, 2, 1) + x_context = x_context.reshape( + bs, x_context.shape[1], self.size[0], self.size[1], self.size[2] + ) + + x = torch.cat([input, x_context], dim=1) + x = self.resize(x) + + x_context_prior_logits = torch.cat(x_context_prior_logits, dim=1) + ret["P_logits"] = x_context_prior_logits + ret["x"] = x + + return ret diff --git a/monoscene/models/DDR.py b/monoscene/models/DDR.py new file mode 100644 index 0000000..0d0928c --- /dev/null +++ b/monoscene/models/DDR.py @@ -0,0 +1,139 @@ +""" +Most of the code in this file is taken from https://github.com/waterljwant/SSC/blob/master/models/DDR.py +""" + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class SimpleRB(nn.Module): + def __init__(self, in_channel, norm_layer, bn_momentum): + super(SimpleRB, self).__init__() + self.path = nn.Sequential( + nn.Conv3d(in_channel, in_channel, kernel_size=3, padding=1, bias=False), + norm_layer(in_channel, momentum=bn_momentum), + nn.ReLU(), + nn.Conv3d(in_channel, in_channel, kernel_size=3, padding=1, bias=False), + norm_layer(in_channel, momentum=bn_momentum), + ) + self.relu = nn.ReLU() + + def forward(self, x): + residual = x + conv_path = self.path(x) + out = residual + conv_path + out = self.relu(out) + return out + + +""" +3D Residual Block,3x3x3 conv ==> 3 smaller 3D conv, refered from DDRNet +""" + + +class Bottleneck3D(nn.Module): + def __init__( + self, + inplanes, + planes, + norm_layer, + stride=1, + dilation=[1, 1, 1], + expansion=4, + downsample=None, + fist_dilation=1, + multi_grid=1, + bn_momentum=0.0003, + ): + super(Bottleneck3D, self).__init__() + # often,planes = inplanes // 4 + self.expansion = expansion + self.conv1 = nn.Conv3d(inplanes, planes, kernel_size=1, bias=False) + self.bn1 = norm_layer(planes, momentum=bn_momentum) + self.conv2 = nn.Conv3d( + planes, + planes, + kernel_size=(1, 1, 3), + stride=(1, 1, stride), + dilation=(1, 1, dilation[0]), + padding=(0, 0, dilation[0]), + bias=False, + ) + self.bn2 = norm_layer(planes, momentum=bn_momentum) + self.conv3 = nn.Conv3d( + planes, + planes, + kernel_size=(1, 3, 1), + stride=(1, stride, 1), + dilation=(1, dilation[1], 1), + padding=(0, dilation[1], 0), + bias=False, + ) + self.bn3 = norm_layer(planes, momentum=bn_momentum) + self.conv4 = nn.Conv3d( + planes, + planes, + kernel_size=(3, 1, 1), + stride=(stride, 1, 1), + dilation=(dilation[2], 1, 1), + padding=(dilation[2], 0, 0), + bias=False, + ) + self.bn4 = norm_layer(planes, momentum=bn_momentum) + self.conv5 = nn.Conv3d( + planes, planes * self.expansion, kernel_size=(1, 1, 1), bias=False + ) + self.bn5 = norm_layer(planes * self.expansion, momentum=bn_momentum) + + self.relu = nn.ReLU(inplace=False) + self.relu_inplace = nn.ReLU(inplace=True) + self.downsample = downsample + self.dilation = dilation + self.stride = stride + + self.downsample2 = nn.Sequential( + nn.AvgPool3d(kernel_size=(1, stride, 1), stride=(1, stride, 1)), + nn.Conv3d(planes, planes, kernel_size=1, stride=1, bias=False), + norm_layer(planes, momentum=bn_momentum), + ) + self.downsample3 = nn.Sequential( + nn.AvgPool3d(kernel_size=(stride, 1, 1), stride=(stride, 1, 1)), + nn.Conv3d(planes, planes, kernel_size=1, stride=1, bias=False), + 
norm_layer(planes, momentum=bn_momentum), + ) + self.downsample4 = nn.Sequential( + nn.AvgPool3d(kernel_size=(stride, 1, 1), stride=(stride, 1, 1)), + nn.Conv3d(planes, planes, kernel_size=1, stride=1, bias=False), + norm_layer(planes, momentum=bn_momentum), + ) + + def forward(self, x): + residual = x + + out1 = self.relu(self.bn1(self.conv1(x))) + out2 = self.bn2(self.conv2(out1)) + out2_relu = self.relu(out2) + + out3 = self.bn3(self.conv3(out2_relu)) + if self.stride != 1: + out2 = self.downsample2(out2) + out3 = out3 + out2 + out3_relu = self.relu(out3) + + out4 = self.bn4(self.conv4(out3_relu)) + if self.stride != 1: + out2 = self.downsample3(out2) + out3 = self.downsample4(out3) + out4 = out4 + out2 + out3 + + out4_relu = self.relu(out4) + out5 = self.bn5(self.conv5(out4_relu)) + + if self.downsample is not None: + residual = self.downsample(x) + + out = out5 + residual + out_relu = self.relu(out) + + return out_relu diff --git a/monoscene/models/flosp.py b/monoscene/models/flosp.py new file mode 100644 index 0000000..2d50219 --- /dev/null +++ b/monoscene/models/flosp.py @@ -0,0 +1,41 @@ +import torch +import torch.nn as nn + + +class FLoSP(nn.Module): + def __init__(self, scene_size, dataset, project_scale): + super().__init__() + self.scene_size = scene_size + self.dataset = dataset + self.project_scale = project_scale + + def forward(self, x2d, projected_pix, fov_mask): + c, h, w = x2d.shape + + src = x2d.view(c, -1) + zeros_vec = torch.zeros(c, 1).type_as(src) + src = torch.cat([src, zeros_vec], 1) + + pix_x, pix_y = projected_pix[:, 0], projected_pix[:, 1] + img_indices = pix_y * w + pix_x + img_indices[~fov_mask] = h * w + img_indices = img_indices.expand(c, -1).long() # c, HWD + src_feature = torch.gather(src, 1, img_indices) + + if self.dataset == "NYU": + x3d = src_feature.reshape( + c, + self.scene_size[0] // self.project_scale, + self.scene_size[2] // self.project_scale, + self.scene_size[1] // self.project_scale, + ) + x3d = x3d.permute(0, 1, 3, 2) + elif self.dataset == "kitti": + x3d = src_feature.reshape( + c, + self.scene_size[0] // self.project_scale, + self.scene_size[1] // self.project_scale, + self.scene_size[2] // self.project_scale, + ) + + return x3d diff --git a/monoscene/models/modules.py b/monoscene/models/modules.py new file mode 100644 index 0000000..42a892c --- /dev/null +++ b/monoscene/models/modules.py @@ -0,0 +1,194 @@ +import torch +import torch.nn as nn +from monoscene.models.DDR import Bottleneck3D + + +class ASPP(nn.Module): + """ + ASPP 3D + Adapt from https://github.com/cv-rits/LMSCNet/blob/main/LMSCNet/models/LMSCNet.py#L7 + """ + + def __init__(self, planes, dilations_conv_list): + super().__init__() + + # ASPP Block + self.conv_list = dilations_conv_list + self.conv1 = nn.ModuleList( + [ + nn.Conv3d( + planes, planes, kernel_size=3, padding=dil, dilation=dil, bias=False + ) + for dil in dilations_conv_list + ] + ) + self.bn1 = nn.ModuleList( + [nn.BatchNorm3d(planes) for dil in dilations_conv_list] + ) + self.conv2 = nn.ModuleList( + [ + nn.Conv3d( + planes, planes, kernel_size=3, padding=dil, dilation=dil, bias=False + ) + for dil in dilations_conv_list + ] + ) + self.bn2 = nn.ModuleList( + [nn.BatchNorm3d(planes) for dil in dilations_conv_list] + ) + self.relu = nn.ReLU() + + def forward(self, x_in): + + y = self.bn2[0](self.conv2[0](self.relu(self.bn1[0](self.conv1[0](x_in))))) + for i in range(1, len(self.conv_list)): + y += self.bn2[i](self.conv2[i](self.relu(self.bn1[i](self.conv1[i](x_in))))) + x_in = self.relu(y + x_in) # modified + + 
return x_in + + +class SegmentationHead(nn.Module): + """ + 3D Segmentation heads to retrieve semantic segmentation at each scale. + Formed by Dim expansion, Conv3D, ASPP block, Conv3D. + Taken from https://github.com/cv-rits/LMSCNet/blob/main/LMSCNet/models/LMSCNet.py#L7 + """ + + def __init__(self, inplanes, planes, nbr_classes, dilations_conv_list): + super().__init__() + + # First convolution + self.conv0 = nn.Conv3d(inplanes, planes, kernel_size=3, padding=1, stride=1) + + # ASPP Block + self.conv_list = dilations_conv_list + self.conv1 = nn.ModuleList( + [ + nn.Conv3d( + planes, planes, kernel_size=3, padding=dil, dilation=dil, bias=False + ) + for dil in dilations_conv_list + ] + ) + self.bn1 = nn.ModuleList( + [nn.BatchNorm3d(planes) for dil in dilations_conv_list] + ) + self.conv2 = nn.ModuleList( + [ + nn.Conv3d( + planes, planes, kernel_size=3, padding=dil, dilation=dil, bias=False + ) + for dil in dilations_conv_list + ] + ) + self.bn2 = nn.ModuleList( + [nn.BatchNorm3d(planes) for dil in dilations_conv_list] + ) + self.relu = nn.ReLU() + + self.conv_classes = nn.Conv3d( + planes, nbr_classes, kernel_size=3, padding=1, stride=1 + ) + + def forward(self, x_in): + + # Convolution to go from inplanes to planes features... + x_in = self.relu(self.conv0(x_in)) + + y = self.bn2[0](self.conv2[0](self.relu(self.bn1[0](self.conv1[0](x_in))))) + for i in range(1, len(self.conv_list)): + y += self.bn2[i](self.conv2[i](self.relu(self.bn1[i](self.conv1[i](x_in))))) + x_in = self.relu(y + x_in) # modified + + x_in = self.conv_classes(x_in) + + return x_in + + +class ProcessKitti(nn.Module): + def __init__(self, feature, norm_layer, bn_momentum, dilations=[1, 2, 3]): + super(Process, self).__init__() + self.main = nn.Sequential( + *[ + Bottleneck3D( + feature, + feature // 4, + bn_momentum=bn_momentum, + norm_layer=norm_layer, + dilation=[i, i, i], + ) + for i in dilations + ] + ) + + def forward(self, x): + return self.main(x) + + +class Process(nn.Module): + def __init__(self, feature, norm_layer, bn_momentum, dilations=[1, 2, 3]): + super(Process, self).__init__() + self.main = nn.Sequential( + *[ + Bottleneck3D( + feature, + feature // 4, + bn_momentum=bn_momentum, + norm_layer=norm_layer, + dilation=[i, i, i], + ) + for i in dilations + ] + ) + + def forward(self, x): + return self.main(x) + + +class Upsample(nn.Module): + def __init__(self, in_channels, out_channels, norm_layer, bn_momentum): + super(Upsample, self).__init__() + self.main = nn.Sequential( + nn.ConvTranspose3d( + in_channels, + out_channels, + kernel_size=3, + stride=2, + padding=1, + dilation=1, + output_padding=1, + ), + norm_layer(out_channels, momentum=bn_momentum), + nn.ReLU(), + ) + + def forward(self, x): + return self.main(x) + + +class Downsample(nn.Module): + def __init__(self, feature, norm_layer, bn_momentum, expansion=8): + super(Downsample, self).__init__() + self.main = Bottleneck3D( + feature, + feature // 4, + bn_momentum=bn_momentum, + expansion=expansion, + stride=2, + downsample=nn.Sequential( + nn.AvgPool3d(kernel_size=2, stride=2), + nn.Conv3d( + feature, + int(feature * expansion / 4), + kernel_size=1, + stride=1, + bias=False, + ), + norm_layer(int(feature * expansion / 4), momentum=bn_momentum), + ), + norm_layer=norm_layer, + ) + + def forward(self, x): + return self.main(x) diff --git a/monoscene/models/monoscene.py b/monoscene/models/monoscene.py new file mode 100644 index 0000000..d85730d --- /dev/null +++ b/monoscene/models/monoscene.py @@ -0,0 +1,290 @@ +import pytorch_lightning as pl 
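Before the MonoScene LightningModule below, a toy shape walk-through of the FLoSP projection and SegmentationHead defined above; every size here is illustrative, not the actual KITTI resolution:

import torch
from monoscene.models.flosp import FLoSP
from monoscene.models.modules import SegmentationHead

# Hedged sketch with toy sizes; FLoSP gathers one 2D feature vector per voxel.
scene_size, project_scale = (8, 8, 4), 1        # toy grid: N = 8 * 8 * 4 = 256 voxels
flosp = FLoSP(scene_size, dataset="kitti", project_scale=project_scale)

c, h, w = 16, 10, 12
x2d = torch.randn(c, h, w)                      # 2D feature map for one image
N = 8 * 8 * 4
projected_pix = torch.stack(
    [torch.randint(0, w, (N,)), torch.randint(0, h, (N,))], dim=1
)                                                # (N, 2) pixel coordinates per voxel
fov_mask = torch.rand(N) > 0.1                  # voxels outside the FOV receive zero features
x3d = flosp(x2d, projected_pix, fov_mask)       # (c, 8, 8, 4)

# SegmentationHead keeps the spatial resolution and outputs per-class logits.
head = SegmentationHead(inplanes=c, planes=8, nbr_classes=20, dilations_conv_list=[1, 2, 3])
logits = head(x3d.unsqueeze(0))                 # (1, 20, 8, 8, 4)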
+import torch +import torch.nn as nn +from monoscene.models.unet3d_nyu import UNet3D as UNet3DNYU +from monoscene.models.unet3d_kitti import UNet3D as UNet3DKitti +from monoscene.loss.sscMetrics import SSCMetrics +from monoscene.loss.ssc_loss import sem_scal_loss, CE_ssc_loss, KL_sep, geo_scal_loss +from monoscene.models.flosp import FLoSP +from monoscene.loss.CRP_loss import compute_super_CP_multilabel_loss +import numpy as np +import torch.nn.functional as F +from monoscene.models.unet2d import UNet2D +from torch.optim.lr_scheduler import MultiStepLR + + +class MonoScene(pl.LightningModule): + def __init__( + self, + n_classes, + class_names, + feature, + class_weights, + project_scale, + full_scene_size, + dataset, + n_relations=4, + context_prior=True, + fp_loss=True, + project_res=[], + frustum_size=4, + relation_loss=False, + CE_ssc_loss=True, + geo_scal_loss=True, + sem_scal_loss=True, + lr=1e-4, + weight_decay=1e-4, + ): + super().__init__() + + self.project_res = project_res + self.fp_loss = fp_loss + self.dataset = dataset + self.context_prior = context_prior + self.frustum_size = frustum_size + self.class_names = class_names + self.relation_loss = relation_loss + self.CE_ssc_loss = CE_ssc_loss + self.sem_scal_loss = sem_scal_loss + self.geo_scal_loss = geo_scal_loss + self.project_scale = project_scale + self.class_weights = class_weights + self.lr = lr + self.weight_decay = weight_decay + + self.projects = {} + self.scale_2ds = [1, 2, 4, 8] # 2D scales + for scale_2d in self.scale_2ds: + self.projects[str(scale_2d)] = FLoSP( + full_scene_size, project_scale=self.project_scale, dataset=self.dataset + ) + self.projects = nn.ModuleDict(self.projects) + + self.n_classes = n_classes + if self.dataset == "NYU": + self.net_3d_decoder = UNet3DNYU( + self.n_classes, + nn.BatchNorm3d, + n_relations=n_relations, + feature=feature, + full_scene_size=full_scene_size, + context_prior=context_prior, + ) + elif self.dataset == "kitti": + self.net_3d_decoder = UNet3DKitti( + self.n_classes, + nn.BatchNorm3d, + project_scale=project_scale, + feature=feature, + full_scene_size=full_scene_size, + context_prior=context_prior, + ) + self.net_rgb = UNet2D.build(out_feature=feature, use_decoder=True) + + # log hyperparameters + self.save_hyperparameters() + + self.train_metrics = SSCMetrics(self.n_classes) + self.val_metrics = SSCMetrics(self.n_classes) + self.test_metrics = SSCMetrics(self.n_classes) + + def forward(self, batch): + + img = batch["img"] + bs = len(img) + + out = {} + + x_rgb = self.net_rgb(img) + + x3ds = [] + for i in range(bs): + x3d = None + for scale_2d in self.project_res: + + # project features at each 2D scale to target 3D scale + scale_2d = int(scale_2d) + projected_pix = batch["projected_pix_{}".format(self.project_scale)][i].cuda() + fov_mask = batch["fov_mask_{}".format(self.project_scale)][i].cuda() + + # Sum all the 3D features + if x3d is None: + x3d = self.projects[str(scale_2d)]( + x_rgb["1_" + str(scale_2d)][i], + projected_pix // scale_2d, + fov_mask, + ) + else: + x3d += self.projects[str(scale_2d)]( + x_rgb["1_" + str(scale_2d)][i], + projected_pix // scale_2d, + fov_mask, + ) + x3ds.append(x3d) + + input_dict = { + "x3d": torch.stack(x3ds), + } + + out = self.net_3d_decoder(input_dict) + + return out + + def step(self, batch, step_type, metric): + bs = len(batch["img"]) + loss = 0 + out_dict = self(batch) + ssc_pred = out_dict["ssc_logit"] + target = batch["target"] + + if self.context_prior: + P_logits = out_dict["P_logits"] + CP_mega_matrices = 
batch["CP_mega_matrices"] + + if self.relation_loss: + loss_rel_ce = compute_super_CP_multilabel_loss( + P_logits, CP_mega_matrices + ) + loss += loss_rel_ce + self.log( + step_type + "/loss_relation_ce_super", + loss_rel_ce.detach(), + on_epoch=True, + sync_dist=True, + ) + + class_weight = self.class_weights.type_as(batch["img"]) + if self.CE_ssc_loss: + loss_ssc = CE_ssc_loss(ssc_pred, target, class_weight) + loss += loss_ssc + self.log( + step_type + "/loss_ssc", + loss_ssc.detach(), + on_epoch=True, + sync_dist=True, + ) + + if self.sem_scal_loss: + loss_sem_scal = sem_scal_loss(ssc_pred, target) + loss += loss_sem_scal + self.log( + step_type + "/loss_sem_scal", + loss_sem_scal.detach(), + on_epoch=True, + sync_dist=True, + ) + + if self.geo_scal_loss: + loss_geo_scal = geo_scal_loss(ssc_pred, target) + loss += loss_geo_scal + self.log( + step_type + "/loss_geo_scal", + loss_geo_scal.detach(), + on_epoch=True, + sync_dist=True, + ) + + if self.fp_loss and step_type != "test": + frustums_masks = torch.stack(batch["frustums_masks"]) + frustums_class_dists = torch.stack( + batch["frustums_class_dists"] + ).float() # (bs, n_frustums, n_classes) + n_frustums = frustums_class_dists.shape[1] + + pred_prob = F.softmax(ssc_pred, dim=1) + batch_cnt = frustums_class_dists.sum(0) # (n_frustums, n_classes) + + frustum_loss = 0 + frustum_nonempty = 0 + for frus in range(n_frustums): + frustum_mask = frustums_masks[:, frus, :, :, :].unsqueeze(1).float() + prob = frustum_mask * pred_prob # bs, n_classes, H, W, D + prob = prob.reshape(bs, self.n_classes, -1).permute(1, 0, 2) + prob = prob.reshape(self.n_classes, -1) + cum_prob = prob.sum(dim=1) # n_classes + + total_cnt = torch.sum(batch_cnt[frus]) + total_prob = prob.sum() + if total_prob > 0 and total_cnt > 0: + frustum_target_proportion = batch_cnt[frus] / total_cnt + cum_prob = cum_prob / total_prob # n_classes + frustum_loss_i = KL_sep(cum_prob, frustum_target_proportion) + frustum_loss += frustum_loss_i + frustum_nonempty += 1 + frustum_loss = frustum_loss / frustum_nonempty + loss += frustum_loss + self.log( + step_type + "/loss_frustums", + frustum_loss.detach(), + on_epoch=True, + sync_dist=True, + ) + + y_true = target.cpu().numpy() + y_pred = ssc_pred.detach().cpu().numpy() + y_pred = np.argmax(y_pred, axis=1) + metric.add_batch(y_pred, y_true) + + self.log(step_type + "/loss", loss.detach(), on_epoch=True, sync_dist=True) + + return loss + + def training_step(self, batch, batch_idx): + return self.step(batch, "train", self.train_metrics) + + def validation_step(self, batch, batch_idx): + self.step(batch, "val", self.val_metrics) + + def validation_epoch_end(self, outputs): + metric_list = [("train", self.train_metrics), ("val", self.val_metrics)] + + for prefix, metric in metric_list: + stats = metric.get_stats() + for i, class_name in enumerate(self.class_names): + self.log( + "{}_SemIoU/{}".format(prefix, class_name), + stats["iou_ssc"][i], + sync_dist=True, + ) + self.log("{}/mIoU".format(prefix), stats["iou_ssc_mean"], sync_dist=True) + self.log("{}/IoU".format(prefix), stats["iou"], sync_dist=True) + self.log("{}/Precision".format(prefix), stats["precision"], sync_dist=True) + self.log("{}/Recall".format(prefix), stats["recall"], sync_dist=True) + metric.reset() + + def test_step(self, batch, batch_idx): + self.step(batch, "test", self.test_metrics) + + def test_epoch_end(self, outputs): + classes = self.class_names + metric_list = [("test", self.test_metrics)] + for prefix, metric in metric_list: + print("{}======".format(prefix)) + 
stats = metric.get_stats() + print( + "Precision={:.4f}, Recall={:.4f}, IoU={:.4f}".format( + stats["precision"] * 100, stats["recall"] * 100, stats["iou"] * 100 + ) + ) + print("class IoU: {}, ".format(classes)) + print( + " ".join(["{:.4f}, "] * len(classes)).format( + *(stats["iou_ssc"] * 100).tolist() + ) + ) + print("mIoU={:.4f}".format(stats["iou_ssc_mean"] * 100)) + metric.reset() + + def configure_optimizers(self): + if self.dataset == "NYU": + optimizer = torch.optim.AdamW( + self.parameters(), lr=self.lr, weight_decay=self.weight_decay + ) + scheduler = MultiStepLR(optimizer, milestones=[20], gamma=0.1) + return [optimizer], [scheduler] + elif self.dataset == "kitti": + optimizer = torch.optim.AdamW( + self.parameters(), lr=self.lr, weight_decay=self.weight_decay + ) + scheduler = MultiStepLR(optimizer, milestones=[20], gamma=0.1) + return [optimizer], [scheduler] diff --git a/monoscene/models/unet2d.py b/monoscene/models/unet2d.py new file mode 100644 index 0000000..34cd7c8 --- /dev/null +++ b/monoscene/models/unet2d.py @@ -0,0 +1,194 @@ +""" +Code adapted from https://github.com/shariqfarooq123/AdaBins/blob/main/models/unet_adaptive_bins.py +""" +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class UpSampleBN(nn.Module): + def __init__(self, skip_input, output_features): + super(UpSampleBN, self).__init__() + self._net = nn.Sequential( + nn.Conv2d(skip_input, output_features, kernel_size=3, stride=1, padding=1), + nn.BatchNorm2d(output_features), + nn.LeakyReLU(), + nn.Conv2d( + output_features, output_features, kernel_size=3, stride=1, padding=1 + ), + nn.BatchNorm2d(output_features), + nn.LeakyReLU(), + ) + + def forward(self, x, concat_with): + up_x = F.interpolate( + x, + size=(concat_with.shape[2], concat_with.shape[3]), + mode="bilinear", + align_corners=True, + ) + f = torch.cat([up_x, concat_with], dim=1) + return self._net(f) + + +class DecoderBN(nn.Module): + def __init__( + self, num_features, bottleneck_features, out_feature, use_decoder=True + ): + super(DecoderBN, self).__init__() + features = int(num_features) + self.use_decoder = use_decoder + + self.conv2 = nn.Conv2d( + bottleneck_features, features, kernel_size=1, stride=1, padding=1 + ) + + self.out_feature_1_1 = out_feature + self.out_feature_1_2 = out_feature + self.out_feature_1_4 = out_feature + self.out_feature_1_8 = out_feature + self.out_feature_1_16 = out_feature + self.feature_1_16 = features // 2 + self.feature_1_8 = features // 4 + self.feature_1_4 = features // 8 + self.feature_1_2 = features // 16 + self.feature_1_1 = features // 32 + + if self.use_decoder: + self.resize_output_1_1 = nn.Conv2d( + self.feature_1_1, self.out_feature_1_1, kernel_size=1 + ) + self.resize_output_1_2 = nn.Conv2d( + self.feature_1_2, self.out_feature_1_2, kernel_size=1 + ) + self.resize_output_1_4 = nn.Conv2d( + self.feature_1_4, self.out_feature_1_4, kernel_size=1 + ) + self.resize_output_1_8 = nn.Conv2d( + self.feature_1_8, self.out_feature_1_8, kernel_size=1 + ) + self.resize_output_1_16 = nn.Conv2d( + self.feature_1_16, self.out_feature_1_16, kernel_size=1 + ) + + self.up16 = UpSampleBN( + skip_input=features + 224, output_features=self.feature_1_16 + ) + self.up8 = UpSampleBN( + skip_input=self.feature_1_16 + 80, output_features=self.feature_1_8 + ) + self.up4 = UpSampleBN( + skip_input=self.feature_1_8 + 48, output_features=self.feature_1_4 + ) + self.up2 = UpSampleBN( + skip_input=self.feature_1_4 + 32, output_features=self.feature_1_2 + ) + self.up1 = UpSampleBN( + 
skip_input=self.feature_1_2 + 3, output_features=self.feature_1_1 + ) + else: + self.resize_output_1_1 = nn.Conv2d(3, out_feature, kernel_size=1) + self.resize_output_1_2 = nn.Conv2d(32, out_feature * 2, kernel_size=1) + self.resize_output_1_4 = nn.Conv2d(48, out_feature * 4, kernel_size=1) + + def forward(self, features): + x_block0, x_block1, x_block2, x_block3, x_block4 = ( + features[4], + features[5], + features[6], + features[8], + features[11], + ) + bs = x_block0.shape[0] + x_d0 = self.conv2(x_block4) + + if self.use_decoder: + x_1_16 = self.up16(x_d0, x_block3) + x_1_8 = self.up8(x_1_16, x_block2) + x_1_4 = self.up4(x_1_8, x_block1) + x_1_2 = self.up2(x_1_4, x_block0) + x_1_1 = self.up1(x_1_2, features[0]) + return { + "1_1": self.resize_output_1_1(x_1_1), + "1_2": self.resize_output_1_2(x_1_2), + "1_4": self.resize_output_1_4(x_1_4), + "1_8": self.resize_output_1_8(x_1_8), + "1_16": self.resize_output_1_16(x_1_16), + } + else: + x_1_1 = features[0] + x_1_2, x_1_4, x_1_8, x_1_16 = ( + features[4], + features[5], + features[6], + features[8], + ) + x_global = features[-1].reshape(bs, 2560, -1).mean(2) + return { + "1_1": self.resize_output_1_1(x_1_1), + "1_2": self.resize_output_1_2(x_1_2), + "1_4": self.resize_output_1_4(x_1_4), + "global": x_global, + } + + +class Encoder(nn.Module): + def __init__(self, backend): + super(Encoder, self).__init__() + self.original_model = backend + + def forward(self, x): + features = [x] + for k, v in self.original_model._modules.items(): + if k == "blocks": + for ki, vi in v._modules.items(): + features.append(vi(features[-1])) + else: + features.append(v(features[-1])) + return features + + +class UNet2D(nn.Module): + def __init__(self, backend, num_features, out_feature, use_decoder=True): + super(UNet2D, self).__init__() + self.use_decoder = use_decoder + self.encoder = Encoder(backend) + self.decoder = DecoderBN( + out_feature=out_feature, + use_decoder=use_decoder, + bottleneck_features=num_features, + num_features=num_features, + ) + + def forward(self, x, **kwargs): + encoded_feats = self.encoder(x) + unet_out = self.decoder(encoded_feats, **kwargs) + return unet_out + + def get_encoder_params(self): # lr/10 learning rate + return self.encoder.parameters() + + def get_decoder_params(self): # lr learning rate + return self.decoder.parameters() + + @classmethod + def build(cls, **kwargs): + basemodel_name = "tf_efficientnet_b7_ns" + num_features = 2560 + + print("Loading base model ()...".format(basemodel_name), end="") + basemodel = torch.hub.load( + "rwightman/gen-efficientnet-pytorch", basemodel_name, pretrained=True + ) + print("Done.") + + # Remove last layer + print("Removing last two layers (global_pool & classifier).") + basemodel.global_pool = nn.Identity() + basemodel.classifier = nn.Identity() + + # Building Encoder-Decoder model + print("Building Encoder-Decoder model..", end="") + m = cls(basemodel, num_features=num_features, **kwargs) + print("Done.") + return m diff --git a/monoscene/models/unet3d_kitti.py b/monoscene/models/unet3d_kitti.py new file mode 100644 index 0000000..a923cbb --- /dev/null +++ b/monoscene/models/unet3d_kitti.py @@ -0,0 +1,88 @@ +# encoding: utf-8 +import torch +import torch.nn as nn +import torch.nn.functional as F +from monoscene.models.modules import SegmentationHead +from monoscene.models.CRP3D import CPMegaVoxels +from monoscene.models.modules import Process, Upsample, Downsample + + +class UNet3D(nn.Module): + def __init__( + self, + class_num, + norm_layer, + full_scene_size, + feature, + 
project_scale, + context_prior=None, + bn_momentum=0.1, + ): + super(UNet3D, self).__init__() + self.business_layer = [] + self.project_scale = project_scale + self.full_scene_size = full_scene_size + self.feature = feature + + size_l1 = ( + int(self.full_scene_size[0] / project_scale), + int(self.full_scene_size[1] / project_scale), + int(self.full_scene_size[2] / project_scale), + ) + size_l2 = (size_l1[0] // 2, size_l1[1] // 2, size_l1[2] // 2) + size_l3 = (size_l2[0] // 2, size_l2[1] // 2, size_l2[2] // 2) + + dilations = [1, 2, 3] + self.process_l1 = nn.Sequential( + Process(self.feature, norm_layer, bn_momentum, dilations=[1, 2, 3]), + Downsample(self.feature, norm_layer, bn_momentum), + ) + self.process_l2 = nn.Sequential( + Process(self.feature * 2, norm_layer, bn_momentum, dilations=[1, 2, 3]), + Downsample(self.feature * 2, norm_layer, bn_momentum), + ) + + self.up_13_l2 = Upsample( + self.feature * 4, self.feature * 2, norm_layer, bn_momentum + ) + self.up_12_l1 = Upsample( + self.feature * 2, self.feature, norm_layer, bn_momentum + ) + self.up_l1_lfull = Upsample( + self.feature, self.feature // 2, norm_layer, bn_momentum + ) + + self.ssc_head = SegmentationHead( + self.feature // 2, self.feature // 2, class_num, dilations + ) + + self.context_prior = context_prior + if context_prior: + self.CP_mega_voxels = CPMegaVoxels( + self.feature * 4, self.feature * 4, size_l3, bn_momentum=bn_momentum + ) + + def forward(self, input_dict): + res = {} + + x3d_l1 = input_dict["x3d"] + + x3d_l2 = self.process_l1(x3d_l1) + + x3d_l3 = self.process_l2(x3d_l2) + + if self.context_prior: + ret = self.CP_mega_voxels(x3d_l3) + x3d_l3 = ret["x"] + for k in ret.keys(): + res[k] = ret[k] + + x3d_up_l2 = self.up_13_l2(x3d_l3) + x3d_l2 + x3d_up_l1 = self.up_12_l1(x3d_up_l2) + x3d_l1 + x3d_up_lfull = self.up_l1_lfull(x3d_up_l1) + + ssc_logit_full = self.ssc_head(x3d_up_lfull) + + res["ssc_logit"] = ssc_logit_full + + return res diff --git a/monoscene/models/unet3d_nyu.py b/monoscene/models/unet3d_nyu.py new file mode 100644 index 0000000..a052f2f --- /dev/null +++ b/monoscene/models/unet3d_nyu.py @@ -0,0 +1,91 @@ +# encoding: utf-8 +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np +from monoscene.models.CRP3D import CPMegaVoxels +from monoscene.models.modules import ( + Process, + Upsample, + Downsample, + SegmentationHead, + ASPP, +) + + +class UNet3D(nn.Module): + def __init__( + self, + class_num, + norm_layer, + feature, + full_scene_size, + n_relations=4, + project_res=[], + context_prior=True, + bn_momentum=0.1, + ): + super(UNet3D, self).__init__() + self.business_layer = [] + self.project_res = project_res + + self.feature_1_4 = feature + self.feature_1_8 = feature * 2 + self.feature_1_16 = feature * 4 + + self.feature_1_16_dec = self.feature_1_16 + self.feature_1_8_dec = self.feature_1_8 + self.feature_1_4_dec = self.feature_1_4 + + self.process_1_4 = nn.Sequential( + Process(self.feature_1_4, norm_layer, bn_momentum, dilations=[1, 2, 3]), + Downsample(self.feature_1_4, norm_layer, bn_momentum), + ) + self.process_1_8 = nn.Sequential( + Process(self.feature_1_8, norm_layer, bn_momentum, dilations=[1, 2, 3]), + Downsample(self.feature_1_8, norm_layer, bn_momentum), + ) + self.up_1_16_1_8 = Upsample( + self.feature_1_16_dec, self.feature_1_8_dec, norm_layer, bn_momentum + ) + self.up_1_8_1_4 = Upsample( + self.feature_1_8_dec, self.feature_1_4_dec, norm_layer, bn_momentum + ) + self.ssc_head_1_4 = SegmentationHead( + self.feature_1_4_dec, 
self.feature_1_4_dec, class_num, [1, 2, 3] + ) + + self.context_prior = context_prior + size_1_16 = tuple(np.ceil(i / 4).astype(int) for i in full_scene_size) + + if context_prior: + self.CP_mega_voxels = CPMegaVoxels( + self.feature_1_16, + self.feature_1_16, + size_1_16, + n_relations=n_relations, + bn_momentum=bn_momentum, + ) + + # + def forward(self, input_dict): + res = {} + + x3d_1_4 = input_dict["x3d"] + x3d_1_8 = self.process_1_4(x3d_1_4) + x3d_1_16 = self.process_1_8(x3d_1_8) + + if self.context_prior: + ret = self.CP_mega_voxels(x3d_1_16) + x3d_1_16 = ret["x"] + for k in ret.keys(): + res[k] = ret[k] + + x3d_up_1_8 = self.up_1_16_1_8(x3d_1_16) + x3d_1_8 + x3d_up_1_4 = self.up_1_8_1_4(x3d_up_1_8) + x3d_1_4 + + ssc_logit_1_4 = self.ssc_head_1_4(x3d_up_1_4) + + res["ssc_logit"] = ssc_logit_1_4 + + return res diff --git a/monoscene/scripts/eval_monoscene.py b/monoscene/scripts/eval_monoscene.py new file mode 100644 index 0000000..155b4e0 --- /dev/null +++ b/monoscene/scripts/eval_monoscene.py @@ -0,0 +1,71 @@ +from pytorch_lightning import Trainer +from monoscene.models.monoscene import MonoScene +from monoscene.data.NYU.nyu_dm import NYUDataModule +from monoscene.data.semantic_kitti.kitti_dm import KittiDataModule +import hydra +from omegaconf import DictConfig +import torch +import os +from hydra.utils import get_original_cwd + + +@hydra.main(config_name="../config/monoscene.yaml") +def main(config: DictConfig): + torch.set_grad_enabled(False) + if config.dataset == "kitti": + config.batch_size = 1 + n_classes = 20 + feature = 64 + project_scale = 2 + full_scene_size = (256, 256, 32) + data_module = KittiDataModule( + root=config.kitti_root, + preprocess_root=config.kitti_preprocess_root, + frustum_size=config.frustum_size, + batch_size=int(config.batch_size / config.n_gpus), + num_workers=int(config.num_workers_per_gpu * config.n_gpus), + ) + + elif config.dataset == "NYU": + config.batch_size = 2 + project_scale = 1 + n_classes = 12 + feature = 200 + full_scene_size = (60, 36, 60) + data_module = NYUDataModule( + root=config.NYU_root, + preprocess_root=config.NYU_preprocess_root, + n_relations=config.n_relations, + frustum_size=config.frustum_size, + batch_size=int(config.batch_size / config.n_gpus), + num_workers=int(config.num_workers_per_gpu * config.n_gpus), + ) + + trainer = Trainer( + sync_batchnorm=True, deterministic=True, gpus=config.n_gpus, accelerator="ddp" + ) + + if config.dataset == "NYU": + model_path = os.path.join( + get_original_cwd(), "trained_models", "monoscene_nyu.ckpt" + ) + else: + model_path = os.path.join( + get_original_cwd(), "trained_models", "monoscene_kitti.ckpt" + ) + + model = MonoScene.load_from_checkpoint( + model_path, + feature=feature, + project_scale=project_scale, + fp_loss=config.fp_loss, + full_scene_size=full_scene_size, + ) + model.eval() + data_module.setup() + val_dataloader = data_module.val_dataloader() + trainer.test(model, test_dataloaders=val_dataloader) + + +if __name__ == "__main__": + main() diff --git a/monoscene/scripts/generate_output.py b/monoscene/scripts/generate_output.py new file mode 100644 index 0000000..892f8cd --- /dev/null +++ b/monoscene/scripts/generate_output.py @@ -0,0 +1,127 @@ +from pytorch_lightning import Trainer +from monoscene.models.monoscene import MonoScene +from monoscene.data.NYU.nyu_dm import NYUDataModule +from monoscene.data.semantic_kitti.kitti_dm import KittiDataModule +from monoscene.data.kitti_360.kitti_360_dm import Kitti360DataModule +import hydra +from omegaconf import DictConfig +import 
torch +import numpy as np +import os +from hydra.utils import get_original_cwd +from tqdm import tqdm +import pickle + + +@hydra.main(config_name="../config/monoscene.yaml") +def main(config: DictConfig): + torch.set_grad_enabled(False) + + # Setup dataloader + if config.dataset == "kitti" or config.dataset == "kitti_360": + feature = 64 + project_scale = 2 + full_scene_size = (256, 256, 32) + + if config.dataset == "kitti": + data_module = KittiDataModule( + root=config.kitti_root, + preprocess_root=config.kitti_preprocess_root, + frustum_size=config.frustum_size, + batch_size=int(config.batch_size / config.n_gpus), + num_workers=int(config.num_workers_per_gpu * config.n_gpus), + ) + data_module.setup() + data_loader = data_module.val_dataloader() + # data_loader = data_module.test_dataloader() # use this if you want to infer on test set + else: + data_module = Kitti360DataModule( + root=config.kitti_360_root, + sequences=[config.kitti_360_sequence], + n_scans=2000, + batch_size=1, + num_workers=3, + ) + data_module.setup() + data_loader = data_module.dataloader() + + elif config.dataset == "NYU": + project_scale = 1 + feature = 200 + full_scene_size = (60, 36, 60) + data_module = NYUDataModule( + root=config.NYU_root, + preprocess_root=config.NYU_preprocess_root, + n_relations=config.n_relations, + frustum_size=config.frustum_size, + batch_size=int(config.batch_size / config.n_gpus), + num_workers=int(config.num_workers_per_gpu * config.n_gpus), + ) + data_module.setup() + data_loader = data_module.val_dataloader() + # data_loader = data_module.test_dataloader() # use this if you want to infer on test set + else: + print("dataset not support") + + # Load pretrained models + if config.dataset == "NYU": + model_path = os.path.join( + get_original_cwd(), "trained_models", "monoscene_nyu.ckpt" + ) + else: + model_path = os.path.join( + get_original_cwd(), "trained_models", "monoscene_kitti.ckpt" + ) + + model = MonoScene.load_from_checkpoint( + model_path, + feature=feature, + project_scale=project_scale, + fp_loss=config.fp_loss, + full_scene_size=full_scene_size, + ) + model.cuda() + model.eval() + + # Save prediction and additional data + # to draw the viewing frustum and remove scene outside the room for NYUv2 + output_path = os.path.join(config.output_path, config.dataset) + with torch.no_grad(): + for batch in tqdm(data_loader): + batch["img"] = batch["img"].cuda() + pred = model(batch) + y_pred = torch.softmax(pred["ssc_logit"], dim=1).detach().cpu().numpy() + y_pred = np.argmax(y_pred, axis=1) + for i in range(config.batch_size): + out_dict = {"y_pred": y_pred[i].astype(np.uint16)} + if "target" in batch: + out_dict["target"] = ( + batch["target"][i].detach().cpu().numpy().astype(np.uint16) + ) + + if config.dataset == "NYU": + write_path = output_path + filepath = os.path.join(write_path, batch["name"][i] + ".pkl") + out_dict["cam_pose"] = batch["cam_pose"][i].detach().cpu().numpy() + out_dict["vox_origin"] = ( + batch["vox_origin"][i].detach().cpu().numpy() + ) + else: + write_path = os.path.join(output_path, batch["sequence"][i]) + filepath = os.path.join(write_path, batch["frame_id"][i] + ".pkl") + out_dict["fov_mask_1"] = ( + batch["fov_mask_1"][i].detach().cpu().numpy() + ) + out_dict["cam_k"] = batch["cam_k"][i].detach().cpu().numpy() + out_dict["T_velo_2_cam"] = ( + batch["T_velo_2_cam"][i].detach().cpu().numpy() + ) + + os.makedirs(write_path, exist_ok=True) + with open(filepath, "wb") as handle: + pickle.dump(out_dict, handle) + print("wrote to", filepath) + + +if __name__ 
== "__main__": + main() diff --git a/monoscene/scripts/train_monoscene.py b/monoscene/scripts/train_monoscene.py new file mode 100644 index 0000000..8c58d6a --- /dev/null +++ b/monoscene/scripts/train_monoscene.py @@ -0,0 +1,173 @@ +from monoscene.data.semantic_kitti.kitti_dm import KittiDataModule +from monoscene.data.semantic_kitti.params import ( + semantic_kitti_class_frequencies, + kitti_class_names, +) +from monoscene.data.NYU.params import ( + class_weights as NYU_class_weights, + NYU_class_names, +) +from monoscene.data.NYU.nyu_dm import NYUDataModule +from torch.utils.data.dataloader import DataLoader +from monoscene.models.monoscene import MonoScene +from pytorch_lightning import Trainer +from pytorch_lightning.loggers import TensorBoardLogger +from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor +import os +import hydra +from omegaconf import DictConfig +import numpy as np +import torch + +hydra.output_subdir = None + + +@hydra.main(config_name="../config/monoscene.yaml") +def main(config: DictConfig): + exp_name = config.exp_prefix + exp_name += "_{}_{}".format(config.dataset, config.run) + exp_name += "_FrusSize_{}".format(config.frustum_size) + exp_name += "_nRelations{}".format(config.n_relations) + exp_name += "_WD{}_lr{}".format(config.weight_decay, config.lr) + + if config.CE_ssc_loss: + exp_name += "_CEssc" + if config.geo_scal_loss: + exp_name += "_geoScalLoss" + if config.sem_scal_loss: + exp_name += "_semScalLoss" + if config.fp_loss: + exp_name += "_fpLoss" + + if config.relation_loss: + exp_name += "_CERel" + if config.context_prior: + exp_name += "_3DCRP" + + # Setup dataloaders + if config.dataset == "kitti": + class_names = kitti_class_names + max_epochs = 30 + logdir = config.kitti_logdir + full_scene_size = (256, 256, 32) + project_scale = 2 + feature = 64 + n_classes = 20 + class_weights = torch.from_numpy( + 1 / np.log(semantic_kitti_class_frequencies + 0.001) + ) + data_module = KittiDataModule( + root=config.kitti_root, + preprocess_root=config.kitti_preprocess_root, + frustum_size=config.frustum_size, + project_scale=project_scale, + batch_size=int(config.batch_size / config.n_gpus), + num_workers=int(config.num_workers_per_gpu * config.n_gpus), + ) + + elif config.dataset == "NYU": + class_names = NYU_class_names + max_epochs = 30 + logdir = config.logdir + full_scene_size = (60, 36, 60) + project_scale = 1 + feature = 200 + n_classes = 12 + class_weights = NYU_class_weights + data_module = NYUDataModule( + root=config.NYU_root, + preprocess_root=config.NYU_preprocess_root, + n_relations=config.n_relations, + frustum_size=config.frustum_size, + batch_size=int(config.batch_size / config.n_gpus), + num_workers=int(config.num_workers_per_gpu * config.n_gpus), + ) + + project_res = ["1"] + if config.project_1_2: + exp_name += "_Proj_2" + project_res.append("2") + if config.project_1_4: + exp_name += "_4" + project_res.append("4") + if config.project_1_8: + exp_name += "_8" + project_res.append("8") + + print(exp_name) + + # Initialize MonoScene model + model = MonoScene( + dataset=config.dataset, + frustum_size=config.frustum_size, + project_scale=project_scale, + n_relations=config.n_relations, + fp_loss=config.fp_loss, + feature=feature, + full_scene_size=full_scene_size, + project_res=project_res, + n_classes=n_classes, + class_names=class_names, + context_prior=config.context_prior, + relation_loss=config.relation_loss, + CE_ssc_loss=config.CE_ssc_loss, + sem_scal_loss=config.sem_scal_loss, + geo_scal_loss=config.geo_scal_loss, + 
lr=config.lr, + weight_decay=config.weight_decay, + class_weights=class_weights, + ) + + if config.enable_log: + logger = TensorBoardLogger(save_dir=logdir, name=exp_name, version="") + lr_monitor = LearningRateMonitor(logging_interval="step") + checkpoint_callbacks = [ + ModelCheckpoint( + save_last=True, + monitor="val/mIoU", + save_top_k=1, + mode="max", + filename="{epoch:03d}-{val/mIoU:.5f}", + ), + lr_monitor, + ] + else: + logger = False + checkpoint_callbacks = False + + model_path = os.path.join(logdir, exp_name, "checkpoints/last.ckpt") + if os.path.isfile(model_path): + # Continue training from last.ckpt + trainer = Trainer( + callbacks=checkpoint_callbacks, + resume_from_checkpoint=model_path, + sync_batchnorm=True, + deterministic=True, + max_epochs=max_epochs, + gpus=config.n_gpus, + logger=logger, + check_val_every_n_epoch=1, + log_every_n_steps=10, + flush_logs_every_n_steps=100, + accelerator="ddp", + ) + else: + # Train from scratch + trainer = Trainer( + callbacks=checkpoint_callbacks, + sync_batchnorm=True, + deterministic=True, + max_epochs=max_epochs, + gpus=config.n_gpus, + logger=logger, + check_val_every_n_epoch=1, + log_every_n_steps=10, + flush_logs_every_n_steps=100, + accelerator="ddp", + ) + + trainer.fit(model, data_module) + + +if __name__ == "__main__": + main() diff --git a/monoscene/scripts/visualization/NYU_vis_pred.py b/monoscene/scripts/visualization/NYU_vis_pred.py new file mode 100644 index 0000000..d451983 --- /dev/null +++ b/monoscene/scripts/visualization/NYU_vis_pred.py @@ -0,0 +1,156 @@ +import pickle +import os +from omegaconf import DictConfig +import numpy as np +import hydra +from mayavi import mlab + + +def get_grid_coords(dims, resolution): + """ + :param dims: the dimensions of the grid [x, y, z] (i.e. [256, 256, 32]) + :return coords_grid: is the center coords of voxels in the grid + """ + + g_xx = np.arange(0, dims[0] + 1) + g_yy = np.arange(0, dims[1] + 1) + + g_zz = np.arange(0, dims[2] + 1) + + # Obtaining the grid with coords... 
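+    # One integer (x, y, z) index per voxel (the [:-1] slices drop the extra upper bound);
+    # scaling by `resolution` and adding half a voxel converts indices to voxel centres in
+    # metres, and the x/y columns are then swapped before returning.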
+ xx, yy, zz = np.meshgrid(g_xx[:-1], g_yy[:-1], g_zz[:-1]) + coords_grid = np.array([xx.flatten(), yy.flatten(), zz.flatten()]).T + coords_grid = coords_grid.astype(np.float) + + coords_grid = (coords_grid * resolution) + resolution / 2 + + temp = np.copy(coords_grid) + temp[:, 0] = coords_grid[:, 1] + temp[:, 1] = coords_grid[:, 0] + coords_grid = np.copy(temp) + + return coords_grid + + +def draw( + voxels, + cam_pose, + vox_origin, + voxel_size=0.08, + d=0.75, # 0.75m - determine the size of the mesh representing the camera +): + # Compute the coordinates of the mesh representing camera + y = d * 480 / (2 * 518.8579) + x = d * 640 / (2 * 518.8579) + tri_points = np.array( + [ + [0, 0, 0], + [x, y, d], + [-x, y, d], + [-x, -y, d], + [x, -y, d], + ] + ) + tri_points = np.hstack([tri_points, np.ones((5, 1))]) + + tri_points = (cam_pose @ tri_points.T).T + x = tri_points[:, 0] - vox_origin[0] + y = tri_points[:, 1] - vox_origin[1] + z = tri_points[:, 2] - vox_origin[2] + triangles = [ + (0, 1, 2), + (0, 1, 4), + (0, 3, 4), + (0, 2, 3), + ] + + # Compute the voxels coordinates + grid_coords = get_grid_coords( + [voxels.shape[0], voxels.shape[2], voxels.shape[1]], voxel_size + ) + + # Attach the predicted class to every voxel + grid_coords = np.vstack( + (grid_coords.T, np.moveaxis(voxels, [0, 1, 2], [0, 2, 1]).reshape(-1)) + ).T + + # Remove empty and unknown voxels + occupied_voxels = grid_coords[(grid_coords[:, 3] > 0) & (grid_coords[:, 3] < 255)] + figure = mlab.figure(size=(1600, 900), bgcolor=(1, 1, 1)) + + # Draw the camera + mlab.triangular_mesh( + x, + y, + z, + triangles, + representation="wireframe", + color=(0, 0, 0), + line_width=5, + ) + + # Draw occupied voxels + plt_plot = mlab.points3d( + occupied_voxels[:, 0], + occupied_voxels[:, 1], + occupied_voxels[:, 2], + occupied_voxels[:, 3], + colormap="viridis", + scale_factor=voxel_size - 0.1 * voxel_size, + mode="cube", + opacity=1.0, + vmin=0, + vmax=12, + ) + + colors = np.array( + [ + [22, 191, 206, 255], + [214, 38, 40, 255], + [43, 160, 43, 255], + [158, 216, 229, 255], + [114, 158, 206, 255], + [204, 204, 91, 255], + [255, 186, 119, 255], + [147, 102, 188, 255], + [30, 119, 181, 255], + [188, 188, 33, 255], + [255, 127, 12, 255], + [196, 175, 214, 255], + [153, 153, 153, 255], + ] + ) + + plt_plot.glyph.scale_mode = "scale_by_vector" + + plt_plot.module_manager.scalar_lut_manager.lut.table = colors + + mlab.show() + + +@hydra.main(config_path=None) +def main(config: DictConfig): + scan = config.file + + with open(scan, "rb") as handle: + b = pickle.load(handle) + + cam_pose = b["cam_pose"] + vox_origin = b["vox_origin"] + gt_scene = b["target"] + pred_scene = b["y_pred"] + scan = os.path.basename(scan)[:12] + + pred_scene[(gt_scene == 255)] = 255 # only draw scene inside the room + + draw( + pred_scene, + cam_pose, + vox_origin, + voxel_size=0.08, + d=0.75, + ) + + +if __name__ == "__main__": + main() diff --git a/monoscene/scripts/visualization/kitti_vis_pred.py b/monoscene/scripts/visualization/kitti_vis_pred.py new file mode 100644 index 0000000..0a34eb5 --- /dev/null +++ b/monoscene/scripts/visualization/kitti_vis_pred.py @@ -0,0 +1,201 @@ +# from operator import gt +import pickle +import numpy as np +from omegaconf import DictConfig +import hydra +from mayavi import mlab + + +def get_grid_coords(dims, resolution): + """ + :param dims: the dimensions of the grid [x, y, z] (i.e. 
[256, 256, 32]) + :return coords_grid: is the center coords of voxels in the grid + """ + + g_xx = np.arange(0, dims[0] + 1) + g_yy = np.arange(0, dims[1] + 1) + sensor_pose = 10 + g_zz = np.arange(0, dims[2] + 1) + + # Obtaining the grid with coords... + xx, yy, zz = np.meshgrid(g_xx[:-1], g_yy[:-1], g_zz[:-1]) + coords_grid = np.array([xx.flatten(), yy.flatten(), zz.flatten()]).T + coords_grid = coords_grid.astype(np.float) + + coords_grid = (coords_grid * resolution) + resolution / 2 + + temp = np.copy(coords_grid) + temp[:, 0] = coords_grid[:, 1] + temp[:, 1] = coords_grid[:, 0] + coords_grid = np.copy(temp) + + return coords_grid, g_xx, g_yy, g_zz + + +def draw( + voxels, + T_velo_2_cam, + vox_origin, + fov_mask, + img_size, + f, + voxel_size=0.2, + d=7, # 7m - determine the size of the mesh representing the camera +): + # Compute the coordinates of the mesh representing camera + x = d * img_size[0] / (2 * f) + y = d * img_size[1] / (2 * f) + tri_points = np.array( + [ + [0, 0, 0], + [x, y, d], + [-x, y, d], + [-x, -y, d], + [x, -y, d], + ] + ) + tri_points = np.hstack([tri_points, np.ones((5, 1))]) + tri_points = (np.linalg.inv(T_velo_2_cam) @ tri_points.T).T + x = tri_points[:, 0] - vox_origin[0] + y = tri_points[:, 1] - vox_origin[1] + z = tri_points[:, 2] - vox_origin[2] + triangles = [ + (0, 1, 2), + (0, 1, 4), + (0, 3, 4), + (0, 2, 3), + ] + + # Compute the voxels coordinates + grid_coords = get_grid_coords( + [voxels.shape[0], voxels.shape[1], voxels.shape[2]], voxel_size + ) + + # Attach the predicted class to every voxel + grid_coords = np.vstack([grid_coords.T, voxels.reshape(-1)]).T + + # Get the voxels inside FOV + fov_grid_coords = grid_coords[fov_mask, :] + + # Get the voxels outside FOV + outfov_grid_coords = grid_coords[~fov_mask, :] + + # Remove empty and unknown voxels + fov_voxels = fov_grid_coords[ + (fov_grid_coords[:, 3] > 0) & (fov_grid_coords[:, 3] < 255) + ] + outfov_voxels = outfov_grid_coords[ + (outfov_grid_coords[:, 3] > 0) & (outfov_grid_coords[:, 3] < 255) + ] + + figure = mlab.figure(size=(1400, 1400), bgcolor=(1, 1, 1)) + + # Draw the camera + mlab.triangular_mesh( + x, y, z, triangles, representation="wireframe", color=(0, 0, 0), line_width=5 + ) + + # Draw occupied inside FOV voxels + plt_plot_fov = mlab.points3d( + fov_voxels[:, 0], + fov_voxels[:, 1], + fov_voxels[:, 2], + fov_voxels[:, 3], + colormap="viridis", + scale_factor=voxel_size - 0.05 * voxel_size, + mode="cube", + opacity=1.0, + vmin=1, + vmax=19, + ) + + # Draw occupied outside FOV voxels + plt_plot_outfov = mlab.points3d( + outfov_voxels[:, 0], + outfov_voxels[:, 1], + outfov_voxels[:, 2], + outfov_voxels[:, 3], + colormap="viridis", + scale_factor=voxel_size - 0.05 * voxel_size, + mode="cube", + opacity=1.0, + vmin=1, + vmax=19, + ) + + colors = np.array( + [ + [100, 150, 245, 255], + [100, 230, 245, 255], + [30, 60, 150, 255], + [80, 30, 180, 255], + [100, 80, 250, 255], + [255, 30, 30, 255], + [255, 40, 200, 255], + [150, 30, 90, 255], + [255, 0, 255, 255], + [255, 150, 255, 255], + [75, 0, 75, 255], + [175, 0, 75, 255], + [255, 200, 0, 255], + [255, 120, 50, 255], + [0, 175, 0, 255], + [135, 60, 0, 255], + [150, 240, 80, 255], + [255, 240, 150, 255], + [255, 0, 0, 255], + ] + ).astype(np.uint8) + + plt_plot_fov.glyph.scale_mode = "scale_by_vector" + plt_plot_outfov.glyph.scale_mode = "scale_by_vector" + + plt_plot_fov.module_manager.scalar_lut_manager.lut.table = colors + + outfov_colors = colors + outfov_colors[:, :3] = outfov_colors[:, :3] // 3 * 2 + 
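+    # Out-of-FOV voxels reuse the same palette dimmed to two thirds brightness, so they
+    # remain visible while staying clearly distinct from the in-FOV voxels.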
plt_plot_outfov.module_manager.scalar_lut_manager.lut.table = outfov_colors + + mlab.show() + + +@hydra.main(config_path=None) +def main(config: DictConfig): + scan = config.file + with open(scan, "rb") as handle: + b = pickle.load(handle) + + fov_mask_1 = b["fov_mask_1"] + T_velo_2_cam = b["T_velo_2_cam"] + vox_origin = np.array([0, -25.6, -2]) + + y_pred = b["y_pred"] + + if config.dataset == "kitti_360": + # Visualize KITTI-360 + draw( + y_pred, + T_velo_2_cam, + vox_origin, + fov_mask_1, + voxel_size=0.2, + f=552.55426, + img_size=(1408, 376), + d=7, + ) + else: + # Visualize Semantic KITTI + draw( + y_pred, + T_velo_2_cam, + vox_origin, + fov_mask_1, + img_size=(1220, 370), + f=707.0912, + voxel_size=0.2, + d=7, + ) + + +if __name__ == "__main__": + main() diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..8cfd27f --- /dev/null +++ b/requirements.txt @@ -0,0 +1,9 @@ +scikit-image==0.18.1 +PyYAML==5.3.1 +tqdm==4.49.0 +scikit-learn==0.24.0 +pytorch-lightning==1.4.0rc0 +opencv-python==4.5.1.48 +hydra-core==1.0.5 +numpy==1.20.3 +numba==0.54.1 diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..fb0f33c --- /dev/null +++ b/setup.py @@ -0,0 +1,6 @@ +from setuptools import setup +from setuptools import find_packages + +# for install, do: pip install -ve . + +setup(name='monoscene', packages=find_packages()) diff --git a/teaser/KITTI-360.gif b/teaser/KITTI-360.gif new file mode 100644 index 0000000..6e5f8ec Binary files /dev/null and b/teaser/KITTI-360.gif differ diff --git a/teaser/NYUv2.gif b/teaser/NYUv2.gif new file mode 100644 index 0000000..c460201 Binary files /dev/null and b/teaser/NYUv2.gif differ diff --git a/teaser/SemKITTI.gif b/teaser/SemKITTI.gif new file mode 100644 index 0000000..4501404 Binary files /dev/null and b/teaser/SemKITTI.gif differ diff --git a/trained_models/.placeholder b/trained_models/.placeholder new file mode 100644 index 0000000..e69de29
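For orientation, below is a minimal inference sketch showing how the modules added in this diff fit together; it mirrors the "kitti" branch of monoscene/scripts/generate_output.py. The checkpoint path follows the trained_models/ convention used by the scripts above, while the dataset roots, frustum_size and batch_size are placeholder assumptions to be replaced with real values.

import torch
from monoscene.models.monoscene import MonoScene
from monoscene.data.semantic_kitti.kitti_dm import KittiDataModule

# Values below are the ones the scripts use for SemanticKITTI: feature=64,
# project_scale=2, full_scene_size=(256, 256, 32); the remaining hyper-parameters
# (n_classes, class_names, class_weights, dataset, ...) are restored from the checkpoint.
model = MonoScene.load_from_checkpoint(
    "trained_models/monoscene_kitti.ckpt",  # assumed checkpoint location
    feature=64,
    project_scale=2,
    fp_loss=False,  # loss flags do not matter at inference time
    full_scene_size=(256, 256, 32),
)
model.cuda()
model.eval()

data_module = KittiDataModule(
    root="/path/to/semantic_kitti",               # placeholder
    preprocess_root="/path/to/kitti/preprocess",  # placeholder
    frustum_size=4,
    batch_size=1,
    num_workers=4,
)
data_module.setup()

with torch.no_grad():
    batch = next(iter(data_module.val_dataloader()))
    batch["img"] = batch["img"].cuda()
    pred = model(batch)                       # dict containing "ssc_logit": (1, 20, 256, 256, 32)
    y_pred = pred["ssc_logit"].argmax(dim=1)  # per-voxel class ids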