diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..932f948
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,9 @@
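+# Python build artifacts, experiment outputs, core dumps, and editor settings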
+*.pyc
+__pycache__/
+*.egg-info/
+outputs/
+trained_models/*.ckpt
+monoscene_output/*
+core*
+images
+.vscode
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..261eeb9
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..20a532a
--- /dev/null
+++ b/README.md
@@ -0,0 +1,308 @@
+# MonoScene: Monocular 3D Semantic Scene Completion
+
+**MonoScene: Monocular 3D Semantic Scene Completion** \[[arXiv + supp](https://arxiv.org/abs/2112.00726)\] | \[[Project page](https://cv-rits.github.io/MonoScene/)\] \
+[Anh-Quan Cao](https://anhquancao.github.io),
+[Raoul de Charette](https://team.inria.fr/rits/membres/raoul-de-charette/)
+Inria, Paris, France
+
+If you find this work useful, please cite our [paper](https://arxiv.org/abs/2112.00726):
+```
+@misc{cao2021monoscene,
+ title={MonoScene: Monocular 3D Semantic Scene Completion},
+ author={Anh-Quan Cao and Raoul de Charette},
+ year={2021},
+ eprint={2112.00726},
+ archivePrefix={arXiv},
+ primaryClass={cs.CV}
+}
+```
+
+# Teaser
+
+| SemanticKITTI | KITTI-360 (Trained on SemanticKITTI) |
+|:------------:|:------:|
+| *(demo GIF)* | *(demo GIF)* |
+
+**NYUv2** *(demo GIF)*
+
+
+# Table of Contents
+- [Preparing MonoScene](#preparing-monoscene)
+  - [Installation](#installation)
+  - [Datasets](#datasets)
+  - [Pretrained models](#pretrained-models)
+- [Running MonoScene](#running-monoscene)
+  - [Training](#training)
+  - [Evaluating](#evaluating)
+- [Inference & Visualization](#inference--visualization)
+  - [Inference](#inference)
+  - [Visualization](#visualization)
+- [License](#license)
+
+
+# Preparing MonoScene
+
+## Installation
+
+1. Create the conda environment:
+
+```
+$ conda create -y -n monoscene python=3.7
+$ conda activate monoscene
+```
+
+2. This code was implemented with Python 3.7, PyTorch 1.7.1, and CUDA 10.2. Please install [PyTorch](https://pytorch.org/):
+
+```
+$ conda install pytorch==1.7.1 torchvision==0.8.2 torchaudio==0.7.2 cudatoolkit=10.2 -c pytorch
+```
+
+3. Install the additional dependencies:
+
+```
+$ cd MonoScene/
+$ pip install -r requirements.txt
+```
+
+4. Install tbb:
+
+```
+$ conda install tbb
+```
+
+5. Finally, install MonoScene:
+
+```
+$ pip install -e ./
+```
+
+
+## Datasets
+
+### SemanticKITTI
+
+1. Download the Full Semantic Scene Completion dataset (v1.1) from the [SemanticKITTI website](http://www.semantic-kitti.org/dataset.html).
+
+2. Create a folder to store the SemanticKITTI preprocessed data at `/path/to/kitti/preprocess/folder`.
+
+3. Store the paths in environment variables for faster access (**Note: the 'dataset' folder is inside /path/to/semantic_kitti**):
+
+```
+$ export KITTI_PREPROCESS=/path/to/kitti/preprocess/folder
+$ export KITTI_ROOT=/path/to/semantic_kitti
+```
+
+4. Preprocess the data to generate labels at a lower scale, which are used to compute the ground truth relation matrices:
+
+```
+$ cd MonoScene/
+$ python monoscene/data/semantic_kitti/preprocess.py kitti_root=$KITTI_ROOT kitti_preprocess_root=$KITTI_PREPROCESS
+```
+
+### NYUv2
+
+1. Download the NYUv2 dataset using [this script](https://github.com/shurans/sscnet/blob/master/download_data.sh).
+
+2. Create a folder to store the NYUv2 preprocessed data at `/path/to/NYU/preprocess/folder`.
+
+3. Store the paths in environment variables for faster access:
+
+```
+$ export NYU_PREPROCESS=/path/to/NYU/preprocess/folder
+$ export NYU_ROOT=/path/to/NYU/depthbin
+```
+
+4. Preprocess the data to generate labels at a lower scale, which are used to compute the ground truth relation matrices:
+
+```
+$ cd MonoScene/
+$ python monoscene/data/NYU/preprocess.py NYU_root=$NYU_ROOT NYU_preprocess_root=$NYU_PREPROCESS
+```
+
+### KITTI-360
+
+1. We only perform inference on KITTI-360. You can download either the **Perspective Images for Train & Val (128G)** or the **Perspective Images for Test (1.5G)** at [http://www.cvlibs.net/datasets/kitti-360/download.php](http://www.cvlibs.net/datasets/kitti-360/download.php).
+
+2. Create a folder to store the KITTI-360 data at `/path/to/KITTI-360/folder`.
+
+3. Store the path in an environment variable for faster access:
+
+```
+$ export KITTI_360_ROOT=/path/to/KITTI-360
+```
+
+## Pretrained models
+
+Download the MonoScene pretrained models [on SemanticKITTI](https://ln5.sync.com/dl/af5cebf20/wzyre4yr-2tbj4s2a-rk9pqtk7-6rnujurg) and [on NYUv2](https://ln5.sync.com/dl/f4646c5b0/24anmiua-rp5pnie8-jwbrwax5-smxn5fqi), then put them in the folder `/path/to/MonoScene/trained_models`.
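+
+For a quick sanity check of a downloaded checkpoint, the snippet below prints its top-level keys. It only assumes a standard PyTorch Lightning `.ckpt` file; the filename is a placeholder for whichever model you downloaded.
+
+```
+import torch
+
+# Placeholder filename -- point this at the checkpoint you downloaded.
+ckpt = torch.load("trained_models/monoscene_kitti.ckpt", map_location="cpu")
+print(list(ckpt.keys()))        # typically includes 'state_dict', 'epoch', ...
+print(len(ckpt["state_dict"]))  # number of saved parameter tensors
+```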
+
+
+# Running MonoScene
+
+## Training
+
+To train MonoScene on SemanticKITTI or NYUv2, follow the steps below.
+
+### SemanticKITTI
+
+1. Create a folder to store the training logs at **/path/to/kitti/logdir**.
+
+2. Store the path in an environment variable:
+
+```
+$ export KITTI_LOG=/path/to/kitti/logdir
+```
+
+3. Train MonoScene on SemanticKITTI using 4 GPUs with a batch size of 4 (1 item per GPU):
+
+```
+$ cd MonoScene/
+$ python monoscene/scripts/train_monoscene.py \
+    dataset=kitti \
+    enable_log=true \
+    kitti_root=$KITTI_ROOT \
+    kitti_preprocess_root=$KITTI_PREPROCESS \
+    kitti_logdir=$KITTI_LOG \
+    n_gpus=4 batch_size=4
+```
+
+### NYUv2
+
+1. Create a folder to store the training logs at **/path/to/NYU/logdir**.
+
+2. Store the path in an environment variable:
+
+```
+$ export NYU_LOG=/path/to/NYU/logdir
+```
+
+3. Train MonoScene on NYUv2 using 2 GPUs with a batch size of 4 (2 items per GPU):
+
+```
+$ cd MonoScene/
+$ python monoscene/scripts/train_monoscene.py \
+    dataset=NYU \
+    NYU_root=$NYU_ROOT \
+    NYU_preprocess_root=$NYU_PREPROCESS \
+    logdir=$NYU_LOG \
+    n_gpus=2 batch_size=4
+```
+
+
+## Evaluating
+
+### SemanticKITTI
+
+To evaluate MonoScene on the SemanticKITTI validation set, type:
+
+```
+$ cd MonoScene/
+$ python monoscene/scripts/eval_monoscene.py \
+    dataset=kitti \
+    kitti_root=$KITTI_ROOT \
+    kitti_preprocess_root=$KITTI_PREPROCESS \
+    n_gpus=1 batch_size=1
+```
+
+### NYUv2
+
+To evaluate MonoScene on the NYUv2 test set, type:
+
+```
+$ cd MonoScene/
+$ python monoscene/scripts/eval_monoscene.py \
+    dataset=NYU \
+    NYU_root=$NYU_ROOT \
+    NYU_preprocess_root=$NYU_PREPROCESS \
+    n_gpus=1 batch_size=1
+```
+
+# Inference & Visualization
+
+## Inference
+
+Create a folder **/path/to/monoscene/output** to store the MonoScene outputs and store its path in an environment variable:
+
+```
+$ export MONOSCENE_OUTPUT=/path/to/monoscene/output
+```
+
+### NYUv2
+
+To generate the predictions on the NYUv2 test set, type:
+
+```
+$ cd MonoScene/
+$ python monoscene/scripts/generate_output.py \
+    +output_path=$MONOSCENE_OUTPUT \
+    dataset=NYU \
+    NYU_root=$NYU_ROOT \
+    NYU_preprocess_root=$NYU_PREPROCESS \
+    n_gpus=1 batch_size=1
+```
+
+### Semantic KITTI
+
+To generate the predictions on the Semantic KITTI validation set, type:
+
+```
+$ cd MonoScene/
+$ python monoscene/scripts/generate_output.py \
+    +output_path=$MONOSCENE_OUTPUT \
+    dataset=kitti \
+    kitti_root=$KITTI_ROOT \
+    kitti_preprocess_root=$KITTI_PREPROCESS \
+    n_gpus=1 batch_size=1
+```
+
+### KITTI-360
+
+Here we use the sequence **2013_05_28_drive_0009_sync**; you can use other sequences. To generate the predictions on KITTI-360, type:
+
+```
+$ cd MonoScene/
+$ python monoscene/scripts/generate_output.py \
+    +output_path=/path/to/monoscene/output \
+    dataset=kitti_360 \
+    +kitti_360_root=$KITTI_360_ROOT \
+    +kitti_360_sequence=2013_05_28_drive_0009_sync \
+    n_gpus=1 batch_size=1
+```
+
+## Visualization
+
+We use mayavi to visualize the predictions. Please install mayavi following the [official installation instructions](https://docs.enthought.com/mayavi/mayavi/installation.html). Then, use the following commands to visualize the outputs on the respective datasets.
+
+### NYUv2
+
+```
+$ cd MonoScene/
+$ python monoscene/scripts/visualization/NYU_vis_pred.py +file=/path/to/output/file.pkl
+```
+
+### Semantic KITTI
+
+```
+$ cd MonoScene/
+$ python monoscene/scripts/visualization/kitti_vis_pred.py +file=/path/to/output/file.pkl +dataset=kitti
+```
+
+### KITTI-360
+
+```
+$ cd MonoScene/
+$ python monoscene/scripts/visualization/kitti_vis_pred.py +file=/path/to/output/file.pkl +dataset=kitti_360
+```
+
+
+# License
+MonoScene is released under the [Apache 2.0 license](./LICENSE).
diff --git a/monoscene/config/monoscene.yaml b/monoscene/config/monoscene.yaml new file mode 100644 index 0000000..f2eafe5 --- /dev/null +++ b/monoscene/config/monoscene.yaml @@ -0,0 +1,39 @@ +#dataset: "NYU" # "kitti", "kitti_360" +dataset: "kitti_360" + +n_relations: 4 + +enable_log: false +kitti_root: '/path/to/semantic_kitti' +kitti_preprocess_root: '/path/to/kitti/preprocess/folder' +kitti_logdir: '/path/to/semantic_kitti/logdir' + +NYU_root: '/path/to/NYU/depthbin' +NYU_preprocess_root: '/path/to/NYU/preprocess/folder' +logdir: '/path/to/NYU/logdir' + + +fp_loss: true +frustum_size: 8 +batch_size: 1 +n_gpus: 1 +num_workers_per_gpu: 3 +exp_prefix: "exp" +run: 1 +lr: 1e-4 +weight_decay: 1e-4 + +context_prior: true + +relation_loss: true +CE_ssc_loss: true +sem_scal_loss: true +geo_scal_loss: true + +project_1_2: true +project_1_4: true +project_1_8: true + + + + diff --git a/monoscene/data/NYU/collate.py b/monoscene/data/NYU/collate.py new file mode 100644 index 0000000..e8db095 --- /dev/null +++ b/monoscene/data/NYU/collate.py @@ -0,0 +1,50 @@ +import torch + + +def collate_fn(batch): + data = {} + imgs = [] + targets = [] + names = [] + cam_poses = [] + + vox_origins = [] + cam_ks = [] + + CP_mega_matrices = [] + + data["projected_pix_1"] = [] + data["fov_mask_1"] = [] + data["frustums_masks"] = [] + data["frustums_class_dists"] = [] + + for idx, input_dict in enumerate(batch): + CP_mega_matrices.append(torch.from_numpy(input_dict["CP_mega_matrix"])) + for key in data: + if key in input_dict: + data[key].append(torch.from_numpy(input_dict[key])) + + cam_ks.append(torch.from_numpy(input_dict["cam_k"]).double()) + cam_poses.append(torch.from_numpy(input_dict["cam_pose"]).float()) + vox_origins.append(torch.from_numpy(input_dict["voxel_origin"]).double()) + + names.append(input_dict["name"]) + + img = input_dict["img"] + imgs.append(img) + + target = torch.from_numpy(input_dict["target"]) + targets.append(target) + + ret_data = { + "CP_mega_matrices": CP_mega_matrices, + "cam_pose": torch.stack(cam_poses), + "cam_k": torch.stack(cam_ks), + "vox_origin": torch.stack(vox_origins), + "name": names, + "img": torch.stack(imgs), + "target": torch.stack(targets), + } + for key in data: + ret_data[key] = data[key] + return ret_data diff --git a/monoscene/data/NYU/nyu_dataset.py b/monoscene/data/NYU/nyu_dataset.py new file mode 100644 index 0000000..950226e --- /dev/null +++ b/monoscene/data/NYU/nyu_dataset.py @@ -0,0 +1,132 @@ +import torch +import os +import glob +from torch.utils.data import Dataset +import numpy as np +from PIL import Image +from torchvision import transforms +from monoscene.data.utils.helpers import ( + vox2pix, + compute_local_frustums, + compute_CP_mega_matrix, +) +import pickle +import torch.nn.functional as F + + +class NYUDataset(Dataset): + def __init__( + self, + split, + root, + preprocess_root, + n_relations=4, + color_jitter=None, + frustum_size=4, + fliplr=0.0, + ): + self.n_relations = n_relations + self.frustum_size = frustum_size + self.n_classes = 12 + self.root = os.path.join(root, "NYU" + split) + self.preprocess_root = preprocess_root + self.base_dir = os.path.join(preprocess_root, "base", "NYU" + split) + self.fliplr = fliplr + + self.voxel_size = 0.08 # 0.08m + self.scene_size = (4.8, 4.8, 2.88) # (4.8m, 4.8m, 2.88m) + self.img_W = 640 + self.img_H = 480 + self.cam_k = np.array([[518.8579, 0, 320], [0, 518.8579, 240], [0, 0, 1]]) + + self.color_jitter = ( + transforms.ColorJitter(*color_jitter) if color_jitter else None + ) + + self.scan_names = 
glob.glob(os.path.join(self.root, "*.bin")) + + self.normalize_rgb = transforms.Compose( + [ + transforms.ToTensor(), + transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] + ), + ] + ) + + def __getitem__(self, index): + file_path = self.scan_names[index] + filename = os.path.basename(file_path) + name = filename[:-4] + + os.makedirs(self.base_dir, exist_ok=True) + filepath = os.path.join(self.base_dir, name + ".pkl") + + with open(filepath, "rb") as handle: + data = pickle.load(handle) + + cam_pose = data["cam_pose"] + T_world_2_cam = np.linalg.inv(cam_pose) + vox_origin = data["voxel_origin"] + data["cam_k"] = self.cam_k + target = data[ + "target_1_4" + ] # Following SSC literature, the output resolution on NYUv2 is set to 1:4 + data["target"] = target + target_1_4 = data["target_1_16"] + + CP_mega_matrix = compute_CP_mega_matrix( + target_1_4, is_binary=self.n_relations == 2 + ) + data["CP_mega_matrix"] = CP_mega_matrix + + # compute the 3D-2D mapping + projected_pix, fov_mask, sensor_distance = vox2pix( + T_world_2_cam, + self.cam_k, + vox_origin, + self.voxel_size, + self.img_W, + self.img_H, + self.scene_size, + ) + data["projected_pix_1"] = projected_pix + data["fov_mask_1"] = fov_mask + + # compute the masks, each indicates voxels inside a frustum + frustums_masks, frustums_class_dists = compute_local_frustums( + projected_pix, + sensor_distance, + target, + self.img_W, + self.img_H, + dataset="NYU", + n_classes=12, + size=self.frustum_size, + ) + data["frustums_masks"] = frustums_masks + data["frustums_class_dists"] = frustums_class_dists + + rgb_path = os.path.join(self.root, name + "_color.jpg") + img = Image.open(rgb_path).convert("RGB") + + # Image augmentation + if self.color_jitter is not None: + img = self.color_jitter(img) + + # PIL to numpy + img = np.array(img, dtype=np.float32, copy=False) / 255.0 + + # randomly fliplr the image + if np.random.rand() < self.fliplr: + img = np.ascontiguousarray(np.fliplr(img)) + data["projected_pix_1"][:, 0] = ( + img.shape[1] - 1 - data["projected_pix_1"][:, 0] + ) + + data["img"] = self.normalize_rgb(img) # (3, img_H, img_W) + + return data + + def __len__(self): + return len(self.scan_names) diff --git a/monoscene/data/NYU/nyu_dm.py b/monoscene/data/NYU/nyu_dm.py new file mode 100644 index 0000000..50d12d1 --- /dev/null +++ b/monoscene/data/NYU/nyu_dm.py @@ -0,0 +1,78 @@ +from torch.utils.data.dataloader import DataLoader +from monoscene.data.NYU.nyu_dataset import NYUDataset +from monoscene.data.NYU.collate import collate_fn +import pytorch_lightning as pl +from monoscene.data.utils.torch_util import worker_init_fn + + +class NYUDataModule(pl.LightningDataModule): + def __init__( + self, + root, + preprocess_root, + n_relations=4, + batch_size=4, + frustum_size=4, + num_workers=6, + ): + super().__init__() + self.n_relations = n_relations + self.preprocess_root = preprocess_root + self.root = root + self.batch_size = batch_size + self.num_workers = num_workers + self.frustum_size = frustum_size + + def setup(self): + self.train_ds = NYUDataset( + split="train", + preprocess_root=self.preprocess_root, + n_relations=self.n_relations, + root=self.root, + fliplr=0.5, + frustum_size=self.frustum_size, + color_jitter=(0.4, 0.4, 0.4), + ) + self.test_ds = NYUDataset( + split="test", + preprocess_root=self.preprocess_root, + n_relations=self.n_relations, + root=self.root, + frustum_size=self.frustum_size, + fliplr=0.0, + color_jitter=None, + ) + + def train_dataloader(self): + return DataLoader( + self.train_ds, + 
batch_size=self.batch_size, + drop_last=True, + num_workers=self.num_workers, + shuffle=True, + pin_memory=True, + worker_init_fn=worker_init_fn, + collate_fn=collate_fn, + ) + + def val_dataloader(self): + return DataLoader( + self.test_ds, + batch_size=self.batch_size, + num_workers=self.num_workers, + drop_last=False, + shuffle=False, + pin_memory=True, + collate_fn=collate_fn, + ) + + def test_dataloader(self): + return DataLoader( + self.test_ds, + batch_size=self.batch_size, + num_workers=self.num_workers, + drop_last=False, + shuffle=False, + pin_memory=True, + collate_fn=collate_fn, + ) diff --git a/monoscene/data/NYU/params.py b/monoscene/data/NYU/params.py new file mode 100644 index 0000000..332e19e --- /dev/null +++ b/monoscene/data/NYU/params.py @@ -0,0 +1,54 @@ +import torch +import numpy as np + +NYU_class_names = [ + "empty", + "ceiling", + "floor", + "wall", + "window", + "chair", + "bed", + "sofa", + "table", + "tvs", + "furn", + "objs", +] +class_weights = torch.FloatTensor([0.05, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]) + +class_freq_1_4 = np.array( + [ + 43744234, + 80205, + 1070052, + 905632, + 116952, + 180994, + 436852, + 279714, + 254611, + 28247, + 1805949, + 850724, + ] +) +class_freq_1_8 = np.array( + [ + 5176253, + 17277, + 220105, + 183849, + 21827, + 33520, + 67022, + 44248, + 46615, + 4419, + 290218, + 142573, + ] +) +class_freq_1_16 = np.array( + [587620, 3820, 46836, 36256, 4241, 5978, 10939, 8000, 8224, 781, 49778, 25864] +) diff --git a/monoscene/data/NYU/preprocess.py b/monoscene/data/NYU/preprocess.py new file mode 100644 index 0000000..83f7aa2 --- /dev/null +++ b/monoscene/data/NYU/preprocess.py @@ -0,0 +1,182 @@ +import numpy as np +from tqdm import tqdm +import numpy.matlib +import os +import glob +import pickle +import hydra +from omegaconf import DictConfig + + +seg_class_map = [ + 0, + 1, + 2, + 3, + 4, + 11, + 5, + 6, + 7, + 8, + 8, + 10, + 10, + 10, + 11, + 11, + 9, + 8, + 11, + 11, + 11, + 11, + 11, + 11, + 11, + 11, + 11, + 10, + 10, + 11, + 8, + 10, + 11, + 9, + 11, + 11, + 11, +] + + +def _rle2voxel(rle, voxel_size=(240, 144, 240), rle_filename=""): + r"""Read voxel label data from file (RLE compression), and convert it to fully occupancy labeled voxels. + code taken from https://github.com/waterljwant/SSC/blob/master/dataloaders/dataloader.py#L172 + In the data loader of pytorch, only single thread is allowed. + For multi-threads version and more details, see 'readRLE.py'. 
+ output: seg_label: 3D numpy array, size 240 x 144 x 240 + """ + seg_label = np.zeros( + int(voxel_size[0] * voxel_size[1] * voxel_size[2]), dtype=np.uint8 + ) # segmentation label + vox_idx = 0 + for idx in range(int(rle.shape[0] / 2)): + check_val = rle[idx * 2] + check_iter = rle[idx * 2 + 1] + if check_val >= 37 and check_val != 255: # 37 classes to 12 classes + print("RLE {} check_val: {}".format(rle_filename, check_val)) + seg_label_val = ( + seg_class_map[check_val] if check_val != 255 else 255 + ) # 37 classes to 12 classes + seg_label[vox_idx : vox_idx + check_iter] = np.matlib.repmat( + seg_label_val, 1, check_iter + ) + vox_idx = vox_idx + check_iter + seg_label = seg_label.reshape(voxel_size) # 3D array, size 240 x 144 x 240 + return seg_label + + +def _read_rle(rle_filename): # 0.0005s + """Read RLE compression data + code taken from https://github.com/waterljwant/SSC/blob/master/dataloaders/dataloader.py#L153 + Return: + vox_origin, + cam_pose, + vox_rle, voxel label data from file + Shape: + vox_rle, (240, 144, 240) + """ + fid = open(rle_filename, "rb") + vox_origin = np.fromfile( + fid, np.float32, 3 + ).T # Read voxel origin in world coordinates + cam_pose = np.fromfile(fid, np.float32, 16).reshape((4, 4)) # Read camera pose + vox_rle = ( + np.fromfile(fid, np.uint32).reshape((-1, 1)).T + ) # Read voxel label data from file + vox_rle = np.squeeze(vox_rle) # 2d array: (1 x N), to 1d array: (N , ) + fid.close() + return vox_origin, cam_pose, vox_rle + + +def _downsample_label(label, voxel_size=(240, 144, 240), downscale=4): + r"""downsample the labeled data, + code taken from https://github.com/waterljwant/SSC/blob/master/dataloaders/dataloader.py#L262 + Shape: + label, (240, 144, 240) + label_downscale, if downsample==4, then (60, 36, 60) + """ + if downscale == 1: + return label + ds = downscale + small_size = ( + voxel_size[0] // ds, + voxel_size[1] // ds, + voxel_size[2] // ds, + ) # small size + label_downscale = np.zeros(small_size, dtype=np.uint8) + empty_t = 0.95 * ds * ds * ds # threshold + s01 = small_size[0] * small_size[1] + label_i = np.zeros((ds, ds, ds), dtype=np.int32) + + for i in range(small_size[0] * small_size[1] * small_size[2]): + z = int(i / s01) + y = int((i - z * s01) / small_size[0]) + x = int(i - z * s01 - y * small_size[0]) + + label_i[:, :, :] = label[ + x * ds : (x + 1) * ds, y * ds : (y + 1) * ds, z * ds : (z + 1) * ds + ] + label_bin = label_i.flatten() + + zero_count_0 = np.array(np.where(label_bin == 0)).size + zero_count_255 = np.array(np.where(label_bin == 255)).size + + zero_count = zero_count_0 + zero_count_255 + if zero_count > empty_t: + label_downscale[x, y, z] = 0 if zero_count_0 > zero_count_255 else 255 + else: + label_i_s = label_bin[ + np.where(np.logical_and(label_bin > 0, label_bin < 255)) + ] + label_downscale[x, y, z] = np.argmax(np.bincount(label_i_s)) + return label_downscale + + +@hydra.main(config_name="../../config/monoscene.yaml") +def main(config: DictConfig): + scene_size = (240, 144, 240) + for split in ["train", "test"]: + root = os.path.join(config.NYU_root, "NYU" + split) + base_dir = os.path.join(config.NYU_preprocess_root, "base", "NYU" + split) + os.makedirs(base_dir, exist_ok=True) + + scans = glob.glob(os.path.join(root, "*.bin")) + for scan in tqdm(scans): + filename = os.path.basename(scan) + name = filename[:-4] + filepath = os.path.join(base_dir, name + ".pkl") + if os.path.exists(filepath): + continue + + vox_origin, cam_pose, rle = _read_rle(scan) + + target_1_1 = _rle2voxel(rle, scene_size, scan) + 
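+            # Downsample the full-resolution 240 x 144 x 240 label grid to the 1/4 and
+            # 1/16 scales used as training targets (majority vote over occupied voxels).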
target_1_4 = _downsample_label(target_1_1, scene_size, 4) + target_1_16 = _downsample_label(target_1_1, scene_size, 16) + + data = { + "cam_pose": cam_pose, + "voxel_origin": vox_origin, + "name": name, + "target_1_4": target_1_4, + "target_1_16": target_1_16, + } + + with open(filepath, "wb") as handle: + pickle.dump(data, handle) + print("wrote to", filepath) + + +if __name__ == "__main__": + main() diff --git a/monoscene/data/kitti_360/collate.py b/monoscene/data/kitti_360/collate.py new file mode 100644 index 0000000..59ea29d --- /dev/null +++ b/monoscene/data/kitti_360/collate.py @@ -0,0 +1,47 @@ +import torch + + +def collate_fn(batch): + data = {} + imgs = [] + frame_ids = [] + img_paths = [] + sequences = [] + + cam_ks = [] + T_velo_2_cams = [] + + scale_3ds = batch[0]["scale_3ds"] + for scale_3d in scale_3ds: + data["projected_pix_{}".format(scale_3d)] = [] + data["fov_mask_{}".format(scale_3d)] = [] + + for _, input_dict in enumerate(batch): + if "img_path" in input_dict: + img_paths.append(input_dict["img_path"]) + + for key in data: + data[key].append(torch.from_numpy(input_dict[key])) + + cam_ks.append(torch.from_numpy(input_dict["cam_k"]).float()) + T_velo_2_cams.append(torch.from_numpy(input_dict["T_velo_2_cam"]).float()) + + sequences.append(input_dict["sequence"]) + + img = input_dict["img"] + imgs.append(img) + + frame_ids.append(input_dict["frame_id"]) + + ret_data = { + "sequence": sequences, + "frame_id": frame_ids, + "cam_k": cam_ks, + "T_velo_2_cam": T_velo_2_cams, + "img": torch.stack(imgs), + "img_path": img_paths, + } + for key in data: + ret_data[key] = data[key] + + return ret_data diff --git a/monoscene/data/kitti_360/kitti_360_dataset.py b/monoscene/data/kitti_360/kitti_360_dataset.py new file mode 100644 index 0000000..566c2ec --- /dev/null +++ b/monoscene/data/kitti_360/kitti_360_dataset.py @@ -0,0 +1,125 @@ +import torch +import os +import glob +from torch.utils.data import Dataset +import numpy as np +from monoscene.data.utils.helpers import vox2pix +from PIL import Image +from torchvision import transforms + + +class Kitti360Dataset(Dataset): + def __init__(self, root, sequences, n_scans): + """ + Paramters + -------- + root: str + Path to KITTI-360 dataset i.e. contain sequences such as 2013_05_28_drive_0009_sync + sequence: str + KITTI-360 sequence e.g. 
2013_05_28_drive_0009_sync + n_scans: int + Only use the first n_scans since KITTI-360 sequence is very long + """ + self.root = root + self.img_H = 376 + self.img_W = 1408 + self.project_scale = 2 + self.output_scale = 1 + self.voxel_size = 0.2 + self.vox_origin = np.array([0, -25.6, -2]) + self.scene_size = (51.2, 51.2, 6.4) + self.T_velo_2_cam = self.get_velo2cam() + self.cam_k = self.get_cam_k() + self.scans = [] + for sequence in sequences: + glob_path = os.path.join( + self.root, "data_2d_raw", sequence, "image_00/data_rect", "*.png" + ) + for img_path in glob.glob(glob_path): + self.scans.append({"img_path": img_path, "sequence": sequence}) + self.scans = self.scans[:n_scans] + self.normalize_rgb = transforms.Compose( + [ + transforms.ToTensor(), + transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] + ), + ] + ) + + def __len__(self): + return len(self.scans) + + def get_cam_k(self): + cam_k = np.array( + [ + 552.554261, + 0.000000, + 682.049453, + 0.000000, + 0.000000, + 552.554261, + 238.769549, + 0.000000, + 0.000000, + 0.000000, + 1.000000, + 0.000000, + ] + ).reshape(3, 4) + return cam_k[:3, :3] + + def get_velo2cam(self): + cam2velo = np.array( + [ + 0.04307104361, + -0.08829286498, + 0.995162929, + 0.8043914418, + -0.999004371, + 0.007784614041, + 0.04392796942, + 0.2993489574, + -0.01162548558, + -0.9960641394, + -0.08786966659, + -0.1770225824, + ] + ).reshape(3, 4) + cam2velo = np.concatenate( + [cam2velo, np.array([0, 0, 0, 1]).reshape(1, 4)], axis=0 + ) + return np.linalg.inv(cam2velo) + + def __getitem__(self, index): + data = {"T_velo_2_cam": self.T_velo_2_cam, "cam_k": self.cam_k} + scan = self.scans[index] + img_path = scan["img_path"] + sequence = scan["sequence"] + filename = os.path.basename(img_path) + frame_id = os.path.splitext(filename)[0] + data["frame_id"] = frame_id + data["img_path"] = img_path + data["sequence"] = sequence + + img = Image.open(img_path).convert("RGB") + img = np.array(img, dtype=np.float32, copy=False) / 255.0 + img = self.normalize_rgb(img) + data["img"] = img + + scale_3ds = [self.project_scale, self.output_scale] + data["scale_3ds"] = scale_3ds + T_cam_2_velo = np.linalg.inv(self.T_velo_2_cam) + for scale_3d in scale_3ds: + projected_pix, fov_mask, _ = vox2pix( + T_cam_2_velo, + self.cam_k, + self.vox_origin, + self.voxel_size * scale_3d, + self.img_W, + self.img_H, + self.scene_size, + ) + data["projected_pix_{}".format(scale_3d)] = projected_pix + data["fov_mask_{}".format(scale_3d)] = fov_mask + return data diff --git a/monoscene/data/kitti_360/kitti_360_dm.py b/monoscene/data/kitti_360/kitti_360_dm.py new file mode 100644 index 0000000..0743de9 --- /dev/null +++ b/monoscene/data/kitti_360/kitti_360_dm.py @@ -0,0 +1,32 @@ +from torch.utils.data.dataloader import DataLoader +from monoscene.data.kitti_360.kitti_360_dataset import Kitti360Dataset +import pytorch_lightning as pl +from monoscene.data.kitti_360.collate import collate_fn +from monoscene.data.utils.torch_util import worker_init_fn + + +class Kitti360DataModule(pl.LightningDataModule): + def __init__(self, root, sequences, n_scans, batch_size=4, num_workers=3): + super().__init__() + self.root = root + self.batch_size = batch_size + self.num_workers = num_workers + self.sequences = sequences + self.n_scans = n_scans + + def setup(self, stage=None): + self.ds = Kitti360Dataset( + root=self.root, sequences=self.sequences, n_scans=self.n_scans + ) + + def dataloader(self): + return DataLoader( + self.ds, + batch_size=self.batch_size, + drop_last=False, 
+ num_workers=self.num_workers, + shuffle=False, + pin_memory=True, + worker_init_fn=worker_init_fn, + collate_fn=collate_fn, + ) diff --git a/monoscene/data/semantic_kitti/collate.py b/monoscene/data/semantic_kitti/collate.py new file mode 100644 index 0000000..eecb8e7 --- /dev/null +++ b/monoscene/data/semantic_kitti/collate.py @@ -0,0 +1,59 @@ +import torch + + +def collate_fn(batch): + data = {} + imgs = [] + CP_mega_matrices = [] + targets = [] + frame_ids = [] + sequences = [] + + cam_ks = [] + T_velo_2_cams = [] + frustums_masks = [] + frustums_class_dists = [] + + scale_3ds = batch[0]["scale_3ds"] + for scale_3d in scale_3ds: + data["projected_pix_{}".format(scale_3d)] = [] + data["fov_mask_{}".format(scale_3d)] = [] + + for idx, input_dict in enumerate(batch): + cam_ks.append(torch.from_numpy(input_dict["cam_k"]).double()) + T_velo_2_cams.append(torch.from_numpy(input_dict["T_velo_2_cam"]).float()) + + if "frustums_masks" in input_dict: + frustums_masks.append(torch.from_numpy(input_dict["frustums_masks"])) + frustums_class_dists.append( + torch.from_numpy(input_dict["frustums_class_dists"]).float() + ) + + for key in data: + data[key].append(torch.from_numpy(input_dict[key])) + + img = input_dict["img"] + imgs.append(img) + + frame_ids.append(input_dict["frame_id"]) + sequences.append(input_dict["sequence"]) + + ret_data = { + "frame_id": frame_ids, + "sequence": sequences, + "frustums_class_dists": frustums_class_dists, + "frustums_masks": frustums_masks, + "cam_k": cam_ks, + "T_velo_2_cam": T_velo_2_cams, + "img": torch.stack(imgs), + } + if "target" in input_dict: + target = torch.from_numpy(input_dict["target"]) + targets.append(target) + CP_mega_matrices.append(torch.from_numpy(input_dict["CP_mega_matrix"])) + ret_data["CP_mega_matrices"] = CP_mega_matrices + ret_data["target"] = torch.stack(targets) + + for key in data: + ret_data[key] = data[key] + return ret_data diff --git a/monoscene/data/semantic_kitti/io_data.py b/monoscene/data/semantic_kitti/io_data.py new file mode 100644 index 0000000..5cf5ffd --- /dev/null +++ b/monoscene/data/semantic_kitti/io_data.py @@ -0,0 +1,239 @@ +""" +Most of the code in this file is taken from https://github.com/cv-rits/LMSCNet/blob/main/LMSCNet/data/io_data.py +""" + +import numpy as np +import yaml +import imageio + + +def unpack(compressed): + ''' given a bit encoded voxel grid, make a normal voxel grid out of it. ''' + uncompressed = np.zeros(compressed.shape[0] * 8, dtype=np.uint8) + uncompressed[::8] = compressed[:] >> 7 & 1 + uncompressed[1::8] = compressed[:] >> 6 & 1 + uncompressed[2::8] = compressed[:] >> 5 & 1 + uncompressed[3::8] = compressed[:] >> 4 & 1 + uncompressed[4::8] = compressed[:] >> 3 & 1 + uncompressed[5::8] = compressed[:] >> 2 & 1 + uncompressed[6::8] = compressed[:] >> 1 & 1 + uncompressed[7::8] = compressed[:] & 1 + + return uncompressed + + +def img_normalize(img, mean, std): + img = img.astype(np.float32) / 255.0 + img = img - mean + img = img / std + + return img + + +def pack(array): + """ convert a boolean array into a bitwise array. """ + array = array.reshape((-1)) + + #compressing bit flags. + # yapf: disable + compressed = array[::8] << 7 | array[1::8] << 6 | array[2::8] << 5 | array[3::8] << 4 | array[4::8] << 3 | array[5::8] << 2 | array[6::8] << 1 | array[7::8] + # yapf: enable + + return np.array(compressed, dtype=np.uint8) + + +def get_grid_coords(dims, resolution): + ''' + :param dims: the dimensions of the grid [x, y, z] (i.e. 
[256, 256, 32]) + :return coords_grid: is the center coords of voxels in the grid + ''' + + # The sensor in centered in X (we go to dims/2 + 1 for the histogramdd) + g_xx = np.arange(-dims[0]/2, dims[0]/2 + 1) + # The sensor is in Y=0 (we go to dims + 1 for the histogramdd) + g_yy = np.arange(0, dims[1] + 1) + # The sensor is in Z=1.73. I observed that the ground was to voxel levels above the grid bottom, so Z pose is at 10 + # if bottom voxel is 0. If we want the sensor to be at (0, 0, 0), then the bottom in z is -10, top is 22 + # (we go to 22 + 1 for the histogramdd) + # ATTENTION.. Is 11 for old grids.. 10 for new grids (v1.1) (https://github.com/PRBonn/semantic-kitti-api/issues/49) + sensor_pose = 10 + g_zz = np.arange(0 - sensor_pose, dims[2] - sensor_pose + 1) + + # Obtaining the grid with coords... + xx, yy, zz = np.meshgrid(g_xx[:-1], g_yy[:-1], g_zz[:-1]) + coords_grid = np.array([xx.flatten(), yy.flatten(), zz.flatten()]).T + coords_grid = coords_grid.astype(np.float) + + coords_grid = (coords_grid * resolution) + resolution/2 + + temp = np.copy(coords_grid) + temp[:, 0] = coords_grid[:, 1] + temp[:, 1] = coords_grid[:, 0] + coords_grid = np.copy(temp) + + return coords_grid, g_xx, g_yy, g_zz + + +def _get_remap_lut(config_path): + ''' + remap_lut to remap classes of semantic kitti for training... + :return: + ''' + + dataset_config = yaml.safe_load(open(config_path, 'r')) + # make lookup table for mapping + maxkey = max(dataset_config['learning_map'].keys()) + + # +100 hack making lut bigger just in case there are unknown labels + remap_lut = np.zeros((maxkey + 100), dtype=np.int32) + remap_lut[list(dataset_config['learning_map'].keys())] = list(dataset_config['learning_map'].values()) + + # in completion we have to distinguish empty and invalid voxels. + # Important: For voxels 0 corresponds to "empty" and not "unlabeled". + remap_lut[remap_lut == 0] = 255 # map 0 to 'invalid' + remap_lut[0] = 0 # only 'empty' stays 'empty'. + + return remap_lut + + +def get_inv_map(): + ''' + remap_lut to remap classes of semantic kitti for training... + :return: + ''' + config_path = "./semantic-kitti.yaml" + dataset_config = yaml.safe_load(open(config_path, 'r')) + # make lookup table for mapping + + inv_map = np.zeros(20, dtype=np.int32) + inv_map[list(dataset_config['learning_map_inv'].keys())] = list(dataset_config['learning_map_inv'].values()) + + return inv_map + +def _read_SemKITTI(path, dtype, do_unpack): + bin = np.fromfile(path, dtype=dtype) # Flattened array + if do_unpack: + bin = unpack(bin) + return bin + + +def _read_label_SemKITTI(path): + label = _read_SemKITTI(path, dtype=np.uint16, do_unpack=False).astype(np.float32) + return label + + +def _read_invalid_SemKITTI(path): + invalid = _read_SemKITTI(path, dtype=np.uint8, do_unpack=True) + return invalid + + +def _read_occluded_SemKITTI(path): + occluded = _read_SemKITTI(path, dtype=np.uint8, do_unpack=True) + return occluded + + +def _read_occupancy_SemKITTI(path): + occupancy = _read_SemKITTI(path, dtype=np.uint8, do_unpack=True).astype(np.float32) + return occupancy + + +def _read_rgb_SemKITTI(path): + rgb = np.asarray(imageio.imread(path)) + return rgb + + +def _read_pointcloud_SemKITTI(path): + 'Return pointcloud semantic kitti with remissions (x, y, z, intensity)' + pointcloud = _read_SemKITTI(path, dtype=np.float32, do_unpack=False) + pointcloud = pointcloud.reshape((-1, 4)) + return pointcloud + + +def _read_calib_SemKITTI(calib_path): + """ + :param calib_path: Path to a calibration text file. 
+ :return: dict with calibration matrices. + """ + calib_all = {} + with open(calib_path, 'r') as f: + for line in f.readlines(): + if line == '\n': + break + key, value = line.split(':', 1) + calib_all[key] = np.array([float(x) for x in value.split()]) + + # reshape matrices + calib_out = {} + calib_out['P2'] = calib_all['P2'].reshape(3, 4) # 3x4 projection matrix for left camera + calib_out['Tr'] = np.identity(4) # 4x4 matrix + calib_out['Tr'][:3, :4] = calib_all['Tr'].reshape(3, 4) + return calib_out + + +def get_remap_lut(path): + ''' + remap_lut to remap classes of semantic kitti for training... + :return: + ''' + + dataset_config = yaml.safe_load(open(path, 'r')) + + # make lookup table for mapping + maxkey = max(dataset_config['learning_map'].keys()) + + # +100 hack making lut bigger just in case there are unknown labels + remap_lut = np.zeros((maxkey + 100), dtype=np.int32) + remap_lut[list(dataset_config['learning_map'].keys())] = list(dataset_config['learning_map'].values()) + + # in completion we have to distinguish empty and invalid voxels. + # Important: For voxels 0 corresponds to "empty" and not "unlabeled". + remap_lut[remap_lut == 0] = 255 # map 0 to 'invalid' + remap_lut[0] = 0 # only 'empty' stays 'empty'. + + return remap_lut + + +def data_augmentation_3Dflips(flip, data): + # The .copy() is done to avoid negative strides of the numpy array caused by the way numpy manages the data + # into memory. This gives errors when trying to pass the array to torch sensors.. Solution seen in: + # https://discuss.pytorch.org/t/torch-from-numpy-not-support-negative-strides/3663 + # Dims -> {XZY} + # Flipping around the X axis... + if np.isclose(flip, 1): + data = np.flip(data, axis=0).copy() + + # Flipping around the Y axis... + if np.isclose(flip, 2): + data = np.flip(data, 2).copy() + + # Flipping around the X and the Y axis... 
+ if np.isclose(flip, 3): + data = np.flip(np.flip(data, axis=0), axis=2).copy() + + return data + + +def get_cmap_semanticKITTI20(): + colors = np.array([ + # [0 , 0 , 0, 255], + [100, 150, 245, 255], + [100, 230, 245, 255], + [30, 60, 150, 255], + [80, 30, 180, 255], + [100, 80, 250, 255], + [255, 30, 30, 255], + [255, 40, 200, 255], + [150, 30, 90, 255], + [255, 0, 255, 255], + [255, 150, 255, 255], + [75, 0, 75, 255], + [175, 0, 75, 255], + [255, 200, 0, 255], + [255, 120, 50, 255], + [0, 175, 0, 255], + [135, 60, 0, 255], + [150, 240, 80, 255], + [255, 240, 150, 255], + [255, 0, 0, 255]]).astype(np.uint8) + + return colors diff --git a/monoscene/data/semantic_kitti/kitti_dataset.py b/monoscene/data/semantic_kitti/kitti_dataset.py new file mode 100644 index 0000000..980b052 --- /dev/null +++ b/monoscene/data/semantic_kitti/kitti_dataset.py @@ -0,0 +1,200 @@ +import torch +import os +import glob +from torch.utils.data import Dataset +import numpy as np +from PIL import Image +from torchvision import transforms +from monoscene.data.utils.helpers import ( + vox2pix, + compute_local_frustums, + compute_CP_mega_matrix, +) + + +class KittiDataset(Dataset): + def __init__( + self, + split, + root, + preprocess_root, + project_scale=2, + frustum_size=4, + color_jitter=None, + fliplr=0.0, + ): + super().__init__() + self.root = root + self.label_root = os.path.join(preprocess_root, "labels") + self.n_classes = 20 + splits = { + "train": ["00", "01", "02", "03", "04", "05", "06", "07", "09", "10"], + "val": ["08"], + "test": ["11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21"], + } + self.split = split + self.sequences = splits[split] + self.frustum_size = frustum_size + self.project_scale = project_scale + self.output_scale = int(self.project_scale / 2) + self.scene_size = (51.2, 51.2, 6.4) + self.vox_origin = np.array([0, -25.6, -2]) + self.fliplr = fliplr + + self.voxel_size = 0.2 # 0.2m + self.img_W = 1220 + self.img_H = 370 + + self.color_jitter = ( + transforms.ColorJitter(*color_jitter) if color_jitter else None + ) + self.scans = [] + for sequence in self.sequences: + calib = self.read_calib( + os.path.join(self.root, "dataset", "sequences", sequence, "calib.txt") + ) + P = calib["P2"] + T_velo_2_cam = calib["Tr"] + proj_matrix = P @ T_velo_2_cam + + glob_path = os.path.join( + self.root, "dataset", "sequences", sequence, "voxels", "*.bin" + ) + for voxel_path in glob.glob(glob_path): + self.scans.append( + { + "sequence": sequence, + "P": P, + "T_velo_2_cam": T_velo_2_cam, + "proj_matrix": proj_matrix, + "voxel_path": voxel_path, + } + ) + + self.normalize_rgb = transforms.Compose( + [ + transforms.ToTensor(), + transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] + ), + ] + ) + + def __getitem__(self, index): + scan = self.scans[index] + voxel_path = scan["voxel_path"] + sequence = scan["sequence"] + P = scan["P"] + T_velo_2_cam = scan["T_velo_2_cam"] + proj_matrix = scan["proj_matrix"] + + filename = os.path.basename(voxel_path) + frame_id = os.path.splitext(filename)[0] + + rgb_path = os.path.join( + self.root, "dataset", "sequences", sequence, "image_2", frame_id + ".png" + ) + + data = { + "frame_id": frame_id, + "sequence": sequence, + "P": P, + "T_velo_2_cam": T_velo_2_cam, + "proj_matrix": proj_matrix, + } + scale_3ds = [self.output_scale, self.project_scale] + data["scale_3ds"] = scale_3ds + cam_k = P[0:3, 0:3] + data["cam_k"] = cam_k + for scale_3d in scale_3ds: + + # compute the 3D-2D mapping + projected_pix, fov_mask, sensor_distance = 
vox2pix( + T_velo_2_cam, + cam_k, + self.vox_origin, + self.voxel_size * scale_3d, + self.img_W, + self.img_H, + self.scene_size, + ) + + data["projected_pix_{}".format(scale_3d)] = projected_pix + data["sensor_distance_{}".format(scale_3d)] = sensor_distance + data["fov_mask_{}".format(scale_3d)] = fov_mask + + target_1_path = os.path.join(self.label_root, sequence, frame_id + "_1_1.npy") + target = np.load(target_1_path) + data["target"] = target + target_8_path = os.path.join(self.label_root, sequence, frame_id + "_1_8.npy") + target_1_8 = np.load(target_8_path) + CP_mega_matrix = compute_CP_mega_matrix(target_1_8) + data["CP_mega_matrix"] = CP_mega_matrix + + # Compute the masks, each indicate the voxels of a local frustum + if self.split != "test": + projected_pix_output = data["projected_pix_{}".format(self.output_scale)] + sensor_distance_output = data[ + "sensor_distance_{}".format(self.output_scale) + ] + frustums_masks, frustums_class_dists = compute_local_frustums( + projected_pix_output, + sensor_distance_output, + target, + self.img_W, + self.img_H, + dataset="kitti", + n_classes=20, + size=self.frustum_size, + ) + else: + frustums_masks = None + frustums_class_dists = None + data["frustums_masks"] = frustums_masks + data["frustums_class_dists"] = frustums_class_dists + + img = Image.open(rgb_path).convert("RGB") + + # Image augmentation + if self.color_jitter is not None: + img = self.color_jitter(img) + + # PIL to numpy + img = np.array(img, dtype=np.float32, copy=False) / 255.0 + img = img[:370, :1220, :] # crop image + + # Fliplr the image + if np.random.rand() < self.fliplr: + img = np.ascontiguousarray(np.fliplr(img)) + for scale in scale_3ds: + key = "projected_pix_" + str(scale) + data[key][:, 0] = img.shape[1] - 1 - data[key][:, 0] + + data["img"] = self.normalize_rgb(img) + return data + + def __len__(self): + return len(self.scans) + + @staticmethod + def read_calib(calib_path): + """ + Modify from https://github.com/utiasSTARS/pykitti/blob/d3e1bb81676e831886726cc5ed79ce1f049aef2c/pykitti/utils.py#L68 + :param calib_path: Path to a calibration text file. + :return: dict with calibration matrices. 
+ """ + calib_all = {} + with open(calib_path, "r") as f: + for line in f.readlines(): + if line == "\n": + break + key, value = line.split(":", 1) + calib_all[key] = np.array([float(x) for x in value.split()]) + + # reshape matrices + calib_out = {} + # 3x4 projection matrix for left camera + calib_out["P2"] = calib_all["P2"].reshape(3, 4) + calib_out["Tr"] = np.identity(4) # 4x4 matrix + calib_out["Tr"][:3, :4] = calib_all["Tr"].reshape(3, 4) + return calib_out diff --git a/monoscene/data/semantic_kitti/kitti_dm.py b/monoscene/data/semantic_kitti/kitti_dm.py new file mode 100644 index 0000000..e9dcb4d --- /dev/null +++ b/monoscene/data/semantic_kitti/kitti_dm.py @@ -0,0 +1,91 @@ +from torch.utils.data.dataloader import DataLoader +from monoscene.data.semantic_kitti.kitti_dataset import KittiDataset +import pytorch_lightning as pl +from monoscene.data.semantic_kitti.collate import collate_fn +from monoscene.data.utils.torch_util import worker_init_fn + + +class KittiDataModule(pl.LightningDataModule): + def __init__( + self, + root, + preprocess_root, + project_scale=2, + frustum_size=4, + batch_size=4, + num_workers=6, + ): + super().__init__() + self.root = root + self.preprocess_root = preprocess_root + self.project_scale = project_scale + self.batch_size = batch_size + self.num_workers = num_workers + self.frustum_size = frustum_size + + def setup(self, stage=None): + self.train_ds = KittiDataset( + split="train", + root=self.root, + preprocess_root=self.preprocess_root, + project_scale=self.project_scale, + frustum_size=self.frustum_size, + fliplr=0.5, + color_jitter=(0.4, 0.4, 0.4), + ) + + self.val_ds = KittiDataset( + split="val", + root=self.root, + preprocess_root=self.preprocess_root, + project_scale=self.project_scale, + frustum_size=self.frustum_size, + fliplr=0, + color_jitter=None, + ) + + self.test_ds = KittiDataset( + split="test", + root=self.root, + preprocess_root=self.preprocess_root, + project_scale=self.project_scale, + frustum_size=self.frustum_size, + fliplr=0, + color_jitter=None, + ) + + def train_dataloader(self): + return DataLoader( + self.train_ds, + batch_size=self.batch_size, + drop_last=True, + num_workers=self.num_workers, + shuffle=True, + pin_memory=True, + worker_init_fn=worker_init_fn, + collate_fn=collate_fn, + ) + + def val_dataloader(self): + return DataLoader( + self.val_ds, + batch_size=self.batch_size, + drop_last=False, + num_workers=self.num_workers, + shuffle=False, + pin_memory=True, + worker_init_fn=worker_init_fn, + collate_fn=collate_fn, + ) + + def test_dataloader(self): + return DataLoader( + self.test_ds, + batch_size=self.batch_size, + drop_last=False, + num_workers=self.num_workers, + shuffle=False, + pin_memory=True, + worker_init_fn=worker_init_fn, + collate_fn=collate_fn, + ) diff --git a/monoscene/data/semantic_kitti/params.py b/monoscene/data/semantic_kitti/params.py new file mode 100644 index 0000000..c58df35 --- /dev/null +++ b/monoscene/data/semantic_kitti/params.py @@ -0,0 +1,48 @@ +import numpy as np + +semantic_kitti_class_frequencies = np.array( + [ + 5.41773033e09, + 1.57835390e07, + 1.25136000e05, + 1.18809000e05, + 6.46799000e05, + 8.21951000e05, + 2.62978000e05, + 2.83696000e05, + 2.04750000e05, + 6.16887030e07, + 4.50296100e06, + 4.48836500e07, + 2.26992300e06, + 5.68402180e07, + 1.57196520e07, + 1.58442623e08, + 2.06162300e06, + 3.69705220e07, + 1.15198800e06, + 3.34146000e05, + ] +) +kitti_class_names = [ + "empty", + "car", + "bicycle", + "motorcycle", + "truck", + "other-vehicle", + "person", + "bicyclist", + 
"motorcyclist", + "road", + "parking", + "sidewalk", + "other-ground", + "building", + "fence", + "vegetation", + "trunk", + "terrain", + "pole", + "traffic-sign", +] diff --git a/monoscene/data/semantic_kitti/preprocess.py b/monoscene/data/semantic_kitti/preprocess.py new file mode 100644 index 0000000..31bf76c --- /dev/null +++ b/monoscene/data/semantic_kitti/preprocess.py @@ -0,0 +1,102 @@ +""" +Code partly taken from https://github.com/cv-rits/LMSCNet/blob/main/LMSCNet/data/labels_downscale.py +""" +import numpy as np +from tqdm import tqdm +import numpy.matlib +import os +import glob +import hydra +from omegaconf import DictConfig +import monoscene.data.semantic_kitti.io_data as SemanticKittiIO +from hydra.utils import get_original_cwd +from monoscene.data.NYU.preprocess import _downsample_label + + +def majority_pooling(grid, k_size=2): + result = np.zeros( + (grid.shape[0] // k_size, grid.shape[1] // k_size, grid.shape[2] // k_size) + ) + for xx in range(0, int(np.floor(grid.shape[0] / k_size))): + for yy in range(0, int(np.floor(grid.shape[1] / k_size))): + for zz in range(0, int(np.floor(grid.shape[2] / k_size))): + + sub_m = grid[ + (xx * k_size) : (xx * k_size) + k_size, + (yy * k_size) : (yy * k_size) + k_size, + (zz * k_size) : (zz * k_size) + k_size, + ] + unique, counts = np.unique(sub_m, return_counts=True) + if True in ((unique != 0) & (unique != 255)): + # Remove counts with 0 and 255 + counts = counts[((unique != 0) & (unique != 255))] + unique = unique[((unique != 0) & (unique != 255))] + else: + if True in (unique == 0): + counts = counts[(unique != 255)] + unique = unique[(unique != 255)] + value = unique[np.argmax(counts)] + result[xx, yy, zz] = value + return result + + +@hydra.main(config_name="../../config/monoscene.yaml") +def main(config: DictConfig): + scene_size = (256, 256, 32) + sequences = ["00", "01", "02", "03", "04", "05", "06", "07", "08", "09", "10"] + remap_lut = SemanticKittiIO.get_remap_lut( + os.path.join( + get_original_cwd(), + "monoscene", + "data", + "semantic_kitti", + "semantic-kitti.yaml", + ) + ) + + for sequence in sequences: + sequence_path = os.path.join( + config.kitti_root, "dataset", "sequences", sequence + ) + label_paths = sorted( + glob.glob(os.path.join(sequence_path, "voxels", "*.label")) + ) + invalid_paths = sorted( + glob.glob(os.path.join(sequence_path, "voxels", "*.invalid")) + ) + out_dir = os.path.join(config.kitti_preprocess_root, "labels", sequence) + os.makedirs(out_dir, exist_ok=True) + + downscaling = {"1_1": 1, "1_8": 8} + + for i in tqdm(range(len(label_paths))): + + frame_id, extension = os.path.splitext(os.path.basename(label_paths[i])) + + LABEL = SemanticKittiIO._read_label_SemKITTI(label_paths[i]) + INVALID = SemanticKittiIO._read_invalid_SemKITTI(invalid_paths[i]) + LABEL = remap_lut[LABEL.astype(np.uint16)].astype( + np.float32 + ) # Remap 20 classes semanticKITTI SSC + LABEL[ + np.isclose(INVALID, 1) + ] = 255 # Setting to unknown all voxels marked on invalid mask... + LABEL = LABEL.reshape([256, 256, 32]) + + for scale in downscaling: + filename = frame_id + "_" + scale + ".npy" + label_filename = os.path.join(out_dir, filename) + # If files have not been created... 
+ if not os.path.exists(label_filename): + if scale == "1_8": + LABEL_ds = _downsample_label( + LABEL, (256, 256, 32), downscaling[scale] + ) + else: + LABEL_ds = LABEL + np.save(label_filename, LABEL_ds) + print("wrote to", label_filename) + + +if __name__ == "__main__": + main() diff --git a/monoscene/data/semantic_kitti/semantic-kitti.yaml b/monoscene/data/semantic_kitti/semantic-kitti.yaml new file mode 100644 index 0000000..d6749c3 --- /dev/null +++ b/monoscene/data/semantic_kitti/semantic-kitti.yaml @@ -0,0 +1,213 @@ +# This file is covered by the LICENSE file in the root of this project. +nbr_classes: 20 +grid_dims: [256, 32, 256] # (W, H, D) +labels: + 0 : "unlabeled" + 1 : "outlier" + 10: "car" + 11: "bicycle" + 13: "bus" + 15: "motorcycle" + 16: "on-rails" + 18: "truck" + 20: "other-vehicle" + 30: "person" + 31: "bicyclist" + 32: "motorcyclist" + 40: "road" + 44: "parking" + 48: "sidewalk" + 49: "other-ground" + 50: "building" + 51: "fence" + 52: "other-structure" + 60: "lane-marking" + 70: "vegetation" + 71: "trunk" + 72: "terrain" + 80: "pole" + 81: "traffic-sign" + 99: "other-object" + 252: "moving-car" + 253: "moving-bicyclist" + 254: "moving-person" + 255: "moving-motorcyclist" + 256: "moving-on-rails" + 257: "moving-bus" + 258: "moving-truck" + 259: "moving-other-vehicle" +color_map: # bgr + 0 : [0, 0, 0] + 1 : [0, 0, 255] + 10: [245, 150, 100] + 11: [245, 230, 100] + 13: [250, 80, 100] + 15: [150, 60, 30] + 16: [255, 0, 0] + 18: [180, 30, 80] + 20: [255, 0, 0] + 30: [30, 30, 255] + 31: [200, 40, 255] + 32: [90, 30, 150] + 40: [255, 0, 255] + 44: [255, 150, 255] + 48: [75, 0, 75] + 49: [75, 0, 175] + 50: [0, 200, 255] + 51: [50, 120, 255] + 52: [0, 150, 255] + 60: [170, 255, 150] + 70: [0, 175, 0] + 71: [0, 60, 135] + 72: [80, 240, 150] + 80: [150, 240, 255] + 81: [0, 0, 255] + 99: [255, 255, 50] + 252: [245, 150, 100] + 256: [255, 0, 0] + 253: [200, 40, 255] + 254: [30, 30, 255] + 255: [90, 30, 150] + 257: [250, 80, 100] + 258: [180, 30, 80] + 259: [255, 0, 0] +content: # as a ratio with the total number of points + 0: 0.018889854628292943 + 1: 0.0002937197336781505 + 10: 0.040818519255974316 + 11: 0.00016609538710764618 + 13: 2.7879693665067774e-05 + 15: 0.00039838616015114444 + 16: 0.0 + 18: 0.0020633612104619787 + 20: 0.0016218197275284021 + 30: 0.00017698551338515307 + 31: 1.1065903904919655e-08 + 32: 5.532951952459828e-09 + 40: 0.1987493871255525 + 44: 0.014717169549888214 + 48: 0.14392298360372 + 49: 0.0039048553037472045 + 50: 0.1326861944777486 + 51: 0.0723592229456223 + 52: 0.002395131480328884 + 60: 4.7084144280367186e-05 + 70: 0.26681502148037506 + 71: 0.006035012012626033 + 72: 0.07814222006271769 + 80: 0.002855498193863172 + 81: 0.0006155958086189918 + 99: 0.009923127583046915 + 252: 0.001789309418528068 + 253: 0.00012709999297008662 + 254: 0.00016059776092534436 + 255: 3.745553104802113e-05 + 256: 0.0 + 257: 0.00011351574470342043 + 258: 0.00010157861367183268 + 259: 4.3840131989471124e-05 +# classes that are indistinguishable from single scan or inconsistent in +# ground truth are mapped to their closest equivalent +learning_map: + 0 : 0 # "unlabeled" + 1 : 0 # "outlier" mapped to "unlabeled" --------------------------mapped + 10: 1 # "car" + 11: 2 # "bicycle" + 13: 5 # "bus" mapped to "other-vehicle" --------------------------mapped + 15: 3 # "motorcycle" + 16: 5 # "on-rails" mapped to "other-vehicle" ---------------------mapped + 18: 4 # "truck" + 20: 5 # "other-vehicle" + 30: 6 # "person" + 31: 7 # "bicyclist" + 32: 8 # "motorcyclist" + 40: 9 # "road" + 
44: 10 # "parking" + 48: 11 # "sidewalk" + 49: 12 # "other-ground" + 50: 13 # "building" + 51: 14 # "fence" + 52: 0 # "other-structure" mapped to "unlabeled" ------------------mapped + 60: 9 # "lane-marking" to "road" ---------------------------------mapped + 70: 15 # "vegetation" + 71: 16 # "trunk" + 72: 17 # "terrain" + 80: 18 # "pole" + 81: 19 # "traffic-sign" + 99: 0 # "other-object" to "unlabeled" ----------------------------mapped + 252: 1 # "moving-car" to "car" ------------------------------------mapped + 253: 7 # "moving-bicyclist" to "bicyclist" ------------------------mapped + 254: 6 # "moving-person" to "person" ------------------------------mapped + 255: 8 # "moving-motorcyclist" to "motorcyclist" ------------------mapped + 256: 5 # "moving-on-rails" mapped to "other-vehicle" --------------mapped + 257: 5 # "moving-bus" mapped to "other-vehicle" -------------------mapped + 258: 4 # "moving-truck" to "truck" --------------------------------mapped + 259: 5 # "moving-other"-vehicle to "other-vehicle" ----------------mapped +learning_map_inv: # inverse of previous map + 0: 0 # "unlabeled", and others ignored + 1: 10 # "car" + 2: 11 # "bicycle" + 3: 15 # "motorcycle" + 4: 18 # "truck" + 5: 20 # "other-vehicle" + 6: 30 # "person" + 7: 31 # "bicyclist" + 8: 32 # "motorcyclist" + 9: 40 # "road" + 10: 44 # "parking" + 11: 48 # "sidewalk" + 12: 49 # "other-ground" + 13: 50 # "building" + 14: 51 # "fence" + 15: 70 # "vegetation" + 16: 71 # "trunk" + 17: 72 # "terrain" + 18: 80 # "pole" + 19: 81 # "traffic-sign" +learning_ignore: # Ignore classes + 0: True # "unlabeled", and others ignored + 1: False # "car" + 2: False # "bicycle" + 3: False # "motorcycle" + 4: False # "truck" + 5: False # "other-vehicle" + 6: False # "person" + 7: False # "bicyclist" + 8: False # "motorcyclist" + 9: False # "road" + 10: False # "parking" + 11: False # "sidewalk" + 12: False # "other-ground" + 13: False # "building" + 14: False # "fence" + 15: False # "vegetation" + 16: False # "trunk" + 17: False # "terrain" + 18: False # "pole" + 19: False # "traffic-sign" +split: # sequence numbers + train: + - 0 + - 1 + - 2 + - 3 + - 4 + - 5 + - 6 + - 7 + - 9 + - 10 + valid: + - 8 + test: + - 11 + - 12 + - 13 + - 14 + - 15 + - 16 + - 17 + - 18 + - 19 + - 20 + - 21 diff --git a/monoscene/data/utils/fusion.py b/monoscene/data/utils/fusion.py new file mode 100644 index 0000000..aecd5cb --- /dev/null +++ b/monoscene/data/utils/fusion.py @@ -0,0 +1,507 @@ +""" +Most of the code is taken from https://github.com/andyzeng/tsdf-fusion-python/blob/master/fusion.py + +@inproceedings{zeng20163dmatch, + title={3DMatch: Learning Local Geometric Descriptors from RGB-D Reconstructions}, + author={Zeng, Andy and Song, Shuran and Nie{\ss}ner, Matthias and Fisher, Matthew and Xiao, Jianxiong and Funkhouser, Thomas}, + booktitle={CVPR}, + year={2017} +} +""" + +import numpy as np + +from numba import njit, prange +from skimage import measure + +FUSION_GPU_MODE = 0 + + +class TSDFVolume: + """Volumetric TSDF Fusion of RGB-D Images.""" + + def __init__(self, vol_bnds, voxel_size, use_gpu=True): + """Constructor. + + Args: + vol_bnds (ndarray): An ndarray of shape (3, 2). Specifies the + xyz bounds (min/max) in meters. + voxel_size (float): The volume discretization in meters. + """ + vol_bnds = np.asarray(vol_bnds) + assert vol_bnds.shape == (3, 2), "[!] `vol_bnds` should be of shape (3, 2)." 
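As a side note on the semantic-kitti.yaml shown above: the preprocessing collapses raw label ids into the 20 training classes through a lookup table built from learning_map. A self-contained sketch of that remapping follows; the yaml path is an assumption, and get_remap_lut in io_data.py builds an equivalent table from the same file:

import numpy as np
import yaml

# Hedged sketch of the learning_map remapping; the yaml path is a placeholder.
with open("semantic-kitti.yaml", "r") as f:
    cfg = yaml.safe_load(f)

learning_map = cfg["learning_map"]               # raw id -> train id in [0, 19]
lut = np.zeros(max(learning_map) + 1, dtype=np.uint8)
for raw_id, train_id in learning_map.items():
    lut[raw_id] = train_id

raw = np.array([10, 252, 40, 60, 99])            # car, moving-car, road, lane-marking, other-object
print(lut[raw])                                  # [1 1 9 9 0]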
+ + # Define voxel volume parameters + self._vol_bnds = vol_bnds + self._voxel_size = float(voxel_size) + self._trunc_margin = 5 * self._voxel_size # truncation on SDF + # self._trunc_margin = 10 # truncation on SDF + self._color_const = 256 * 256 + + # Adjust volume bounds and ensure C-order contiguous + self._vol_dim = ( + np.ceil((self._vol_bnds[:, 1] - self._vol_bnds[:, 0]) / self._voxel_size) + .copy(order="C") + .astype(int) + ) + self._vol_bnds[:, 1] = self._vol_bnds[:, 0] + self._vol_dim * self._voxel_size + self._vol_origin = self._vol_bnds[:, 0].copy(order="C").astype(np.float32) + + print( + "Voxel volume size: {} x {} x {} - # points: {:,}".format( + self._vol_dim[0], + self._vol_dim[1], + self._vol_dim[2], + self._vol_dim[0] * self._vol_dim[1] * self._vol_dim[2], + ) + ) + + # Initialize pointers to voxel volume in CPU memory + self._tsdf_vol_cpu = np.zeros(self._vol_dim).astype(np.float32) + # for computing the cumulative moving average of observations per voxel + self._weight_vol_cpu = np.zeros(self._vol_dim).astype(np.float32) + self._color_vol_cpu = np.zeros(self._vol_dim).astype(np.float32) + + self.gpu_mode = use_gpu and FUSION_GPU_MODE + + # Copy voxel volumes to GPU + if self.gpu_mode: + self._tsdf_vol_gpu = cuda.mem_alloc(self._tsdf_vol_cpu.nbytes) + cuda.memcpy_htod(self._tsdf_vol_gpu, self._tsdf_vol_cpu) + self._weight_vol_gpu = cuda.mem_alloc(self._weight_vol_cpu.nbytes) + cuda.memcpy_htod(self._weight_vol_gpu, self._weight_vol_cpu) + self._color_vol_gpu = cuda.mem_alloc(self._color_vol_cpu.nbytes) + cuda.memcpy_htod(self._color_vol_gpu, self._color_vol_cpu) + + # Cuda kernel function (C++) + self._cuda_src_mod = SourceModule( + """ + __global__ void integrate(float * tsdf_vol, + float * weight_vol, + float * color_vol, + float * vol_dim, + float * vol_origin, + float * cam_intr, + float * cam_pose, + float * other_params, + float * color_im, + float * depth_im) { + // Get voxel index + int gpu_loop_idx = (int) other_params[0]; + int max_threads_per_block = blockDim.x; + int block_idx = blockIdx.z*gridDim.y*gridDim.x+blockIdx.y*gridDim.x+blockIdx.x; + int voxel_idx = gpu_loop_idx*gridDim.x*gridDim.y*gridDim.z*max_threads_per_block+block_idx*max_threads_per_block+threadIdx.x; + int vol_dim_x = (int) vol_dim[0]; + int vol_dim_y = (int) vol_dim[1]; + int vol_dim_z = (int) vol_dim[2]; + if (voxel_idx > vol_dim_x*vol_dim_y*vol_dim_z) + return; + // Get voxel grid coordinates (note: be careful when casting) + float voxel_x = floorf(((float)voxel_idx)/((float)(vol_dim_y*vol_dim_z))); + float voxel_y = floorf(((float)(voxel_idx-((int)voxel_x)*vol_dim_y*vol_dim_z))/((float)vol_dim_z)); + float voxel_z = (float)(voxel_idx-((int)voxel_x)*vol_dim_y*vol_dim_z-((int)voxel_y)*vol_dim_z); + // Voxel grid coordinates to world coordinates + float voxel_size = other_params[1]; + float pt_x = vol_origin[0]+voxel_x*voxel_size; + float pt_y = vol_origin[1]+voxel_y*voxel_size; + float pt_z = vol_origin[2]+voxel_z*voxel_size; + // World coordinates to camera coordinates + float tmp_pt_x = pt_x-cam_pose[0*4+3]; + float tmp_pt_y = pt_y-cam_pose[1*4+3]; + float tmp_pt_z = pt_z-cam_pose[2*4+3]; + float cam_pt_x = cam_pose[0*4+0]*tmp_pt_x+cam_pose[1*4+0]*tmp_pt_y+cam_pose[2*4+0]*tmp_pt_z; + float cam_pt_y = cam_pose[0*4+1]*tmp_pt_x+cam_pose[1*4+1]*tmp_pt_y+cam_pose[2*4+1]*tmp_pt_z; + float cam_pt_z = cam_pose[0*4+2]*tmp_pt_x+cam_pose[1*4+2]*tmp_pt_y+cam_pose[2*4+2]*tmp_pt_z; + // Camera coordinates to image pixels + int pixel_x = (int) 
roundf(cam_intr[0*3+0]*(cam_pt_x/cam_pt_z)+cam_intr[0*3+2]); + int pixel_y = (int) roundf(cam_intr[1*3+1]*(cam_pt_y/cam_pt_z)+cam_intr[1*3+2]); + // Skip if outside view frustum + int im_h = (int) other_params[2]; + int im_w = (int) other_params[3]; + if (pixel_x < 0 || pixel_x >= im_w || pixel_y < 0 || pixel_y >= im_h || cam_pt_z<0) + return; + // Skip invalid depth + float depth_value = depth_im[pixel_y*im_w+pixel_x]; + if (depth_value == 0) + return; + // Integrate TSDF + float trunc_margin = other_params[4]; + float depth_diff = depth_value-cam_pt_z; + if (depth_diff < -trunc_margin) + return; + float dist = fmin(1.0f,depth_diff/trunc_margin); + float w_old = weight_vol[voxel_idx]; + float obs_weight = other_params[5]; + float w_new = w_old + obs_weight; + weight_vol[voxel_idx] = w_new; + tsdf_vol[voxel_idx] = (tsdf_vol[voxel_idx]*w_old+obs_weight*dist)/w_new; + // Integrate color + float old_color = color_vol[voxel_idx]; + float old_b = floorf(old_color/(256*256)); + float old_g = floorf((old_color-old_b*256*256)/256); + float old_r = old_color-old_b*256*256-old_g*256; + float new_color = color_im[pixel_y*im_w+pixel_x]; + float new_b = floorf(new_color/(256*256)); + float new_g = floorf((new_color-new_b*256*256)/256); + float new_r = new_color-new_b*256*256-new_g*256; + new_b = fmin(roundf((old_b*w_old+obs_weight*new_b)/w_new),255.0f); + new_g = fmin(roundf((old_g*w_old+obs_weight*new_g)/w_new),255.0f); + new_r = fmin(roundf((old_r*w_old+obs_weight*new_r)/w_new),255.0f); + color_vol[voxel_idx] = new_b*256*256+new_g*256+new_r; + }""" + ) + + self._cuda_integrate = self._cuda_src_mod.get_function("integrate") + + # Determine block/grid size on GPU + gpu_dev = cuda.Device(0) + self._max_gpu_threads_per_block = gpu_dev.MAX_THREADS_PER_BLOCK + n_blocks = int( + np.ceil( + float(np.prod(self._vol_dim)) + / float(self._max_gpu_threads_per_block) + ) + ) + grid_dim_x = min(gpu_dev.MAX_GRID_DIM_X, int(np.floor(np.cbrt(n_blocks)))) + grid_dim_y = min( + gpu_dev.MAX_GRID_DIM_Y, int(np.floor(np.sqrt(n_blocks / grid_dim_x))) + ) + grid_dim_z = min( + gpu_dev.MAX_GRID_DIM_Z, + int(np.ceil(float(n_blocks) / float(grid_dim_x * grid_dim_y))), + ) + self._max_gpu_grid_dim = np.array( + [grid_dim_x, grid_dim_y, grid_dim_z] + ).astype(int) + self._n_gpu_loops = int( + np.ceil( + float(np.prod(self._vol_dim)) + / float( + np.prod(self._max_gpu_grid_dim) + * self._max_gpu_threads_per_block + ) + ) + ) + + else: + # Get voxel grid coordinates + xv, yv, zv = np.meshgrid( + range(self._vol_dim[0]), + range(self._vol_dim[1]), + range(self._vol_dim[2]), + indexing="ij", + ) + self.vox_coords = ( + np.concatenate( + [xv.reshape(1, -1), yv.reshape(1, -1), zv.reshape(1, -1)], axis=0 + ) + .astype(int) + .T + ) + + @staticmethod + @njit(parallel=True) + def vox2world(vol_origin, vox_coords, vox_size, offsets=(0.5, 0.5, 0.5)): + """Convert voxel grid coordinates to world coordinates.""" + vol_origin = vol_origin.astype(np.float32) + vox_coords = vox_coords.astype(np.float32) + # print(np.min(vox_coords)) + cam_pts = np.empty_like(vox_coords, dtype=np.float32) + + for i in prange(vox_coords.shape[0]): + for j in range(3): + cam_pts[i, j] = ( + vol_origin[j] + + (vox_size * vox_coords[i, j]) + + vox_size * offsets[j] + ) + return cam_pts + + @staticmethod + @njit(parallel=True) + def cam2pix(cam_pts, intr): + """Convert camera coordinates to pixel coordinates.""" + intr = intr.astype(np.float32) + fx, fy = intr[0, 0], intr[1, 1] + cx, cy = intr[0, 2], intr[1, 2] + pix = np.empty((cam_pts.shape[0], 2), dtype=np.int64) 
+ for i in prange(cam_pts.shape[0]): + pix[i, 0] = int(np.round((cam_pts[i, 0] * fx / cam_pts[i, 2]) + cx)) + pix[i, 1] = int(np.round((cam_pts[i, 1] * fy / cam_pts[i, 2]) + cy)) + return pix + + @staticmethod + @njit(parallel=True) + def integrate_tsdf(tsdf_vol, dist, w_old, obs_weight): + """Integrate the TSDF volume.""" + tsdf_vol_int = np.empty_like(tsdf_vol, dtype=np.float32) + # print(tsdf_vol.shape) + w_new = np.empty_like(w_old, dtype=np.float32) + for i in prange(len(tsdf_vol)): + w_new[i] = w_old[i] + obs_weight + tsdf_vol_int[i] = (w_old[i] * tsdf_vol[i] + obs_weight * dist[i]) / w_new[i] + return tsdf_vol_int, w_new + + def integrate(self, color_im, depth_im, cam_intr, cam_pose, obs_weight=1.0): + """Integrate an RGB-D frame into the TSDF volume. + + Args: + color_im (ndarray): An RGB image of shape (H, W, 3). + depth_im (ndarray): A depth image of shape (H, W). + cam_intr (ndarray): The camera intrinsics matrix of shape (3, 3). + cam_pose (ndarray): The camera pose (i.e. extrinsics) of shape (4, 4). + obs_weight (float): The weight to assign for the current observation. A higher + value + """ + im_h, im_w = depth_im.shape + + # Fold RGB color image into a single channel image + color_im = color_im.astype(np.float32) + color_im = np.floor( + color_im[..., 2] * self._color_const + + color_im[..., 1] * 256 + + color_im[..., 0] + ) + + if self.gpu_mode: # GPU mode: integrate voxel volume (calls CUDA kernel) + for gpu_loop_idx in range(self._n_gpu_loops): + self._cuda_integrate( + self._tsdf_vol_gpu, + self._weight_vol_gpu, + self._color_vol_gpu, + cuda.InOut(self._vol_dim.astype(np.float32)), + cuda.InOut(self._vol_origin.astype(np.float32)), + cuda.InOut(cam_intr.reshape(-1).astype(np.float32)), + cuda.InOut(cam_pose.reshape(-1).astype(np.float32)), + cuda.InOut( + np.asarray( + [ + gpu_loop_idx, + self._voxel_size, + im_h, + im_w, + self._trunc_margin, + obs_weight, + ], + np.float32, + ) + ), + cuda.InOut(color_im.reshape(-1).astype(np.float32)), + cuda.InOut(depth_im.reshape(-1).astype(np.float32)), + block=(self._max_gpu_threads_per_block, 1, 1), + grid=( + int(self._max_gpu_grid_dim[0]), + int(self._max_gpu_grid_dim[1]), + int(self._max_gpu_grid_dim[2]), + ), + ) + else: # CPU mode: integrate voxel volume (vectorized implementation) + # Convert voxel grid coordinates to pixel coordinates + cam_pts = self.vox2world( + self._vol_origin, self.vox_coords, self._voxel_size + ) + cam_pts = rigid_transform(cam_pts, np.linalg.inv(cam_pose)) + pix_z = cam_pts[:, 2] + pix = self.cam2pix(cam_pts, cam_intr) + pix_x, pix_y = pix[:, 0], pix[:, 1] + + # Eliminate pixels outside view frustum + valid_pix = np.logical_and( + pix_x >= 0, + np.logical_and( + pix_x < im_w, + np.logical_and(pix_y >= 0, np.logical_and(pix_y < im_h, pix_z > 0)), + ), + ) + depth_val = np.zeros(pix_x.shape) + depth_val[valid_pix] = depth_im[pix_y[valid_pix], pix_x[valid_pix]] + + # Integrate TSDF + depth_diff = depth_val - pix_z + + valid_pts = np.logical_and(depth_val > 0, depth_diff >= -10) + dist = depth_diff + + valid_vox_x = self.vox_coords[valid_pts, 0] + valid_vox_y = self.vox_coords[valid_pts, 1] + valid_vox_z = self.vox_coords[valid_pts, 2] + w_old = self._weight_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z] + tsdf_vals = self._tsdf_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z] + valid_dist = dist[valid_pts] + tsdf_vol_new, w_new = self.integrate_tsdf( + tsdf_vals, valid_dist, w_old, obs_weight + ) + self._weight_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z] = w_new + self._tsdf_vol_cpu[valid_vox_x, 
valid_vox_y, valid_vox_z] = tsdf_vol_new + + # Integrate color + old_color = self._color_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z] + old_b = np.floor(old_color / self._color_const) + old_g = np.floor((old_color - old_b * self._color_const) / 256) + old_r = old_color - old_b * self._color_const - old_g * 256 + new_color = color_im[pix_y[valid_pts], pix_x[valid_pts]] + new_b = np.floor(new_color / self._color_const) + new_g = np.floor((new_color - new_b * self._color_const) / 256) + new_r = new_color - new_b * self._color_const - new_g * 256 + new_b = np.minimum( + 255.0, np.round((w_old * old_b + obs_weight * new_b) / w_new) + ) + new_g = np.minimum( + 255.0, np.round((w_old * old_g + obs_weight * new_g) / w_new) + ) + new_r = np.minimum( + 255.0, np.round((w_old * old_r + obs_weight * new_r) / w_new) + ) + self._color_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z] = ( + new_b * self._color_const + new_g * 256 + new_r + ) + + def get_volume(self): + if self.gpu_mode: + cuda.memcpy_dtoh(self._tsdf_vol_cpu, self._tsdf_vol_gpu) + cuda.memcpy_dtoh(self._color_vol_cpu, self._color_vol_gpu) + return self._tsdf_vol_cpu, self._color_vol_cpu + + def get_point_cloud(self): + """Extract a point cloud from the voxel volume.""" + tsdf_vol, color_vol = self.get_volume() + + # Marching cubes + verts = measure.marching_cubes_lewiner(tsdf_vol, level=0)[0] + verts_ind = np.round(verts).astype(int) + verts = verts * self._voxel_size + self._vol_origin + + # Get vertex colors + rgb_vals = color_vol[verts_ind[:, 0], verts_ind[:, 1], verts_ind[:, 2]] + colors_b = np.floor(rgb_vals / self._color_const) + colors_g = np.floor((rgb_vals - colors_b * self._color_const) / 256) + colors_r = rgb_vals - colors_b * self._color_const - colors_g * 256 + colors = np.floor(np.asarray([colors_r, colors_g, colors_b])).T + colors = colors.astype(np.uint8) + + pc = np.hstack([verts, colors]) + return pc + + def get_mesh(self): + """Compute a mesh from the voxel volume using marching cubes.""" + tsdf_vol, color_vol = self.get_volume() + + # Marching cubes + verts, faces, norms, vals = measure.marching_cubes_lewiner(tsdf_vol, level=0) + verts_ind = np.round(verts).astype(int) + verts = ( + verts * self._voxel_size + self._vol_origin + ) # voxel grid coordinates to world coordinates + + # Get vertex colors + rgb_vals = color_vol[verts_ind[:, 0], verts_ind[:, 1], verts_ind[:, 2]] + colors_b = np.floor(rgb_vals / self._color_const) + colors_g = np.floor((rgb_vals - colors_b * self._color_const) / 256) + colors_r = rgb_vals - colors_b * self._color_const - colors_g * 256 + colors = np.floor(np.asarray([colors_r, colors_g, colors_b])).T + colors = colors.astype(np.uint8) + return verts, faces, norms, colors + + +def rigid_transform(xyz, transform): + """Applies a rigid transform to an (N, 3) pointcloud.""" + xyz_h = np.hstack([xyz, np.ones((len(xyz), 1), dtype=np.float32)]) + xyz_t_h = np.dot(transform, xyz_h.T).T + return xyz_t_h[:, :3] + + +def get_view_frustum(depth_im, cam_intr, cam_pose): + """Get corners of 3D camera view frustum of depth image""" + im_h = depth_im.shape[0] + im_w = depth_im.shape[1] + max_depth = np.max(depth_im) + view_frust_pts = np.array( + [ + (np.array([0, 0, 0, im_w, im_w]) - cam_intr[0, 2]) + * np.array([0, max_depth, max_depth, max_depth, max_depth]) + / cam_intr[0, 0], + (np.array([0, 0, im_h, 0, im_h]) - cam_intr[1, 2]) + * np.array([0, max_depth, max_depth, max_depth, max_depth]) + / cam_intr[1, 1], + np.array([0, max_depth, max_depth, max_depth, max_depth]), + ] + ) + view_frust_pts = 
rigid_transform(view_frust_pts.T, cam_pose).T + return view_frust_pts + + +def meshwrite(filename, verts, faces, norms, colors): + """Save a 3D mesh to a polygon .ply file.""" + # Write header + ply_file = open(filename, "w") + ply_file.write("ply\n") + ply_file.write("format ascii 1.0\n") + ply_file.write("element vertex %d\n" % (verts.shape[0])) + ply_file.write("property float x\n") + ply_file.write("property float y\n") + ply_file.write("property float z\n") + ply_file.write("property float nx\n") + ply_file.write("property float ny\n") + ply_file.write("property float nz\n") + ply_file.write("property uchar red\n") + ply_file.write("property uchar green\n") + ply_file.write("property uchar blue\n") + ply_file.write("element face %d\n" % (faces.shape[0])) + ply_file.write("property list uchar int vertex_index\n") + ply_file.write("end_header\n") + + # Write vertex list + for i in range(verts.shape[0]): + ply_file.write( + "%f %f %f %f %f %f %d %d %d\n" + % ( + verts[i, 0], + verts[i, 1], + verts[i, 2], + norms[i, 0], + norms[i, 1], + norms[i, 2], + colors[i, 0], + colors[i, 1], + colors[i, 2], + ) + ) + + # Write face list + for i in range(faces.shape[0]): + ply_file.write("3 %d %d %d\n" % (faces[i, 0], faces[i, 1], faces[i, 2])) + + ply_file.close() + + +def pcwrite(filename, xyzrgb): + """Save a point cloud to a polygon .ply file.""" + xyz = xyzrgb[:, :3] + rgb = xyzrgb[:, 3:].astype(np.uint8) + + # Write header + ply_file = open(filename, "w") + ply_file.write("ply\n") + ply_file.write("format ascii 1.0\n") + ply_file.write("element vertex %d\n" % (xyz.shape[0])) + ply_file.write("property float x\n") + ply_file.write("property float y\n") + ply_file.write("property float z\n") + ply_file.write("property uchar red\n") + ply_file.write("property uchar green\n") + ply_file.write("property uchar blue\n") + ply_file.write("end_header\n") + + # Write vertex list + for i in range(xyz.shape[0]): + ply_file.write( + "%f %f %f %d %d %d\n" + % ( + xyz[i, 0], + xyz[i, 1], + xyz[i, 2], + rgb[i, 0], + rgb[i, 1], + rgb[i, 2], + ) + ) diff --git a/monoscene/data/utils/helpers.py b/monoscene/data/utils/helpers.py new file mode 100644 index 0000000..8da3264 --- /dev/null +++ b/monoscene/data/utils/helpers.py @@ -0,0 +1,185 @@ +import numpy as np +import monoscene.data.utils.fusion as fusion +import torch + + +def compute_CP_mega_matrix(target, is_binary=False): + """ + Parameters + --------- + target: (H, W, D) + contains voxels semantic labels + + is_binary: bool + if True, return binary voxels relations else return 4-way relations + """ + label = target.reshape(-1) + label_row = label + N = label.shape[0] + super_voxel_size = [i//2 for i in target.shape] + if is_binary: + matrix = np.zeros((2, N, super_voxel_size[0] * super_voxel_size[1] * super_voxel_size[2]), dtype=np.uint8) + else: + matrix = np.zeros((4, N, super_voxel_size[0] * super_voxel_size[1] * super_voxel_size[2]), dtype=np.uint8) + + for xx in range(super_voxel_size[0]): + for yy in range(super_voxel_size[1]): + for zz in range(super_voxel_size[2]): + col_idx = xx * (super_voxel_size[1] * super_voxel_size[2]) + yy * super_voxel_size[2] + zz + label_col_megas = np.array([ + target[xx * 2, yy * 2, zz * 2], + target[xx * 2 + 1, yy * 2, zz * 2], + target[xx * 2, yy * 2 + 1, zz * 2], + target[xx * 2, yy * 2, zz * 2 + 1], + target[xx * 2 + 1, yy * 2 + 1, zz * 2], + target[xx * 2 + 1, yy * 2, zz * 2 + 1], + target[xx * 2, yy * 2 + 1, zz * 2 + 1], + target[xx * 2 + 1, yy * 2 + 1, zz * 2 + 1], + ]) + label_col_megas = 
label_col_megas[label_col_megas != 255] + for label_col_mega in label_col_megas: + label_col = np.ones(N) * label_col_mega + if not is_binary: + matrix[0, (label_row != 255) & (label_col == label_row) & (label_col != 0), col_idx] = 1.0 # non non same + matrix[1, (label_row != 255) & (label_col != label_row) & (label_col != 0) & (label_row != 0), col_idx] = 1.0 # non non diff + matrix[2, (label_row != 255) & (label_row == label_col) & (label_col == 0), col_idx] = 1.0 # empty empty + matrix[3, (label_row != 255) & (label_row != label_col) & ((label_row == 0) | (label_col == 0)), col_idx] = 1.0 # nonempty empty + else: + matrix[0, (label_row != 255) & (label_col != label_row), col_idx] = 1.0 # diff + matrix[1, (label_row != 255) & (label_col == label_row), col_idx] = 1.0 # same + return matrix + + +def vox2pix(cam_E, cam_k, + vox_origin, voxel_size, + img_W, img_H, + scene_size): + """ + compute the 2D projection of voxels centroids + + Parameters: + ---------- + cam_E: 4x4 + =camera pose in case of NYUv2 dataset + =Transformation from camera to lidar coordinate in case of SemKITTI + cam_k: 3x3 + camera intrinsics + vox_origin: (3,) + world(NYU)/lidar(SemKITTI) cooridnates of the voxel at index (0, 0, 0) + img_W: int + image width + img_H: int + image height + scene_size: (3,) + scene size in meter: (51.2, 51.2, 6.4) for SemKITTI and (4.8, 4.8, 2.88) for NYUv2 + + Returns + ------- + projected_pix: (N, 2) + Projected 2D positions of voxels + fov_mask: (N,) + Voxels mask indice voxels inside image's FOV + sensor_distance: (N,) + Voxels'distance to the sensor in meter + """ + # Compute the x, y, z bounding of the scene in meter + vol_bnds = np.zeros((3,2)) + vol_bnds[:,0] = vox_origin + vol_bnds[:,1] = vox_origin + np.array(scene_size) + + # Compute the voxels centroids in lidar cooridnates + vol_dim = np.ceil((vol_bnds[:,1]- vol_bnds[:,0])/ voxel_size).copy(order='C').astype(int) + xv, yv, zv = np.meshgrid( + range(vol_dim[0]), + range(vol_dim[1]), + range(vol_dim[2]), + indexing='ij' + ) + vox_coords = np.concatenate([ + xv.reshape(1,-1), + yv.reshape(1,-1), + zv.reshape(1,-1) + ], axis=0).astype(int).T + + # Project voxels'centroid from lidar coordinates to camera coordinates + cam_pts = fusion.TSDFVolume.vox2world(vox_origin, vox_coords, voxel_size) + cam_pts = fusion.rigid_transform(cam_pts, cam_E) + + # Project camera coordinates to pixel positions + projected_pix = fusion.TSDFVolume.cam2pix(cam_pts, cam_k) + pix_x, pix_y = projected_pix[:, 0], projected_pix[:, 1] + + # Eliminate pixels outside view frustum + sensor_distance = cam_pts[:, 2] + fov_mask = np.logical_and(pix_x >= 0, + np.logical_and(pix_x < img_W, + np.logical_and(pix_y >= 0, + np.logical_and(pix_y < img_H, + sensor_distance > 0)))) + + + return projected_pix, fov_mask, sensor_distance + + +def compute_local_frustum(pix_x, pix_y, min_x, max_x, min_y, max_y, sensor_distance): + valid_pix = np.logical_and(pix_x >= min_x, + np.logical_and(pix_x < max_x, + np.logical_and(pix_y >= min_y, + np.logical_and(pix_y < max_y, + sensor_distance > 0)))) + return valid_pix + +def compute_local_frustums(projected_pix, sensor_distance, target, img_W, img_H, dataset, n_classes, size=4): + """ + Compute the local frustums mask and their class frequencies + + Parameters: + ---------- + projected_pix: (N, 2) + 2D projected pix of all voxels + sensor_distance: (N,) + Distance of the camera sensor to voxels + target: (H, W, D) + Voxelized sematic labels + img_W: int + Image width + img_H: int + Image height + dataset: str + ="NYU" or "kitti" (for 
both SemKITTI and KITTI-360) + n_classes: int + Number of classes (12 for NYU and 20 for SemKITTI) + size: int + determine the number of local frustums i.e. size * size + + Returns + ------- + frustums_masks: (n_frustums, N) + List of frustums_masks, each indicates the belonging voxels + frustums_class_dists: (n_frustums, n_classes) + Contains the class frequencies in each frustum + """ + H, W, D = target.shape + ranges = [(i * 1.0/size, (i * 1.0 + 1)/size) for i in range(size)] + local_frustum_masks = [] + local_frustum_class_dists = [] + pix_x, pix_y = projected_pix[:, 0], projected_pix[:, 1] + for y in ranges: + for x in ranges: + start_x = x[0] * img_W + end_x = x[1] * img_W + start_y = y[0] * img_H + end_y = y[1] * img_H + local_frustum = compute_local_frustum(pix_x, pix_y, start_x, end_x, start_y, end_y, sensor_distance) + if dataset == "NYU": + mask = (target != 255) & np.moveaxis(local_frustum.reshape(60, 60, 36), [0, 1, 2], [0, 2, 1]) + elif dataset == "kitti": + mask = (target != 255) & local_frustum.reshape(H, W, D) + + local_frustum_masks.append(mask) + classes, cnts = np.unique(target[mask], return_counts=True) + class_counts = np.zeros(n_classes) + class_counts[classes.astype(int)] = cnts + local_frustum_class_dists.append(class_counts) + frustums_masks, frustums_class_dists = np.array(local_frustum_masks), np.array(local_frustum_class_dists) + return frustums_masks, frustums_class_dists diff --git a/monoscene/data/utils/torch_util.py b/monoscene/data/utils/torch_util.py new file mode 100644 index 0000000..a7d7ffd --- /dev/null +++ b/monoscene/data/utils/torch_util.py @@ -0,0 +1,15 @@ +import numpy as np +import torch + + +def worker_init_fn(worker_id): + """The function is designed for pytorch multi-process dataloader. + Note that we use the pytorch random generator to generate a base_seed. + Please try to be consistent. 
+ + References: + https://pytorch.org/docs/stable/notes/faq.html#dataloader-workers-random-seed + + """ + base_seed = torch.IntTensor(1).random_().item() + np.random.seed(base_seed + worker_id) diff --git a/monoscene/loss/CRP_loss.py b/monoscene/loss/CRP_loss.py new file mode 100644 index 0000000..b1e2797 --- /dev/null +++ b/monoscene/loss/CRP_loss.py @@ -0,0 +1,24 @@ +import torch + + +def compute_super_CP_multilabel_loss(pred_logits, CP_mega_matrices): + logits = [] + labels = [] + bs, n_relations, _, _ = pred_logits.shape + for i in range(bs): + pred_logit = pred_logits[i, :, :, :].permute( + 0, 2, 1 + ) # n_relations, N, n_mega_voxels + CP_mega_matrix = CP_mega_matrices[i] # n_relations, N, n_mega_voxels + logits.append(pred_logit.reshape(n_relations, -1)) + labels.append(CP_mega_matrix.reshape(n_relations, -1)) + + logits = torch.cat(logits, dim=1).T # M, 4 + labels = torch.cat(labels, dim=1).T # M, 4 + + cnt_neg = (labels == 0).sum(0) + cnt_pos = labels.sum(0) + pos_weight = cnt_neg / cnt_pos + criterion = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight) + loss_bce = criterion(logits, labels.float()) + return loss_bce diff --git a/monoscene/loss/sscMetrics.py b/monoscene/loss/sscMetrics.py new file mode 100644 index 0000000..f77be70 --- /dev/null +++ b/monoscene/loss/sscMetrics.py @@ -0,0 +1,204 @@ +""" +Part of the code is taken from https://github.com/waterljwant/SSC/blob/master/sscMetrics.py +""" +import numpy as np +from sklearn.metrics import accuracy_score, precision_recall_fscore_support + + +def get_iou(iou_sum, cnt_class): + _C = iou_sum.shape[0] # 12 + iou = np.zeros(_C, dtype=np.float32) # iou for each class + for idx in range(_C): + iou[idx] = iou_sum[idx] / cnt_class[idx] if cnt_class[idx] else 0 + + mean_iou = np.sum(iou[1:]) / np.count_nonzero(cnt_class[1:]) + return iou, mean_iou + + +def get_accuracy(predict, target, weight=None): # 0.05s + _bs = predict.shape[0] # batch size + _C = predict.shape[1] # _C = 12 + target = np.int32(target) + target = target.reshape(_bs, -1) # (_bs, 60*36*60) 129600 + predict = predict.reshape(_bs, _C, -1) # (_bs, _C, 60*36*60) + predict = np.argmax( + predict, axis=1 + ) # one-hot: _bs x _C x 60*36*60 --> label: _bs x 60*36*60. 
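A quick shape check for compute_super_CP_multilabel_loss defined above; the sizes are toy values, and the real N and number of mega voxels come from the 3D feature grid:

import torch
from monoscene.loss.CRP_loss import compute_super_CP_multilabel_loss

# Hedged sketch with toy sizes; pred_logits is (bs, n_relations, n_mega_voxels, N)
# and each CP mega matrix is (n_relations, N, n_mega_voxels), matching the code above.
bs, n_relations, n_mega_voxels, N = 1, 4, 8, 64
pred_logits = torch.randn(bs, n_relations, n_mega_voxels, N)
CP_mega_matrices = [
    torch.randint(0, 2, (n_relations, N, n_mega_voxels)) for _ in range(bs)
]
loss = compute_super_CP_multilabel_loss(pred_logits, CP_mega_matrices)
print(loss.item())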
+ + correct = predict == target # (_bs, 129600) + if weight: # 0.04s, add class weights + weight_k = np.ones(target.shape) + for i in range(_bs): + for n in range(target.shape[1]): + idx = 0 if target[i, n] == 255 else target[i, n] + weight_k[i, n] = weight[idx] + correct = correct * weight_k + acc = correct.sum() / correct.size + return acc + + +class SSCMetrics: + def __init__(self, n_classes): + self.n_classes = n_classes + self.reset() + + def hist_info(self, n_cl, pred, gt): + assert pred.shape == gt.shape + k = (gt >= 0) & (gt < n_cl) # exclude 255 + labeled = np.sum(k) + correct = np.sum((pred[k] == gt[k])) + + return ( + np.bincount( + n_cl * gt[k].astype(int) + pred[k].astype(int), minlength=n_cl ** 2 + ).reshape(n_cl, n_cl), + correct, + labeled, + ) + + @staticmethod + def compute_score(hist, correct, labeled): + iu = np.diag(hist) / (hist.sum(1) + hist.sum(0) - np.diag(hist)) + mean_IU = np.nanmean(iu) + mean_IU_no_back = np.nanmean(iu[1:]) + freq = hist.sum(1) / hist.sum() + freq_IU = (iu[freq > 0] * freq[freq > 0]).sum() + mean_pixel_acc = correct / labeled if labeled != 0 else 0 + + return iu, mean_IU, mean_IU_no_back, mean_pixel_acc + + def add_batch(self, y_pred, y_true, nonempty=None, nonsurface=None): + self.count += 1 + mask = y_true != 255 + if nonempty is not None: + mask = mask & nonempty + if nonsurface is not None: + mask = mask & nonsurface + tp, fp, fn = self.get_score_completion(y_pred, y_true, mask) + + self.completion_tp += tp + self.completion_fp += fp + self.completion_fn += fn + + mask = y_true != 255 + if nonempty is not None: + mask = mask & nonempty + tp_sum, fp_sum, fn_sum = self.get_score_semantic_and_completion( + y_pred, y_true, mask + ) + self.tps += tp_sum + self.fps += fp_sum + self.fns += fn_sum + + def get_stats(self): + if self.completion_tp != 0: + precision = self.completion_tp / (self.completion_tp + self.completion_fp) + recall = self.completion_tp / (self.completion_tp + self.completion_fn) + iou = self.completion_tp / ( + self.completion_tp + self.completion_fp + self.completion_fn + ) + else: + precision, recall, iou = 0, 0, 0 + iou_ssc = self.tps / (self.tps + self.fps + self.fns + 1e-5) + return { + "precision": precision, + "recall": recall, + "iou": iou, + "iou_ssc": iou_ssc, + "iou_ssc_mean": np.mean(iou_ssc[1:]), + } + + def reset(self): + + self.completion_tp = 0 + self.completion_fp = 0 + self.completion_fn = 0 + self.tps = np.zeros(self.n_classes) + self.fps = np.zeros(self.n_classes) + self.fns = np.zeros(self.n_classes) + + self.hist_ssc = np.zeros((self.n_classes, self.n_classes)) + self.labeled_ssc = 0 + self.correct_ssc = 0 + + self.precision = 0 + self.recall = 0 + self.iou = 0 + self.count = 1e-8 + self.iou_ssc = np.zeros(self.n_classes, dtype=np.float32) + self.cnt_class = np.zeros(self.n_classes, dtype=np.float32) + + def get_score_completion(self, predict, target, nonempty=None): + predict = np.copy(predict) + target = np.copy(target) + + """for scene completion, treat the task as two-classes problem, just empty or occupancy""" + _bs = predict.shape[0] # batch size + # ---- ignore + predict[target == 255] = 0 + target[target == 255] = 0 + # ---- flatten + target = target.reshape(_bs, -1) # (_bs, 129600) + predict = predict.reshape(_bs, -1) # (_bs, _C, 129600), 60*36*60=129600 + # ---- treat all non-empty object class as one category, set them to label 1 + b_pred = np.zeros(predict.shape) + b_true = np.zeros(target.shape) + b_pred[predict > 0] = 1 + b_true[target > 0] = 1 + p, r, iou = 0.0, 0.0, 0.0 + tp_sum, fp_sum, 
fn_sum = 0, 0, 0 + for idx in range(_bs): + y_true = b_true[idx, :] # GT + y_pred = b_pred[idx, :] + if nonempty is not None: + nonempty_idx = nonempty[idx, :].reshape(-1) + y_true = y_true[nonempty_idx == 1] + y_pred = y_pred[nonempty_idx == 1] + + tp = np.array(np.where(np.logical_and(y_true == 1, y_pred == 1))).size + fp = np.array(np.where(np.logical_and(y_true != 1, y_pred == 1))).size + fn = np.array(np.where(np.logical_and(y_true == 1, y_pred != 1))).size + tp_sum += tp + fp_sum += fp + fn_sum += fn + return tp_sum, fp_sum, fn_sum + + def get_score_semantic_and_completion(self, predict, target, nonempty=None): + target = np.copy(target) + predict = np.copy(predict) + _bs = predict.shape[0] # batch size + _C = self.n_classes # _C = 12 + # ---- ignore + predict[target == 255] = 0 + target[target == 255] = 0 + # ---- flatten + target = target.reshape(_bs, -1) # (_bs, 129600) + predict = predict.reshape(_bs, -1) # (_bs, 129600), 60*36*60=129600 + + cnt_class = np.zeros(_C, dtype=np.int32) # count for each class + iou_sum = np.zeros(_C, dtype=np.float32) # sum of iou for each class + tp_sum = np.zeros(_C, dtype=np.int32) # tp + fp_sum = np.zeros(_C, dtype=np.int32) # fp + fn_sum = np.zeros(_C, dtype=np.int32) # fn + + for idx in range(_bs): + y_true = target[idx, :] # GT + y_pred = predict[idx, :] + if nonempty is not None: + nonempty_idx = nonempty[idx, :].reshape(-1) + y_pred = y_pred[ + np.where(np.logical_and(nonempty_idx == 1, y_true != 255)) + ] + y_true = y_true[ + np.where(np.logical_and(nonempty_idx == 1, y_true != 255)) + ] + for j in range(_C): # for each class + tp = np.array(np.where(np.logical_and(y_true == j, y_pred == j))).size + fp = np.array(np.where(np.logical_and(y_true != j, y_pred == j))).size + fn = np.array(np.where(np.logical_and(y_true == j, y_pred != j))).size + + tp_sum[j] += tp + fp_sum[j] += fp + fn_sum[j] += fn + + return tp_sum, fp_sum, fn_sum diff --git a/monoscene/loss/ssc_loss.py b/monoscene/loss/ssc_loss.py new file mode 100644 index 0000000..941d2ee --- /dev/null +++ b/monoscene/loss/ssc_loss.py @@ -0,0 +1,99 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + + +def KL_sep(p, target): + """ + KL divergence on nonzeros classes + """ + nonzeros = target != 0 + nonzero_p = p[nonzeros] + kl_term = F.kl_div(torch.log(nonzero_p), target[nonzeros], reduction="sum") + return kl_term + + +def geo_scal_loss(pred, ssc_target): + + # Get softmax probabilities + pred = F.softmax(pred, dim=1) + + # Compute empty and nonempty probabilities + empty_probs = pred[:, 0, :, :, :] + nonempty_probs = 1 - empty_probs + + # Remove unknown voxels + mask = ssc_target != 255 + nonempty_target = ssc_target != 0 + nonempty_target = nonempty_target[mask].float() + nonempty_probs = nonempty_probs[mask] + empty_probs = empty_probs[mask] + + intersection = (nonempty_target * nonempty_probs).sum() + precision = intersection / nonempty_probs.sum() + recall = intersection / nonempty_target.sum() + spec = ((1 - nonempty_target) * (empty_probs)).sum() / (1 - nonempty_target).sum() + return ( + F.binary_cross_entropy(precision, torch.ones_like(precision)) + + F.binary_cross_entropy(recall, torch.ones_like(recall)) + + F.binary_cross_entropy(spec, torch.ones_like(spec)) + ) + + +def sem_scal_loss(pred, ssc_target): + # Get softmax probabilities + pred = F.softmax(pred, dim=1) + loss = 0 + count = 0 + mask = ssc_target != 255 + n_classes = pred.shape[1] + for i in range(0, n_classes): + + # Get probability of class i + p = pred[:, i, :, :, :] + + # Remove unknown 
voxels + target_ori = ssc_target + p = p[mask] + target = ssc_target[mask] + + completion_target = torch.ones_like(target) + completion_target[target != i] = 0 + completion_target_ori = torch.ones_like(target_ori).float() + completion_target_ori[target_ori != i] = 0 + if torch.sum(completion_target) > 0: + count += 1.0 + nominator = torch.sum(p * completion_target) + loss_class = 0 + if torch.sum(p) > 0: + precision = nominator / (torch.sum(p)) + loss_precision = F.binary_cross_entropy( + precision, torch.ones_like(precision) + ) + loss_class += loss_precision + if torch.sum(completion_target) > 0: + recall = nominator / (torch.sum(completion_target)) + loss_recall = F.binary_cross_entropy(recall, torch.ones_like(recall)) + loss_class += loss_recall + if torch.sum(1 - completion_target) > 0: + specificity = torch.sum((1 - p) * (1 - completion_target)) / ( + torch.sum(1 - completion_target) + ) + loss_specificity = F.binary_cross_entropy( + specificity, torch.ones_like(specificity) + ) + loss_class += loss_specificity + loss += loss_class + return loss / count + + +def CE_ssc_loss(pred, target, class_weights): + """ + :param: prediction: the predicted tensor, must be [BS, C, H, W, D] + """ + criterion = nn.CrossEntropyLoss( + weight=class_weights, ignore_index=255, reduction="mean" + ) + loss = criterion(pred, target.long()) + + return loss diff --git a/monoscene/models/CRP3D.py b/monoscene/models/CRP3D.py new file mode 100644 index 0000000..682a00c --- /dev/null +++ b/monoscene/models/CRP3D.py @@ -0,0 +1,97 @@ +import torch +import torch.nn as nn +from monoscene.models.modules import ( + Process, + ASPP, +) + + +class CPMegaVoxels(nn.Module): + def __init__(self, feature, size, n_relations=4, bn_momentum=0.0003): + super().__init__() + self.size = size + self.n_relations = n_relations + print("n_relations", self.n_relations) + self.flatten_size = size[0] * size[1] * size[2] + self.feature = feature + self.context_feature = feature * 2 + self.flatten_context_size = (size[0] // 2) * (size[1] // 2) * (size[2] // 2) + padding = ((size[0] + 1) % 2, (size[1] + 1) % 2, (size[2] + 1) % 2) + + self.mega_context = nn.Sequential( + nn.Conv3d( + feature, self.context_feature, stride=2, padding=padding, kernel_size=3 + ), + ) + self.flatten_context_size = (size[0] // 2) * (size[1] // 2) * (size[2] // 2) + + self.context_prior_logits = nn.ModuleList( + [ + nn.Sequential( + nn.Conv3d( + self.feature, + self.flatten_context_size, + padding=0, + kernel_size=1, + ), + ) + for i in range(n_relations) + ] + ) + self.aspp = ASPP(feature, [1, 2, 3]) + + self.resize = nn.Sequential( + nn.Conv3d( + self.context_feature * self.n_relations + feature, + feature, + kernel_size=1, + padding=0, + bias=False, + ), + Process(feature, nn.BatchNorm3d, bn_momentum, dilations=[1]), + ) + + def forward(self, input): + ret = {} + bs = input.shape[0] + + x_agg = self.aspp(input) + + # get the mega context + x_mega_context_raw = self.mega_context(x_agg) + x_mega_context = x_mega_context_raw.reshape(bs, self.context_feature, -1) + x_mega_context = x_mega_context.permute(0, 2, 1) + + # get context prior map + x_context_prior_logits = [] + x_context_rels = [] + for rel in range(self.n_relations): + + # Compute the relation matrices + x_context_prior_logit = self.context_prior_logits[rel](x_agg) + x_context_prior_logit = x_context_prior_logit.reshape( + bs, self.flatten_context_size, self.flatten_size + ) + x_context_prior_logits.append(x_context_prior_logit.unsqueeze(1)) + + x_context_prior_logit = x_context_prior_logit.permute(0, 
2, 1) + x_context_prior = torch.sigmoid(x_context_prior_logit) + + # Multiply the relation matrices with the mega context to gather context features + x_context_rel = torch.bmm(x_context_prior, x_mega_context) # bs, N, f + x_context_rels.append(x_context_rel) + + x_context = torch.cat(x_context_rels, dim=2) + x_context = x_context.permute(0, 2, 1) + x_context = x_context.reshape( + bs, x_context.shape[1], self.size[0], self.size[1], self.size[2] + ) + + x = torch.cat([input, x_context], dim=1) + x = self.resize(x) + + x_context_prior_logits = torch.cat(x_context_prior_logits, dim=1) + ret["P_logits"] = x_context_prior_logits + ret["x"] = x + + return ret diff --git a/monoscene/models/DDR.py b/monoscene/models/DDR.py new file mode 100644 index 0000000..0d0928c --- /dev/null +++ b/monoscene/models/DDR.py @@ -0,0 +1,139 @@ +""" +Most of the code in this file is taken from https://github.com/waterljwant/SSC/blob/master/models/DDR.py +""" + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class SimpleRB(nn.Module): + def __init__(self, in_channel, norm_layer, bn_momentum): + super(SimpleRB, self).__init__() + self.path = nn.Sequential( + nn.Conv3d(in_channel, in_channel, kernel_size=3, padding=1, bias=False), + norm_layer(in_channel, momentum=bn_momentum), + nn.ReLU(), + nn.Conv3d(in_channel, in_channel, kernel_size=3, padding=1, bias=False), + norm_layer(in_channel, momentum=bn_momentum), + ) + self.relu = nn.ReLU() + + def forward(self, x): + residual = x + conv_path = self.path(x) + out = residual + conv_path + out = self.relu(out) + return out + + +""" +3D Residual Block,3x3x3 conv ==> 3 smaller 3D conv, refered from DDRNet +""" + + +class Bottleneck3D(nn.Module): + def __init__( + self, + inplanes, + planes, + norm_layer, + stride=1, + dilation=[1, 1, 1], + expansion=4, + downsample=None, + fist_dilation=1, + multi_grid=1, + bn_momentum=0.0003, + ): + super(Bottleneck3D, self).__init__() + # often,planes = inplanes // 4 + self.expansion = expansion + self.conv1 = nn.Conv3d(inplanes, planes, kernel_size=1, bias=False) + self.bn1 = norm_layer(planes, momentum=bn_momentum) + self.conv2 = nn.Conv3d( + planes, + planes, + kernel_size=(1, 1, 3), + stride=(1, 1, stride), + dilation=(1, 1, dilation[0]), + padding=(0, 0, dilation[0]), + bias=False, + ) + self.bn2 = norm_layer(planes, momentum=bn_momentum) + self.conv3 = nn.Conv3d( + planes, + planes, + kernel_size=(1, 3, 1), + stride=(1, stride, 1), + dilation=(1, dilation[1], 1), + padding=(0, dilation[1], 0), + bias=False, + ) + self.bn3 = norm_layer(planes, momentum=bn_momentum) + self.conv4 = nn.Conv3d( + planes, + planes, + kernel_size=(3, 1, 1), + stride=(stride, 1, 1), + dilation=(dilation[2], 1, 1), + padding=(dilation[2], 0, 0), + bias=False, + ) + self.bn4 = norm_layer(planes, momentum=bn_momentum) + self.conv5 = nn.Conv3d( + planes, planes * self.expansion, kernel_size=(1, 1, 1), bias=False + ) + self.bn5 = norm_layer(planes * self.expansion, momentum=bn_momentum) + + self.relu = nn.ReLU(inplace=False) + self.relu_inplace = nn.ReLU(inplace=True) + self.downsample = downsample + self.dilation = dilation + self.stride = stride + + self.downsample2 = nn.Sequential( + nn.AvgPool3d(kernel_size=(1, stride, 1), stride=(1, stride, 1)), + nn.Conv3d(planes, planes, kernel_size=1, stride=1, bias=False), + norm_layer(planes, momentum=bn_momentum), + ) + self.downsample3 = nn.Sequential( + nn.AvgPool3d(kernel_size=(stride, 1, 1), stride=(stride, 1, 1)), + nn.Conv3d(planes, planes, kernel_size=1, stride=1, bias=False), + 
norm_layer(planes, momentum=bn_momentum), + ) + self.downsample4 = nn.Sequential( + nn.AvgPool3d(kernel_size=(stride, 1, 1), stride=(stride, 1, 1)), + nn.Conv3d(planes, planes, kernel_size=1, stride=1, bias=False), + norm_layer(planes, momentum=bn_momentum), + ) + + def forward(self, x): + residual = x + + out1 = self.relu(self.bn1(self.conv1(x))) + out2 = self.bn2(self.conv2(out1)) + out2_relu = self.relu(out2) + + out3 = self.bn3(self.conv3(out2_relu)) + if self.stride != 1: + out2 = self.downsample2(out2) + out3 = out3 + out2 + out3_relu = self.relu(out3) + + out4 = self.bn4(self.conv4(out3_relu)) + if self.stride != 1: + out2 = self.downsample3(out2) + out3 = self.downsample4(out3) + out4 = out4 + out2 + out3 + + out4_relu = self.relu(out4) + out5 = self.bn5(self.conv5(out4_relu)) + + if self.downsample is not None: + residual = self.downsample(x) + + out = out5 + residual + out_relu = self.relu(out) + + return out_relu diff --git a/monoscene/models/flosp.py b/monoscene/models/flosp.py new file mode 100644 index 0000000..2d50219 --- /dev/null +++ b/monoscene/models/flosp.py @@ -0,0 +1,41 @@ +import torch +import torch.nn as nn + + +class FLoSP(nn.Module): + def __init__(self, scene_size, dataset, project_scale): + super().__init__() + self.scene_size = scene_size + self.dataset = dataset + self.project_scale = project_scale + + def forward(self, x2d, projected_pix, fov_mask): + c, h, w = x2d.shape + + src = x2d.view(c, -1) + zeros_vec = torch.zeros(c, 1).type_as(src) + src = torch.cat([src, zeros_vec], 1) + + pix_x, pix_y = projected_pix[:, 0], projected_pix[:, 1] + img_indices = pix_y * w + pix_x + img_indices[~fov_mask] = h * w + img_indices = img_indices.expand(c, -1).long() # c, HWD + src_feature = torch.gather(src, 1, img_indices) + + if self.dataset == "NYU": + x3d = src_feature.reshape( + c, + self.scene_size[0] // self.project_scale, + self.scene_size[2] // self.project_scale, + self.scene_size[1] // self.project_scale, + ) + x3d = x3d.permute(0, 1, 3, 2) + elif self.dataset == "kitti": + x3d = src_feature.reshape( + c, + self.scene_size[0] // self.project_scale, + self.scene_size[1] // self.project_scale, + self.scene_size[2] // self.project_scale, + ) + + return x3d diff --git a/monoscene/models/modules.py b/monoscene/models/modules.py new file mode 100644 index 0000000..42a892c --- /dev/null +++ b/monoscene/models/modules.py @@ -0,0 +1,194 @@ +import torch +import torch.nn as nn +from monoscene.models.DDR import Bottleneck3D + + +class ASPP(nn.Module): + """ + ASPP 3D + Adapt from https://github.com/cv-rits/LMSCNet/blob/main/LMSCNet/models/LMSCNet.py#L7 + """ + + def __init__(self, planes, dilations_conv_list): + super().__init__() + + # ASPP Block + self.conv_list = dilations_conv_list + self.conv1 = nn.ModuleList( + [ + nn.Conv3d( + planes, planes, kernel_size=3, padding=dil, dilation=dil, bias=False + ) + for dil in dilations_conv_list + ] + ) + self.bn1 = nn.ModuleList( + [nn.BatchNorm3d(planes) for dil in dilations_conv_list] + ) + self.conv2 = nn.ModuleList( + [ + nn.Conv3d( + planes, planes, kernel_size=3, padding=dil, dilation=dil, bias=False + ) + for dil in dilations_conv_list + ] + ) + self.bn2 = nn.ModuleList( + [nn.BatchNorm3d(planes) for dil in dilations_conv_list] + ) + self.relu = nn.ReLU() + + def forward(self, x_in): + + y = self.bn2[0](self.conv2[0](self.relu(self.bn1[0](self.conv1[0](x_in))))) + for i in range(1, len(self.conv_list)): + y += self.bn2[i](self.conv2[i](self.relu(self.bn1[i](self.conv1[i](x_in))))) + x_in = self.relu(y + x_in) # modified + + 
return x_in + + +class SegmentationHead(nn.Module): + """ + 3D Segmentation heads to retrieve semantic segmentation at each scale. + Formed by Dim expansion, Conv3D, ASPP block, Conv3D. + Taken from https://github.com/cv-rits/LMSCNet/blob/main/LMSCNet/models/LMSCNet.py#L7 + """ + + def __init__(self, inplanes, planes, nbr_classes, dilations_conv_list): + super().__init__() + + # First convolution + self.conv0 = nn.Conv3d(inplanes, planes, kernel_size=3, padding=1, stride=1) + + # ASPP Block + self.conv_list = dilations_conv_list + self.conv1 = nn.ModuleList( + [ + nn.Conv3d( + planes, planes, kernel_size=3, padding=dil, dilation=dil, bias=False + ) + for dil in dilations_conv_list + ] + ) + self.bn1 = nn.ModuleList( + [nn.BatchNorm3d(planes) for dil in dilations_conv_list] + ) + self.conv2 = nn.ModuleList( + [ + nn.Conv3d( + planes, planes, kernel_size=3, padding=dil, dilation=dil, bias=False + ) + for dil in dilations_conv_list + ] + ) + self.bn2 = nn.ModuleList( + [nn.BatchNorm3d(planes) for dil in dilations_conv_list] + ) + self.relu = nn.ReLU() + + self.conv_classes = nn.Conv3d( + planes, nbr_classes, kernel_size=3, padding=1, stride=1 + ) + + def forward(self, x_in): + + # Convolution to go from inplanes to planes features... + x_in = self.relu(self.conv0(x_in)) + + y = self.bn2[0](self.conv2[0](self.relu(self.bn1[0](self.conv1[0](x_in))))) + for i in range(1, len(self.conv_list)): + y += self.bn2[i](self.conv2[i](self.relu(self.bn1[i](self.conv1[i](x_in))))) + x_in = self.relu(y + x_in) # modified + + x_in = self.conv_classes(x_in) + + return x_in + + +class ProcessKitti(nn.Module): + def __init__(self, feature, norm_layer, bn_momentum, dilations=[1, 2, 3]): + super(Process, self).__init__() + self.main = nn.Sequential( + *[ + Bottleneck3D( + feature, + feature // 4, + bn_momentum=bn_momentum, + norm_layer=norm_layer, + dilation=[i, i, i], + ) + for i in dilations + ] + ) + + def forward(self, x): + return self.main(x) + + +class Process(nn.Module): + def __init__(self, feature, norm_layer, bn_momentum, dilations=[1, 2, 3]): + super(Process, self).__init__() + self.main = nn.Sequential( + *[ + Bottleneck3D( + feature, + feature // 4, + bn_momentum=bn_momentum, + norm_layer=norm_layer, + dilation=[i, i, i], + ) + for i in dilations + ] + ) + + def forward(self, x): + return self.main(x) + + +class Upsample(nn.Module): + def __init__(self, in_channels, out_channels, norm_layer, bn_momentum): + super(Upsample, self).__init__() + self.main = nn.Sequential( + nn.ConvTranspose3d( + in_channels, + out_channels, + kernel_size=3, + stride=2, + padding=1, + dilation=1, + output_padding=1, + ), + norm_layer(out_channels, momentum=bn_momentum), + nn.ReLU(), + ) + + def forward(self, x): + return self.main(x) + + +class Downsample(nn.Module): + def __init__(self, feature, norm_layer, bn_momentum, expansion=8): + super(Downsample, self).__init__() + self.main = Bottleneck3D( + feature, + feature // 4, + bn_momentum=bn_momentum, + expansion=expansion, + stride=2, + downsample=nn.Sequential( + nn.AvgPool3d(kernel_size=2, stride=2), + nn.Conv3d( + feature, + int(feature * expansion / 4), + kernel_size=1, + stride=1, + bias=False, + ), + norm_layer(int(feature * expansion / 4), momentum=bn_momentum), + ), + norm_layer=norm_layer, + ) + + def forward(self, x): + return self.main(x) diff --git a/monoscene/models/monoscene.py b/monoscene/models/monoscene.py new file mode 100644 index 0000000..d85730d --- /dev/null +++ b/monoscene/models/monoscene.py @@ -0,0 +1,290 @@ +import pytorch_lightning as pl 
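Before the MonoScene LightningModule below, a toy shape walk-through of the FLoSP projection and SegmentationHead defined above; every size here is illustrative, not the actual KITTI resolution:

import torch
from monoscene.models.flosp import FLoSP
from monoscene.models.modules import SegmentationHead

# Hedged sketch with toy sizes; FLoSP gathers one 2D feature vector per voxel.
scene_size, project_scale = (8, 8, 4), 1        # toy grid: N = 8 * 8 * 4 = 256 voxels
flosp = FLoSP(scene_size, dataset="kitti", project_scale=project_scale)

c, h, w = 16, 10, 12
x2d = torch.randn(c, h, w)                      # 2D feature map for one image
N = 8 * 8 * 4
projected_pix = torch.stack(
    [torch.randint(0, w, (N,)), torch.randint(0, h, (N,))], dim=1
)                                                # (N, 2) pixel coordinates per voxel
fov_mask = torch.rand(N) > 0.1                  # voxels outside the FOV receive zero features
x3d = flosp(x2d, projected_pix, fov_mask)       # (c, 8, 8, 4)

# SegmentationHead keeps the spatial resolution and outputs per-class logits.
head = SegmentationHead(inplanes=c, planes=8, nbr_classes=20, dilations_conv_list=[1, 2, 3])
logits = head(x3d.unsqueeze(0))                 # (1, 20, 8, 8, 4)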
+import torch +import torch.nn as nn +from monoscene.models.unet3d_nyu import UNet3D as UNet3DNYU +from monoscene.models.unet3d_kitti import UNet3D as UNet3DKitti +from monoscene.loss.sscMetrics import SSCMetrics +from monoscene.loss.ssc_loss import sem_scal_loss, CE_ssc_loss, KL_sep, geo_scal_loss +from monoscene.models.flosp import FLoSP +from monoscene.loss.CRP_loss import compute_super_CP_multilabel_loss +import numpy as np +import torch.nn.functional as F +from monoscene.models.unet2d import UNet2D +from torch.optim.lr_scheduler import MultiStepLR + + +class MonoScene(pl.LightningModule): + def __init__( + self, + n_classes, + class_names, + feature, + class_weights, + project_scale, + full_scene_size, + dataset, + n_relations=4, + context_prior=True, + fp_loss=True, + project_res=[], + frustum_size=4, + relation_loss=False, + CE_ssc_loss=True, + geo_scal_loss=True, + sem_scal_loss=True, + lr=1e-4, + weight_decay=1e-4, + ): + super().__init__() + + self.project_res = project_res + self.fp_loss = fp_loss + self.dataset = dataset + self.context_prior = context_prior + self.frustum_size = frustum_size + self.class_names = class_names + self.relation_loss = relation_loss + self.CE_ssc_loss = CE_ssc_loss + self.sem_scal_loss = sem_scal_loss + self.geo_scal_loss = geo_scal_loss + self.project_scale = project_scale + self.class_weights = class_weights + self.lr = lr + self.weight_decay = weight_decay + + self.projects = {} + self.scale_2ds = [1, 2, 4, 8] # 2D scales + for scale_2d in self.scale_2ds: + self.projects[str(scale_2d)] = FLoSP( + full_scene_size, project_scale=self.project_scale, dataset=self.dataset + ) + self.projects = nn.ModuleDict(self.projects) + + self.n_classes = n_classes + if self.dataset == "NYU": + self.net_3d_decoder = UNet3DNYU( + self.n_classes, + nn.BatchNorm3d, + n_relations=n_relations, + feature=feature, + full_scene_size=full_scene_size, + context_prior=context_prior, + ) + elif self.dataset == "kitti": + self.net_3d_decoder = UNet3DKitti( + self.n_classes, + nn.BatchNorm3d, + project_scale=project_scale, + feature=feature, + full_scene_size=full_scene_size, + context_prior=context_prior, + ) + self.net_rgb = UNet2D.build(out_feature=feature, use_decoder=True) + + # log hyperparameters + self.save_hyperparameters() + + self.train_metrics = SSCMetrics(self.n_classes) + self.val_metrics = SSCMetrics(self.n_classes) + self.test_metrics = SSCMetrics(self.n_classes) + + def forward(self, batch): + + img = batch["img"] + bs = len(img) + + out = {} + + x_rgb = self.net_rgb(img) + + x3ds = [] + for i in range(bs): + x3d = None + for scale_2d in self.project_res: + + # project features at each 2D scale to target 3D scale + scale_2d = int(scale_2d) + projected_pix = batch["projected_pix_{}".format(self.project_scale)][i].cuda() + fov_mask = batch["fov_mask_{}".format(self.project_scale)][i].cuda() + + # Sum all the 3D features + if x3d is None: + x3d = self.projects[str(scale_2d)]( + x_rgb["1_" + str(scale_2d)][i], + projected_pix // scale_2d, + fov_mask, + ) + else: + x3d += self.projects[str(scale_2d)]( + x_rgb["1_" + str(scale_2d)][i], + projected_pix // scale_2d, + fov_mask, + ) + x3ds.append(x3d) + + input_dict = { + "x3d": torch.stack(x3ds), + } + + out = self.net_3d_decoder(input_dict) + + return out + + def step(self, batch, step_type, metric): + bs = len(batch["img"]) + loss = 0 + out_dict = self(batch) + ssc_pred = out_dict["ssc_logit"] + target = batch["target"] + + if self.context_prior: + P_logits = out_dict["P_logits"] + CP_mega_matrices = 
batch["CP_mega_matrices"] + + if self.relation_loss: + loss_rel_ce = compute_super_CP_multilabel_loss( + P_logits, CP_mega_matrices + ) + loss += loss_rel_ce + self.log( + step_type + "/loss_relation_ce_super", + loss_rel_ce.detach(), + on_epoch=True, + sync_dist=True, + ) + + class_weight = self.class_weights.type_as(batch["img"]) + if self.CE_ssc_loss: + loss_ssc = CE_ssc_loss(ssc_pred, target, class_weight) + loss += loss_ssc + self.log( + step_type + "/loss_ssc", + loss_ssc.detach(), + on_epoch=True, + sync_dist=True, + ) + + if self.sem_scal_loss: + loss_sem_scal = sem_scal_loss(ssc_pred, target) + loss += loss_sem_scal + self.log( + step_type + "/loss_sem_scal", + loss_sem_scal.detach(), + on_epoch=True, + sync_dist=True, + ) + + if self.geo_scal_loss: + loss_geo_scal = geo_scal_loss(ssc_pred, target) + loss += loss_geo_scal + self.log( + step_type + "/loss_geo_scal", + loss_geo_scal.detach(), + on_epoch=True, + sync_dist=True, + ) + + if self.fp_loss and step_type != "test": + frustums_masks = torch.stack(batch["frustums_masks"]) + frustums_class_dists = torch.stack( + batch["frustums_class_dists"] + ).float() # (bs, n_frustums, n_classes) + n_frustums = frustums_class_dists.shape[1] + + pred_prob = F.softmax(ssc_pred, dim=1) + batch_cnt = frustums_class_dists.sum(0) # (n_frustums, n_classes) + + frustum_loss = 0 + frustum_nonempty = 0 + for frus in range(n_frustums): + frustum_mask = frustums_masks[:, frus, :, :, :].unsqueeze(1).float() + prob = frustum_mask * pred_prob # bs, n_classes, H, W, D + prob = prob.reshape(bs, self.n_classes, -1).permute(1, 0, 2) + prob = prob.reshape(self.n_classes, -1) + cum_prob = prob.sum(dim=1) # n_classes + + total_cnt = torch.sum(batch_cnt[frus]) + total_prob = prob.sum() + if total_prob > 0 and total_cnt > 0: + frustum_target_proportion = batch_cnt[frus] / total_cnt + cum_prob = cum_prob / total_prob # n_classes + frustum_loss_i = KL_sep(cum_prob, frustum_target_proportion) + frustum_loss += frustum_loss_i + frustum_nonempty += 1 + frustum_loss = frustum_loss / frustum_nonempty + loss += frustum_loss + self.log( + step_type + "/loss_frustums", + frustum_loss.detach(), + on_epoch=True, + sync_dist=True, + ) + + y_true = target.cpu().numpy() + y_pred = ssc_pred.detach().cpu().numpy() + y_pred = np.argmax(y_pred, axis=1) + metric.add_batch(y_pred, y_true) + + self.log(step_type + "/loss", loss.detach(), on_epoch=True, sync_dist=True) + + return loss + + def training_step(self, batch, batch_idx): + return self.step(batch, "train", self.train_metrics) + + def validation_step(self, batch, batch_idx): + self.step(batch, "val", self.val_metrics) + + def validation_epoch_end(self, outputs): + metric_list = [("train", self.train_metrics), ("val", self.val_metrics)] + + for prefix, metric in metric_list: + stats = metric.get_stats() + for i, class_name in enumerate(self.class_names): + self.log( + "{}_SemIoU/{}".format(prefix, class_name), + stats["iou_ssc"][i], + sync_dist=True, + ) + self.log("{}/mIoU".format(prefix), stats["iou_ssc_mean"], sync_dist=True) + self.log("{}/IoU".format(prefix), stats["iou"], sync_dist=True) + self.log("{}/Precision".format(prefix), stats["precision"], sync_dist=True) + self.log("{}/Recall".format(prefix), stats["recall"], sync_dist=True) + metric.reset() + + def test_step(self, batch, batch_idx): + self.step(batch, "test", self.test_metrics) + + def test_epoch_end(self, outputs): + classes = self.class_names + metric_list = [("test", self.test_metrics)] + for prefix, metric in metric_list: + print("{}======".format(prefix)) + 
stats = metric.get_stats() + print( + "Precision={:.4f}, Recall={:.4f}, IoU={:.4f}".format( + stats["precision"] * 100, stats["recall"] * 100, stats["iou"] * 100 + ) + ) + print("class IoU: {}, ".format(classes)) + print( + " ".join(["{:.4f}, "] * len(classes)).format( + *(stats["iou_ssc"] * 100).tolist() + ) + ) + print("mIoU={:.4f}".format(stats["iou_ssc_mean"] * 100)) + metric.reset() + + def configure_optimizers(self): + if self.dataset == "NYU": + optimizer = torch.optim.AdamW( + self.parameters(), lr=self.lr, weight_decay=self.weight_decay + ) + scheduler = MultiStepLR(optimizer, milestones=[20], gamma=0.1) + return [optimizer], [scheduler] + elif self.dataset == "kitti": + optimizer = torch.optim.AdamW( + self.parameters(), lr=self.lr, weight_decay=self.weight_decay + ) + scheduler = MultiStepLR(optimizer, milestones=[20], gamma=0.1) + return [optimizer], [scheduler] diff --git a/monoscene/models/unet2d.py b/monoscene/models/unet2d.py new file mode 100644 index 0000000..34cd7c8 --- /dev/null +++ b/monoscene/models/unet2d.py @@ -0,0 +1,194 @@ +""" +Code adapted from https://github.com/shariqfarooq123/AdaBins/blob/main/models/unet_adaptive_bins.py +""" +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class UpSampleBN(nn.Module): + def __init__(self, skip_input, output_features): + super(UpSampleBN, self).__init__() + self._net = nn.Sequential( + nn.Conv2d(skip_input, output_features, kernel_size=3, stride=1, padding=1), + nn.BatchNorm2d(output_features), + nn.LeakyReLU(), + nn.Conv2d( + output_features, output_features, kernel_size=3, stride=1, padding=1 + ), + nn.BatchNorm2d(output_features), + nn.LeakyReLU(), + ) + + def forward(self, x, concat_with): + up_x = F.interpolate( + x, + size=(concat_with.shape[2], concat_with.shape[3]), + mode="bilinear", + align_corners=True, + ) + f = torch.cat([up_x, concat_with], dim=1) + return self._net(f) + + +class DecoderBN(nn.Module): + def __init__( + self, num_features, bottleneck_features, out_feature, use_decoder=True + ): + super(DecoderBN, self).__init__() + features = int(num_features) + self.use_decoder = use_decoder + + self.conv2 = nn.Conv2d( + bottleneck_features, features, kernel_size=1, stride=1, padding=1 + ) + + self.out_feature_1_1 = out_feature + self.out_feature_1_2 = out_feature + self.out_feature_1_4 = out_feature + self.out_feature_1_8 = out_feature + self.out_feature_1_16 = out_feature + self.feature_1_16 = features // 2 + self.feature_1_8 = features // 4 + self.feature_1_4 = features // 8 + self.feature_1_2 = features // 16 + self.feature_1_1 = features // 32 + + if self.use_decoder: + self.resize_output_1_1 = nn.Conv2d( + self.feature_1_1, self.out_feature_1_1, kernel_size=1 + ) + self.resize_output_1_2 = nn.Conv2d( + self.feature_1_2, self.out_feature_1_2, kernel_size=1 + ) + self.resize_output_1_4 = nn.Conv2d( + self.feature_1_4, self.out_feature_1_4, kernel_size=1 + ) + self.resize_output_1_8 = nn.Conv2d( + self.feature_1_8, self.out_feature_1_8, kernel_size=1 + ) + self.resize_output_1_16 = nn.Conv2d( + self.feature_1_16, self.out_feature_1_16, kernel_size=1 + ) + + self.up16 = UpSampleBN( + skip_input=features + 224, output_features=self.feature_1_16 + ) + self.up8 = UpSampleBN( + skip_input=self.feature_1_16 + 80, output_features=self.feature_1_8 + ) + self.up4 = UpSampleBN( + skip_input=self.feature_1_8 + 48, output_features=self.feature_1_4 + ) + self.up2 = UpSampleBN( + skip_input=self.feature_1_4 + 32, output_features=self.feature_1_2 + ) + self.up1 = UpSampleBN( + 
skip_input=self.feature_1_2 + 3, output_features=self.feature_1_1 + ) + else: + self.resize_output_1_1 = nn.Conv2d(3, out_feature, kernel_size=1) + self.resize_output_1_2 = nn.Conv2d(32, out_feature * 2, kernel_size=1) + self.resize_output_1_4 = nn.Conv2d(48, out_feature * 4, kernel_size=1) + + def forward(self, features): + x_block0, x_block1, x_block2, x_block3, x_block4 = ( + features[4], + features[5], + features[6], + features[8], + features[11], + ) + bs = x_block0.shape[0] + x_d0 = self.conv2(x_block4) + + if self.use_decoder: + x_1_16 = self.up16(x_d0, x_block3) + x_1_8 = self.up8(x_1_16, x_block2) + x_1_4 = self.up4(x_1_8, x_block1) + x_1_2 = self.up2(x_1_4, x_block0) + x_1_1 = self.up1(x_1_2, features[0]) + return { + "1_1": self.resize_output_1_1(x_1_1), + "1_2": self.resize_output_1_2(x_1_2), + "1_4": self.resize_output_1_4(x_1_4), + "1_8": self.resize_output_1_8(x_1_8), + "1_16": self.resize_output_1_16(x_1_16), + } + else: + x_1_1 = features[0] + x_1_2, x_1_4, x_1_8, x_1_16 = ( + features[4], + features[5], + features[6], + features[8], + ) + x_global = features[-1].reshape(bs, 2560, -1).mean(2) + return { + "1_1": self.resize_output_1_1(x_1_1), + "1_2": self.resize_output_1_2(x_1_2), + "1_4": self.resize_output_1_4(x_1_4), + "global": x_global, + } + + +class Encoder(nn.Module): + def __init__(self, backend): + super(Encoder, self).__init__() + self.original_model = backend + + def forward(self, x): + features = [x] + for k, v in self.original_model._modules.items(): + if k == "blocks": + for ki, vi in v._modules.items(): + features.append(vi(features[-1])) + else: + features.append(v(features[-1])) + return features + + +class UNet2D(nn.Module): + def __init__(self, backend, num_features, out_feature, use_decoder=True): + super(UNet2D, self).__init__() + self.use_decoder = use_decoder + self.encoder = Encoder(backend) + self.decoder = DecoderBN( + out_feature=out_feature, + use_decoder=use_decoder, + bottleneck_features=num_features, + num_features=num_features, + ) + + def forward(self, x, **kwargs): + encoded_feats = self.encoder(x) + unet_out = self.decoder(encoded_feats, **kwargs) + return unet_out + + def get_encoder_params(self): # lr/10 learning rate + return self.encoder.parameters() + + def get_decoder_params(self): # lr learning rate + return self.decoder.parameters() + + @classmethod + def build(cls, **kwargs): + basemodel_name = "tf_efficientnet_b7_ns" + num_features = 2560 + + print("Loading base model ()...".format(basemodel_name), end="") + basemodel = torch.hub.load( + "rwightman/gen-efficientnet-pytorch", basemodel_name, pretrained=True + ) + print("Done.") + + # Remove last layer + print("Removing last two layers (global_pool & classifier).") + basemodel.global_pool = nn.Identity() + basemodel.classifier = nn.Identity() + + # Building Encoder-Decoder model + print("Building Encoder-Decoder model..", end="") + m = cls(basemodel, num_features=num_features, **kwargs) + print("Done.") + return m diff --git a/monoscene/models/unet3d_kitti.py b/monoscene/models/unet3d_kitti.py new file mode 100644 index 0000000..a923cbb --- /dev/null +++ b/monoscene/models/unet3d_kitti.py @@ -0,0 +1,88 @@ +# encoding: utf-8 +import torch +import torch.nn as nn +import torch.nn.functional as F +from monoscene.models.modules import SegmentationHead +from monoscene.models.CRP3D import CPMegaVoxels +from monoscene.models.modules import Process, Upsample, Downsample + + +class UNet3D(nn.Module): + def __init__( + self, + class_num, + norm_layer, + full_scene_size, + feature, + 
project_scale, + context_prior=None, + bn_momentum=0.1, + ): + super(UNet3D, self).__init__() + self.business_layer = [] + self.project_scale = project_scale + self.full_scene_size = full_scene_size + self.feature = feature + + size_l1 = ( + int(self.full_scene_size[0] / project_scale), + int(self.full_scene_size[1] / project_scale), + int(self.full_scene_size[2] / project_scale), + ) + size_l2 = (size_l1[0] // 2, size_l1[1] // 2, size_l1[2] // 2) + size_l3 = (size_l2[0] // 2, size_l2[1] // 2, size_l2[2] // 2) + + dilations = [1, 2, 3] + self.process_l1 = nn.Sequential( + Process(self.feature, norm_layer, bn_momentum, dilations=[1, 2, 3]), + Downsample(self.feature, norm_layer, bn_momentum), + ) + self.process_l2 = nn.Sequential( + Process(self.feature * 2, norm_layer, bn_momentum, dilations=[1, 2, 3]), + Downsample(self.feature * 2, norm_layer, bn_momentum), + ) + + self.up_13_l2 = Upsample( + self.feature * 4, self.feature * 2, norm_layer, bn_momentum + ) + self.up_12_l1 = Upsample( + self.feature * 2, self.feature, norm_layer, bn_momentum + ) + self.up_l1_lfull = Upsample( + self.feature, self.feature // 2, norm_layer, bn_momentum + ) + + self.ssc_head = SegmentationHead( + self.feature // 2, self.feature // 2, class_num, dilations + ) + + self.context_prior = context_prior + if context_prior: + self.CP_mega_voxels = CPMegaVoxels( + self.feature * 4, self.feature * 4, size_l3, bn_momentum=bn_momentum + ) + + def forward(self, input_dict): + res = {} + + x3d_l1 = input_dict["x3d"] + + x3d_l2 = self.process_l1(x3d_l1) + + x3d_l3 = self.process_l2(x3d_l2) + + if self.context_prior: + ret = self.CP_mega_voxels(x3d_l3) + x3d_l3 = ret["x"] + for k in ret.keys(): + res[k] = ret[k] + + x3d_up_l2 = self.up_13_l2(x3d_l3) + x3d_l2 + x3d_up_l1 = self.up_12_l1(x3d_up_l2) + x3d_l1 + x3d_up_lfull = self.up_l1_lfull(x3d_up_l1) + + ssc_logit_full = self.ssc_head(x3d_up_lfull) + + res["ssc_logit"] = ssc_logit_full + + return res diff --git a/monoscene/models/unet3d_nyu.py b/monoscene/models/unet3d_nyu.py new file mode 100644 index 0000000..a052f2f --- /dev/null +++ b/monoscene/models/unet3d_nyu.py @@ -0,0 +1,91 @@ +# encoding: utf-8 +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np +from monoscene.models.CRP3D import CPMegaVoxels +from monoscene.models.modules import ( + Process, + Upsample, + Downsample, + SegmentationHead, + ASPP, +) + + +class UNet3D(nn.Module): + def __init__( + self, + class_num, + norm_layer, + feature, + full_scene_size, + n_relations=4, + project_res=[], + context_prior=True, + bn_momentum=0.1, + ): + super(UNet3D, self).__init__() + self.business_layer = [] + self.project_res = project_res + + self.feature_1_4 = feature + self.feature_1_8 = feature * 2 + self.feature_1_16 = feature * 4 + + self.feature_1_16_dec = self.feature_1_16 + self.feature_1_8_dec = self.feature_1_8 + self.feature_1_4_dec = self.feature_1_4 + + self.process_1_4 = nn.Sequential( + Process(self.feature_1_4, norm_layer, bn_momentum, dilations=[1, 2, 3]), + Downsample(self.feature_1_4, norm_layer, bn_momentum), + ) + self.process_1_8 = nn.Sequential( + Process(self.feature_1_8, norm_layer, bn_momentum, dilations=[1, 2, 3]), + Downsample(self.feature_1_8, norm_layer, bn_momentum), + ) + self.up_1_16_1_8 = Upsample( + self.feature_1_16_dec, self.feature_1_8_dec, norm_layer, bn_momentum + ) + self.up_1_8_1_4 = Upsample( + self.feature_1_8_dec, self.feature_1_4_dec, norm_layer, bn_momentum + ) + self.ssc_head_1_4 = SegmentationHead( + self.feature_1_4_dec, 
self.feature_1_4_dec, class_num, [1, 2, 3] + ) + + self.context_prior = context_prior + size_1_16 = tuple(np.ceil(i / 4).astype(int) for i in full_scene_size) + + if context_prior: + self.CP_mega_voxels = CPMegaVoxels( + self.feature_1_16, + self.feature_1_16, + size_1_16, + n_relations=n_relations, + bn_momentum=bn_momentum, + ) + + # + def forward(self, input_dict): + res = {} + + x3d_1_4 = input_dict["x3d"] + x3d_1_8 = self.process_1_4(x3d_1_4) + x3d_1_16 = self.process_1_8(x3d_1_8) + + if self.context_prior: + ret = self.CP_mega_voxels(x3d_1_16) + x3d_1_16 = ret["x"] + for k in ret.keys(): + res[k] = ret[k] + + x3d_up_1_8 = self.up_1_16_1_8(x3d_1_16) + x3d_1_8 + x3d_up_1_4 = self.up_1_8_1_4(x3d_up_1_8) + x3d_1_4 + + ssc_logit_1_4 = self.ssc_head_1_4(x3d_up_1_4) + + res["ssc_logit"] = ssc_logit_1_4 + + return res diff --git a/monoscene/scripts/eval_monoscene.py b/monoscene/scripts/eval_monoscene.py new file mode 100644 index 0000000..155b4e0 --- /dev/null +++ b/monoscene/scripts/eval_monoscene.py @@ -0,0 +1,71 @@ +from pytorch_lightning import Trainer +from monoscene.models.monoscene import MonoScene +from monoscene.data.NYU.nyu_dm import NYUDataModule +from monoscene.data.semantic_kitti.kitti_dm import KittiDataModule +import hydra +from omegaconf import DictConfig +import torch +import os +from hydra.utils import get_original_cwd + + +@hydra.main(config_name="../config/monoscene.yaml") +def main(config: DictConfig): + torch.set_grad_enabled(False) + if config.dataset == "kitti": + config.batch_size = 1 + n_classes = 20 + feature = 64 + project_scale = 2 + full_scene_size = (256, 256, 32) + data_module = KittiDataModule( + root=config.kitti_root, + preprocess_root=config.kitti_preprocess_root, + frustum_size=config.frustum_size, + batch_size=int(config.batch_size / config.n_gpus), + num_workers=int(config.num_workers_per_gpu * config.n_gpus), + ) + + elif config.dataset == "NYU": + config.batch_size = 2 + project_scale = 1 + n_classes = 12 + feature = 200 + full_scene_size = (60, 36, 60) + data_module = NYUDataModule( + root=config.NYU_root, + preprocess_root=config.NYU_preprocess_root, + n_relations=config.n_relations, + frustum_size=config.frustum_size, + batch_size=int(config.batch_size / config.n_gpus), + num_workers=int(config.num_workers_per_gpu * config.n_gpus), + ) + + trainer = Trainer( + sync_batchnorm=True, deterministic=True, gpus=config.n_gpus, accelerator="ddp" + ) + + if config.dataset == "NYU": + model_path = os.path.join( + get_original_cwd(), "trained_models", "monoscene_nyu.ckpt" + ) + else: + model_path = os.path.join( + get_original_cwd(), "trained_models", "monoscene_kitti.ckpt" + ) + + model = MonoScene.load_from_checkpoint( + model_path, + feature=feature, + project_scale=project_scale, + fp_loss=config.fp_loss, + full_scene_size=full_scene_size, + ) + model.eval() + data_module.setup() + val_dataloader = data_module.val_dataloader() + trainer.test(model, test_dataloaders=val_dataloader) + + +if __name__ == "__main__": + main() diff --git a/monoscene/scripts/generate_output.py b/monoscene/scripts/generate_output.py new file mode 100644 index 0000000..892f8cd --- /dev/null +++ b/monoscene/scripts/generate_output.py @@ -0,0 +1,127 @@ +from pytorch_lightning import Trainer +from monoscene.models.monoscene import MonoScene +from monoscene.data.NYU.nyu_dm import NYUDataModule +from monoscene.data.semantic_kitti.kitti_dm import KittiDataModule +from monoscene.data.kitti_360.kitti_360_dm import Kitti360DataModule +import hydra +from omegaconf import DictConfig +import 
torch +import numpy as np +import os +from hydra.utils import get_original_cwd +from tqdm import tqdm +import pickle + + +@hydra.main(config_name="../config/monoscene.yaml") +def main(config: DictConfig): + torch.set_grad_enabled(False) + + # Setup dataloader + if config.dataset == "kitti" or config.dataset == "kitti_360": + feature = 64 + project_scale = 2 + full_scene_size = (256, 256, 32) + + if config.dataset == "kitti": + data_module = KittiDataModule( + root=config.kitti_root, + preprocess_root=config.kitti_preprocess_root, + frustum_size=config.frustum_size, + batch_size=int(config.batch_size / config.n_gpus), + num_workers=int(config.num_workers_per_gpu * config.n_gpus), + ) + data_module.setup() + data_loader = data_module.val_dataloader() + # data_loader = data_module.test_dataloader() # use this if you want to infer on test set + else: + data_module = Kitti360DataModule( + root=config.kitti_360_root, + sequences=[config.kitti_360_sequence], + n_scans=2000, + batch_size=1, + num_workers=3, + ) + data_module.setup() + data_loader = data_module.dataloader() + + elif config.dataset == "NYU": + project_scale = 1 + feature = 200 + full_scene_size = (60, 36, 60) + data_module = NYUDataModule( + root=config.NYU_root, + preprocess_root=config.NYU_preprocess_root, + n_relations=config.n_relations, + frustum_size=config.frustum_size, + batch_size=int(config.batch_size / config.n_gpus), + num_workers=int(config.num_workers_per_gpu * config.n_gpus), + ) + data_module.setup() + data_loader = data_module.val_dataloader() + # data_loader = data_module.test_dataloader() # use this if you want to infer on test set + else: + print("dataset not support") + + # Load pretrained models + if config.dataset == "NYU": + model_path = os.path.join( + get_original_cwd(), "trained_models", "monoscene_nyu.ckpt" + ) + else: + model_path = os.path.join( + get_original_cwd(), "trained_models", "monoscene_kitti.ckpt" + ) + + model = MonoScene.load_from_checkpoint( + model_path, + feature=feature, + project_scale=project_scale, + fp_loss=config.fp_loss, + full_scene_size=full_scene_size, + ) + model.cuda() + model.eval() + + # Save prediction and additional data + # to draw the viewing frustum and remove scene outside the room for NYUv2 + output_path = os.path.join(config.output_path, config.dataset) + with torch.no_grad(): + for batch in tqdm(data_loader): + batch["img"] = batch["img"].cuda() + pred = model(batch) + y_pred = torch.softmax(pred["ssc_logit"], dim=1).detach().cpu().numpy() + y_pred = np.argmax(y_pred, axis=1) + for i in range(config.batch_size): + out_dict = {"y_pred": y_pred[i].astype(np.uint16)} + if "target" in batch: + out_dict["target"] = ( + batch["target"][i].detach().cpu().numpy().astype(np.uint16) + ) + + if config.dataset == "NYU": + write_path = output_path + filepath = os.path.join(write_path, batch["name"][i] + ".pkl") + out_dict["cam_pose"] = batch["cam_pose"][i].detach().cpu().numpy() + out_dict["vox_origin"] = ( + batch["vox_origin"][i].detach().cpu().numpy() + ) + else: + write_path = os.path.join(output_path, batch["sequence"][i]) + filepath = os.path.join(write_path, batch["frame_id"][i] + ".pkl") + out_dict["fov_mask_1"] = ( + batch["fov_mask_1"][i].detach().cpu().numpy() + ) + out_dict["cam_k"] = batch["cam_k"][i].detach().cpu().numpy() + out_dict["T_velo_2_cam"] = ( + batch["T_velo_2_cam"][i].detach().cpu().numpy() + ) + + os.makedirs(write_path, exist_ok=True) + with open(filepath, "wb") as handle: + pickle.dump(out_dict, handle) + print("wrote to", filepath) + + +if __name__ 
== "__main__": + main() diff --git a/monoscene/scripts/train_monoscene.py b/monoscene/scripts/train_monoscene.py new file mode 100644 index 0000000..8c58d6a --- /dev/null +++ b/monoscene/scripts/train_monoscene.py @@ -0,0 +1,173 @@ +from monoscene.data.semantic_kitti.kitti_dm import KittiDataModule +from monoscene.data.semantic_kitti.params import ( + semantic_kitti_class_frequencies, + kitti_class_names, +) +from monoscene.data.NYU.params import ( + class_weights as NYU_class_weights, + NYU_class_names, +) +from monoscene.data.NYU.nyu_dm import NYUDataModule +from torch.utils.data.dataloader import DataLoader +from monoscene.models.monoscene import MonoScene +from pytorch_lightning import Trainer +from pytorch_lightning.loggers import TensorBoardLogger +from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor +import os +import hydra +from omegaconf import DictConfig +import numpy as np +import torch + +hydra.output_subdir = None + + +@hydra.main(config_name="../config/monoscene.yaml") +def main(config: DictConfig): + exp_name = config.exp_prefix + exp_name += "_{}_{}".format(config.dataset, config.run) + exp_name += "_FrusSize_{}".format(config.frustum_size) + exp_name += "_nRelations{}".format(config.n_relations) + exp_name += "_WD{}_lr{}".format(config.weight_decay, config.lr) + + if config.CE_ssc_loss: + exp_name += "_CEssc" + if config.geo_scal_loss: + exp_name += "_geoScalLoss" + if config.sem_scal_loss: + exp_name += "_semScalLoss" + if config.fp_loss: + exp_name += "_fpLoss" + + if config.relation_loss: + exp_name += "_CERel" + if config.context_prior: + exp_name += "_3DCRP" + + # Setup dataloaders + if config.dataset == "kitti": + class_names = kitti_class_names + max_epochs = 30 + logdir = config.kitti_logdir + full_scene_size = (256, 256, 32) + project_scale = 2 + feature = 64 + n_classes = 20 + class_weights = torch.from_numpy( + 1 / np.log(semantic_kitti_class_frequencies + 0.001) + ) + data_module = KittiDataModule( + root=config.kitti_root, + preprocess_root=config.kitti_preprocess_root, + frustum_size=config.frustum_size, + project_scale=project_scale, + batch_size=int(config.batch_size / config.n_gpus), + num_workers=int(config.num_workers_per_gpu * config.n_gpus), + ) + + elif config.dataset == "NYU": + class_names = NYU_class_names + max_epochs = 30 + logdir = config.logdir + full_scene_size = (60, 36, 60) + project_scale = 1 + feature = 200 + n_classes = 12 + class_weights = NYU_class_weights + data_module = NYUDataModule( + root=config.NYU_root, + preprocess_root=config.NYU_preprocess_root, + n_relations=config.n_relations, + frustum_size=config.frustum_size, + batch_size=int(config.batch_size / config.n_gpus), + num_workers=int(config.num_workers_per_gpu * config.n_gpus), + ) + + project_res = ["1"] + if config.project_1_2: + exp_name += "_Proj_2" + project_res.append("2") + if config.project_1_4: + exp_name += "_4" + project_res.append("4") + if config.project_1_8: + exp_name += "_8" + project_res.append("8") + + print(exp_name) + + # Initialize MonoScene model + model = MonoScene( + dataset=config.dataset, + frustum_size=config.frustum_size, + project_scale=project_scale, + n_relations=config.n_relations, + fp_loss=config.fp_loss, + feature=feature, + full_scene_size=full_scene_size, + project_res=project_res, + n_classes=n_classes, + class_names=class_names, + context_prior=config.context_prior, + relation_loss=config.relation_loss, + CE_ssc_loss=config.CE_ssc_loss, + sem_scal_loss=config.sem_scal_loss, + geo_scal_loss=config.geo_scal_loss, + 
lr=config.lr, + weight_decay=config.weight_decay, + class_weights=class_weights, + ) + + if config.enable_log: + logger = TensorBoardLogger(save_dir=logdir, name=exp_name, version="") + lr_monitor = LearningRateMonitor(logging_interval="step") + checkpoint_callbacks = [ + ModelCheckpoint( + save_last=True, + monitor="val/mIoU", + save_top_k=1, + mode="max", + filename="{epoch:03d}-{val/mIoU:.5f}", + ), + lr_monitor, + ] + else: + logger = False + checkpoint_callbacks = False + + model_path = os.path.join(logdir, exp_name, "checkpoints/last.ckpt") + if os.path.isfile(model_path): + # Continue training from last.ckpt + trainer = Trainer( + callbacks=checkpoint_callbacks, + resume_from_checkpoint=model_path, + sync_batchnorm=True, + deterministic=True, + max_epochs=max_epochs, + gpus=config.n_gpus, + logger=logger, + check_val_every_n_epoch=1, + log_every_n_steps=10, + flush_logs_every_n_steps=100, + accelerator="ddp", + ) + else: + # Train from scratch + trainer = Trainer( + callbacks=checkpoint_callbacks, + sync_batchnorm=True, + deterministic=True, + max_epochs=max_epochs, + gpus=config.n_gpus, + logger=logger, + check_val_every_n_epoch=1, + log_every_n_steps=10, + flush_logs_every_n_steps=100, + accelerator="ddp", + ) + + trainer.fit(model, data_module) + + +if __name__ == "__main__": + main() diff --git a/monoscene/scripts/visualization/NYU_vis_pred.py b/monoscene/scripts/visualization/NYU_vis_pred.py new file mode 100644 index 0000000..d451983 --- /dev/null +++ b/monoscene/scripts/visualization/NYU_vis_pred.py @@ -0,0 +1,156 @@ +import pickle +import os +from omegaconf import DictConfig +import numpy as np +import hydra +from mayavi import mlab + + +def get_grid_coords(dims, resolution): + """ + :param dims: the dimensions of the grid [x, y, z] (i.e. [256, 256, 32]) + :return coords_grid: is the center coords of voxels in the grid + """ + + g_xx = np.arange(0, dims[0] + 1) + g_yy = np.arange(0, dims[1] + 1) + + g_zz = np.arange(0, dims[2] + 1) + + # Obtaining the grid with coords... 
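+    # One integer (x, y, z) index per voxel (the [:-1] slices drop the extra upper bound);
+    # scaling by `resolution` and adding half a voxel converts indices to voxel centres in
+    # metres, and the x/y columns are then swapped before returning.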
+ xx, yy, zz = np.meshgrid(g_xx[:-1], g_yy[:-1], g_zz[:-1]) + coords_grid = np.array([xx.flatten(), yy.flatten(), zz.flatten()]).T + coords_grid = coords_grid.astype(np.float) + + coords_grid = (coords_grid * resolution) + resolution / 2 + + temp = np.copy(coords_grid) + temp[:, 0] = coords_grid[:, 1] + temp[:, 1] = coords_grid[:, 0] + coords_grid = np.copy(temp) + + return coords_grid + + +def draw( + voxels, + cam_pose, + vox_origin, + voxel_size=0.08, + d=0.75, # 0.75m - determine the size of the mesh representing the camera +): + # Compute the coordinates of the mesh representing camera + y = d * 480 / (2 * 518.8579) + x = d * 640 / (2 * 518.8579) + tri_points = np.array( + [ + [0, 0, 0], + [x, y, d], + [-x, y, d], + [-x, -y, d], + [x, -y, d], + ] + ) + tri_points = np.hstack([tri_points, np.ones((5, 1))]) + + tri_points = (cam_pose @ tri_points.T).T + x = tri_points[:, 0] - vox_origin[0] + y = tri_points[:, 1] - vox_origin[1] + z = tri_points[:, 2] - vox_origin[2] + triangles = [ + (0, 1, 2), + (0, 1, 4), + (0, 3, 4), + (0, 2, 3), + ] + + # Compute the voxels coordinates + grid_coords = get_grid_coords( + [voxels.shape[0], voxels.shape[2], voxels.shape[1]], voxel_size + ) + + # Attach the predicted class to every voxel + grid_coords = np.vstack( + (grid_coords.T, np.moveaxis(voxels, [0, 1, 2], [0, 2, 1]).reshape(-1)) + ).T + + # Remove empty and unknown voxels + occupied_voxels = grid_coords[(grid_coords[:, 3] > 0) & (grid_coords[:, 3] < 255)] + figure = mlab.figure(size=(1600, 900), bgcolor=(1, 1, 1)) + + # Draw the camera + mlab.triangular_mesh( + x, + y, + z, + triangles, + representation="wireframe", + color=(0, 0, 0), + line_width=5, + ) + + # Draw occupied voxels + plt_plot = mlab.points3d( + occupied_voxels[:, 0], + occupied_voxels[:, 1], + occupied_voxels[:, 2], + occupied_voxels[:, 3], + colormap="viridis", + scale_factor=voxel_size - 0.1 * voxel_size, + mode="cube", + opacity=1.0, + vmin=0, + vmax=12, + ) + + colors = np.array( + [ + [22, 191, 206, 255], + [214, 38, 40, 255], + [43, 160, 43, 255], + [158, 216, 229, 255], + [114, 158, 206, 255], + [204, 204, 91, 255], + [255, 186, 119, 255], + [147, 102, 188, 255], + [30, 119, 181, 255], + [188, 188, 33, 255], + [255, 127, 12, 255], + [196, 175, 214, 255], + [153, 153, 153, 255], + ] + ) + + plt_plot.glyph.scale_mode = "scale_by_vector" + + plt_plot.module_manager.scalar_lut_manager.lut.table = colors + + mlab.show() + + +@hydra.main(config_path=None) +def main(config: DictConfig): + scan = config.file + + with open(scan, "rb") as handle: + b = pickle.load(handle) + + cam_pose = b["cam_pose"] + vox_origin = b["vox_origin"] + gt_scene = b["target"] + pred_scene = b["y_pred"] + scan = os.path.basename(scan)[:12] + + pred_scene[(gt_scene == 255)] = 255 # only draw scene inside the room + + draw( + pred_scene, + cam_pose, + vox_origin, + voxel_size=0.08, + d=0.75, + ) + + +if __name__ == "__main__": + main() diff --git a/monoscene/scripts/visualization/kitti_vis_pred.py b/monoscene/scripts/visualization/kitti_vis_pred.py new file mode 100644 index 0000000..0a34eb5 --- /dev/null +++ b/monoscene/scripts/visualization/kitti_vis_pred.py @@ -0,0 +1,201 @@ +# from operator import gt +import pickle +import numpy as np +from omegaconf import DictConfig +import hydra +from mayavi import mlab + + +def get_grid_coords(dims, resolution): + """ + :param dims: the dimensions of the grid [x, y, z] (i.e. 
[256, 256, 32]) + :return coords_grid: is the center coords of voxels in the grid + """ + + g_xx = np.arange(0, dims[0] + 1) + g_yy = np.arange(0, dims[1] + 1) + sensor_pose = 10 + g_zz = np.arange(0, dims[2] + 1) + + # Obtaining the grid with coords... + xx, yy, zz = np.meshgrid(g_xx[:-1], g_yy[:-1], g_zz[:-1]) + coords_grid = np.array([xx.flatten(), yy.flatten(), zz.flatten()]).T + coords_grid = coords_grid.astype(np.float) + + coords_grid = (coords_grid * resolution) + resolution / 2 + + temp = np.copy(coords_grid) + temp[:, 0] = coords_grid[:, 1] + temp[:, 1] = coords_grid[:, 0] + coords_grid = np.copy(temp) + + return coords_grid, g_xx, g_yy, g_zz + + +def draw( + voxels, + T_velo_2_cam, + vox_origin, + fov_mask, + img_size, + f, + voxel_size=0.2, + d=7, # 7m - determine the size of the mesh representing the camera +): + # Compute the coordinates of the mesh representing camera + x = d * img_size[0] / (2 * f) + y = d * img_size[1] / (2 * f) + tri_points = np.array( + [ + [0, 0, 0], + [x, y, d], + [-x, y, d], + [-x, -y, d], + [x, -y, d], + ] + ) + tri_points = np.hstack([tri_points, np.ones((5, 1))]) + tri_points = (np.linalg.inv(T_velo_2_cam) @ tri_points.T).T + x = tri_points[:, 0] - vox_origin[0] + y = tri_points[:, 1] - vox_origin[1] + z = tri_points[:, 2] - vox_origin[2] + triangles = [ + (0, 1, 2), + (0, 1, 4), + (0, 3, 4), + (0, 2, 3), + ] + + # Compute the voxels coordinates + grid_coords = get_grid_coords( + [voxels.shape[0], voxels.shape[1], voxels.shape[2]], voxel_size + ) + + # Attach the predicted class to every voxel + grid_coords = np.vstack([grid_coords.T, voxels.reshape(-1)]).T + + # Get the voxels inside FOV + fov_grid_coords = grid_coords[fov_mask, :] + + # Get the voxels outside FOV + outfov_grid_coords = grid_coords[~fov_mask, :] + + # Remove empty and unknown voxels + fov_voxels = fov_grid_coords[ + (fov_grid_coords[:, 3] > 0) & (fov_grid_coords[:, 3] < 255) + ] + outfov_voxels = outfov_grid_coords[ + (outfov_grid_coords[:, 3] > 0) & (outfov_grid_coords[:, 3] < 255) + ] + + figure = mlab.figure(size=(1400, 1400), bgcolor=(1, 1, 1)) + + # Draw the camera + mlab.triangular_mesh( + x, y, z, triangles, representation="wireframe", color=(0, 0, 0), line_width=5 + ) + + # Draw occupied inside FOV voxels + plt_plot_fov = mlab.points3d( + fov_voxels[:, 0], + fov_voxels[:, 1], + fov_voxels[:, 2], + fov_voxels[:, 3], + colormap="viridis", + scale_factor=voxel_size - 0.05 * voxel_size, + mode="cube", + opacity=1.0, + vmin=1, + vmax=19, + ) + + # Draw occupied outside FOV voxels + plt_plot_outfov = mlab.points3d( + outfov_voxels[:, 0], + outfov_voxels[:, 1], + outfov_voxels[:, 2], + outfov_voxels[:, 3], + colormap="viridis", + scale_factor=voxel_size - 0.05 * voxel_size, + mode="cube", + opacity=1.0, + vmin=1, + vmax=19, + ) + + colors = np.array( + [ + [100, 150, 245, 255], + [100, 230, 245, 255], + [30, 60, 150, 255], + [80, 30, 180, 255], + [100, 80, 250, 255], + [255, 30, 30, 255], + [255, 40, 200, 255], + [150, 30, 90, 255], + [255, 0, 255, 255], + [255, 150, 255, 255], + [75, 0, 75, 255], + [175, 0, 75, 255], + [255, 200, 0, 255], + [255, 120, 50, 255], + [0, 175, 0, 255], + [135, 60, 0, 255], + [150, 240, 80, 255], + [255, 240, 150, 255], + [255, 0, 0, 255], + ] + ).astype(np.uint8) + + plt_plot_fov.glyph.scale_mode = "scale_by_vector" + plt_plot_outfov.glyph.scale_mode = "scale_by_vector" + + plt_plot_fov.module_manager.scalar_lut_manager.lut.table = colors + + outfov_colors = colors + outfov_colors[:, :3] = outfov_colors[:, :3] // 3 * 2 + 
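+    # Out-of-FOV voxels reuse the same palette dimmed to two thirds brightness, so they
+    # remain visible while staying clearly distinct from the in-FOV voxels.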
plt_plot_outfov.module_manager.scalar_lut_manager.lut.table = outfov_colors + + mlab.show() + + +@hydra.main(config_path=None) +def main(config: DictConfig): + scan = config.file + with open(scan, "rb") as handle: + b = pickle.load(handle) + + fov_mask_1 = b["fov_mask_1"] + T_velo_2_cam = b["T_velo_2_cam"] + vox_origin = np.array([0, -25.6, -2]) + + y_pred = b["y_pred"] + + if config.dataset == "kitti_360": + # Visualize KITTI-360 + draw( + y_pred, + T_velo_2_cam, + vox_origin, + fov_mask_1, + voxel_size=0.2, + f=552.55426, + img_size=(1408, 376), + d=7, + ) + else: + # Visualize Semantic KITTI + draw( + y_pred, + T_velo_2_cam, + vox_origin, + fov_mask_1, + img_size=(1220, 370), + f=707.0912, + voxel_size=0.2, + d=7, + ) + + +if __name__ == "__main__": + main() diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..8cfd27f --- /dev/null +++ b/requirements.txt @@ -0,0 +1,9 @@ +scikit-image==0.18.1 +PyYAML==5.3.1 +tqdm==4.49.0 +scikit-learn==0.24.0 +pytorch-lightning==1.4.0rc0 +opencv-python==4.5.1.48 +hydra-core==1.0.5 +numpy==1.20.3 +numba==0.54.1 diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..fb0f33c --- /dev/null +++ b/setup.py @@ -0,0 +1,6 @@ +from setuptools import setup +from setuptools import find_packages + +# for install, do: pip install -ve . + +setup(name='monoscene', packages=find_packages()) diff --git a/teaser/KITTI-360.gif b/teaser/KITTI-360.gif new file mode 100644 index 0000000..6e5f8ec Binary files /dev/null and b/teaser/KITTI-360.gif differ diff --git a/teaser/NYUv2.gif b/teaser/NYUv2.gif new file mode 100644 index 0000000..c460201 Binary files /dev/null and b/teaser/NYUv2.gif differ diff --git a/teaser/SemKITTI.gif b/teaser/SemKITTI.gif new file mode 100644 index 0000000..4501404 Binary files /dev/null and b/teaser/SemKITTI.gif differ diff --git a/trained_models/.placeholder b/trained_models/.placeholder new file mode 100644 index 0000000..e69de29
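For orientation, below is a minimal inference sketch showing how the modules added in this diff fit together; it mirrors the "kitti" branch of monoscene/scripts/generate_output.py. The checkpoint path follows the trained_models/ convention used by the scripts above, while the dataset roots, frustum_size and batch_size are placeholder assumptions to be replaced with real values.

import torch
from monoscene.models.monoscene import MonoScene
from monoscene.data.semantic_kitti.kitti_dm import KittiDataModule

# Values below are the ones the scripts use for SemanticKITTI: feature=64,
# project_scale=2, full_scene_size=(256, 256, 32); the remaining hyper-parameters
# (n_classes, class_names, class_weights, dataset, ...) are restored from the checkpoint.
model = MonoScene.load_from_checkpoint(
    "trained_models/monoscene_kitti.ckpt",  # assumed checkpoint location
    feature=64,
    project_scale=2,
    fp_loss=False,  # loss flags do not matter at inference time
    full_scene_size=(256, 256, 32),
)
model.cuda()
model.eval()

data_module = KittiDataModule(
    root="/path/to/semantic_kitti",               # placeholder
    preprocess_root="/path/to/kitti/preprocess",  # placeholder
    frustum_size=4,
    batch_size=1,
    num_workers=4,
)
data_module.setup()

with torch.no_grad():
    batch = next(iter(data_module.val_dataloader()))
    batch["img"] = batch["img"].cuda()
    pred = model(batch)                       # dict containing "ssc_logit": (1, 20, 256, 256, 32)
    y_pred = pred["ssc_logit"].argmax(dim=1)  # per-voxel class ids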