Skip to content


dataloader and run multi script for OCRTOC dataset
Browse files Browse the repository at this point in the history
  • Loading branch information
rakeshshrestha31 committed Feb 18, 2022
1 parent 4717ee8 commit 57d8424
Show file tree
Hide file tree
Showing 3 changed files with 387 additions and 0 deletions.
1 change: 1 addition & 0 deletions core/dataset/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@
from loader_single import *
from loader_multi_pmodata import *
from loader_multi_realdata import *
from loader_multi_ocrtoc import *
from loader_color import *
124 changes: 124 additions & 0 deletions core/dataset/loader_multi_ocrtoc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
# This file is partly adapted from the original PMO repository.
import numpy as np
import os, sys
import copy
import json
import torch
import trimesh
import open3d as o3d
import cv2

sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', '..'))
from common.geometry import Camera

class LoaderMultiOCRTOC(
# Store the dataset root, the class identifier, the list of instance
# (sequence) folder names and the pre-processing options that the other
# methods read: `scale` and `focal` are used in __getitem__, `num_points`
# is the number of ground-truth points sampled per instance.
def __init__(self, data_dir, class_num, scale=1, num_points=10000, focal=None):
self.class_num = class_num
self.data_dir = data_dir
# Folder names (one per instance) located directly under `data_dir`;
# __len__ and __getitem__ index into this list.
# NOTE(review): every entry visible here is commented out and the closing
# bracket of the list literal is not visible in this view -- confirm the
# active entries against the repository.
self.seq_list = [
## chairs
# '1b30b6c7-b465-49d8-87e6-dd2314e53ad2',
# 'e5fc4a48-5120-48c7-9f72-b59f53a5c34e',
## sofas
# '55f7c741-d263-4049-bb8a-168d9eea1c77'
# '779cc50f-6bfe-41d9-8c27-7671bf77e450'

# scale != 1 triggers an image/mask/intrinsics rescale in __getitem__.
self.scale = scale
self.num_points = num_points
# focal=None disables the centre zoom performed by resize_image.
self.focal = focal

def __len__(self):
return len(self.seq_list)

def sample_points_from_ply(self, num_points_all, ply_fname):
    """Randomly sample ``num_points_all`` points from a PLY point cloud.

    Args:
        num_points_all: number of points to return.
        ply_fname: path of the ``.ply`` file to read with Open3D.

    Returns:
        A ``float32`` array with ``num_points_all`` rows, each row being one
        point taken from the cloud.

    Raises:
        ValueError: if no points could be read. Open3D hands back an empty
            cloud instead of raising when the file is missing or unreadable,
            so the failure is surfaced here rather than as an obscure
            sampling error.
    """
    pcd = o3d.io.read_point_cloud(ply_fname)
    points = np.asarray(pcd.points, dtype=np.float32)
    num_available = points.shape[0]
    if num_available == 0:
        raise ValueError(f'no points could be read from {ply_fname}')

    # Sample without replacement whenever the cloud is large enough (the
    # usual case); only fall back to sampling with replacement when fewer
    # points exist than were requested, which would otherwise raise.
    random_indices = np.random.choice(
        num_available,
        num_points_all,
        replace=num_available < num_points_all,
    )
    return points[random_indices, :]

def resize_image(self, image, scale):
    """Zoom ``image`` about its centre by ``scale`` while keeping its size.

    Args:
        image: array whose first two axes are height and width.
        scale: zoom factor. Values below 1 shrink the content and pad the
            border with zeros; other values enlarge the content and crop
            the centre back to the original height and width.

    Returns:
        For ``scale < 1`` a zero-initialised ``np.zeros(image.shape)`` canvas
        (float64) with the shrunken image pasted in its centre; otherwise the
        centre crop of the enlarged image.
    """
    h, w = image.shape[:2]
    ns_h, ns_w = int(h * scale), int(w * scale)

    # cv2.resize takes dsize as (width, height); passing (height, width)
    # only happens to work for square images.
    resized = cv2.resize(image, (ns_w, ns_h))

    if scale < 1:
        # Shrink: paste the smaller image into the centre of a blank canvas.
        canvas = np.zeros(image.shape)
        top, bottom = int(h / 2 - ns_h / 2), int(h / 2 + ns_h / 2)
        left, right = int(w / 2 - ns_w / 2), int(w / 2 + ns_w / 2)
        canvas[top:bottom, left:right] = resized
        return canvas

    # Enlarge: keep only the central (h, w) window of the bigger image.
    h_new, w_new = resized.shape[:2]
    top, bottom = int(h_new / 2 - h / 2), int(h_new / 2 + h / 2)
    left, right = int(w_new / 2 - w / 2), int(w_new / 2 + w / 2)
    return resized[top:bottom, left:right]

# Load one instance: iterate over the cameras stored in the instance's
# 'dist_camera_data.npz', read the matching RGB image and mask, optionally
# zoom (self.focal) and rescale (self.scale) them while keeping the camera
# intrinsics consistent, and sample the ground-truth point cloud.
# Returns (instance_name, img_list, mask_list, camera_list, points_gt).
def __getitem__(self, idx):
instance_name = self.seq_list[idx]

# NOTE(review): the remaining arguments and the closing parenthesis of this
# np.load call are not visible in this view. `cam['extr'].item()` below
# unwraps a 0-d object array, which presumably requires allow_pickle=True
# -- confirm.
cam = np.load(
os.path.join(self.data_dir, instance_name, 'dist_camera_data.npz'),

# rgb_dir = os.path.join(self.data_dir, instance_name, 'segmented_color')
rgb_dir = os.path.join(self.data_dir, instance_name, 'rgb_undistort')
mask_dir = os.path.join(self.data_dir, instance_name, 'mask')

img_list = []
mask_list = []
camera_list = []

# cam['extr'] wraps a mapping of camera id -> extrinsic; img_idx is not used
# in the visible lines.
for img_idx, (cam_id, extr) in enumerate(cam['extr'].item().items()):
# Only camera ids in [0, 53] are meant to be processed.
# NOTE(review): the body of this `if` is not visible (presumably `continue`).
if not 0 <= cam_id <= 53:

# rgba = cv2.imread(
# os.path.join(rgb_dir, f'segmented_color_{cam_id:03}.png'),
# ).astype(np.float32) / 255.0

# img_cur = rgba[..., :3]
# mask_cur = rgba[..., 3]

# cv2.imread returns None for a missing/unreadable file, in which case the
# chained .astype call raises AttributeError.
# Pixel values are normalised to [0, 1].
img_cur = cv2.imread(
os.path.join(rgb_dir, f'color_{cam_id:03}.png')
).astype(np.float32) / 255.0

mask_cur = cv2.imread(
os.path.join(mask_dir, f'mask_{cam_id:03}.png'),
).astype(np.float32) / 255.0

cam_cur = Camera(cam['intr'], extr)
# Optional centre zoom: image and mask are zoomed by self.focal and the
# focal lengths (fx, fy) are multiplied by the same factor.
# NOTE(review): np.float and np.bool were removed in NumPy 1.24; on recent
# NumPy these lines raise AttributeError (use float / bool).
if self.focal is not None:
img_cur = self.resize_image(img_cur, self.focal)
mask_cur = self.resize_image(mask_cur.astype(np.float), self.focal)
# Anything below 1 is cleared so the mask stays strictly binary
# (presumably to drop partially covered pixels created by resizing).
mask_cur[mask_cur<1] = 0
mask_cur = mask_cur.astype(np.bool)
cam_cur.intrinsic[0, 0] = cam_cur.intrinsic[0, 0]*self.focal
cam_cur.intrinsic[1, 1] = cam_cur.intrinsic[1, 1]*self.focal

# Optional uniform rescale of mask and image; the first two rows of the
# intrinsic matrix are scaled and the principal point is then reset to the
# centre of the resized image.
if self.scale != 1:
mask_cur = cv2.resize(mask_cur.astype(np.float), None, fx=self.scale, fy=self.scale)
mask_cur[mask_cur<1] = 0
mask_cur = mask_cur.astype(np.bool)
img_cur = cv2.resize(img_cur, None, fx=self.scale, fy=self.scale)
cam_cur.intrinsic[:2] = cam_cur.intrinsic[:2] * self.scale
cam_cur.intrinsic[0, 2] = img_cur.shape[1] / 2.0
cam_cur.intrinsic[1, 2] = img_cur.shape[0] / 2.0

# NOTE(review): no append to img_list / mask_list / camera_list is visible
# in this view; confirm how the per-camera results are collected.

# get gt point cloud
# NOTE(review): the closing parenthesis of os.path.join is not visible here.
ply_fname = os.path.join(
self.data_dir, instance_name, 'gt_labels_dist.ply'
points_gt = self.sample_points_from_ply(self.num_points, ply_fname)

return instance_name, img_list, mask_list, camera_list, points_gt
262 changes: 262 additions & 0 deletions
Original file line number Diff line number Diff line change
@@ -0,0 +1,262 @@
import numpy as np
import os, sys
import cv2
import torch
from tqdm import tqdm
import easydict
import open3d as o3d

from core.dataset import LoaderMultiOCRTOC
from core.utils.render_utils import *
from core.utils.decoder_utils import load_decoder
from core.visualize.visualizer import print_loss_pack_color, Visualizer
from core.visualize.vis_utils import *
from core.evaluation import *
from core.sdfrenderer import SDFRenderer_warp
from core.inv_optimizer import optimize_multi_view

# Learning rate of the Adam optimizer that updates the latent shape code.
LR = 0.01

# Object category names.
class_type = [
    'planes',
    'chairs',
    'cars',
    'sofas',
]

# Entry point of the multi-view optimisation script for the OCRTOC data:
# parse the command line, build the LoaderMultiOCRTOC dataset, and for every
# instance optimise a randomly initialised latent shape code with
# optimize_multi_view. With --full the Chamfer distances against the
# ground-truth points are accumulated and their average is printed.
# NOTE(review): several closing parentheses, an `else:` and the bodies of the
# directory-existence checks are not visible in this view.
def main():
# `mp` is imported but not used in the visible lines.
import torch.multiprocessing as mp
import argparse
arg_parser = argparse.ArgumentParser(
description="Color training pipeline."
arg_parser.add_argument('-g', '--gpu', default='0', help='gpu id.')
arg_parser.add_argument("--checkpoint", "-c", dest="checkpoint", default="2000",
help='The checkpoint weights to use. This can be a number indicated an epoch or "latest" '
+ "for the latest weights (this is the default)",
arg_parser.add_argument('--test_step', '-t', type=int, default=5, help='test step.')
arg_parser.add_argument('--visualize', action='store_true', help='visualization flag.')
arg_parser.add_argument('--data_path', default='/3d-future/ocrtoc-rendered', help='path to PMO dataset.')
arg_parser.add_argument('--obj_name', default='sofas', help='deepsdf class model for experiments. (support "planes", "chairs", "cars"')
# NOTE(review): the help text mentions 224x224 -> 112x112 (a factor of 0.5)
# while the default is 0.0933 -- confirm which one is intended.
arg_parser.add_argument('--scale', type=float, default=0.0933, help='scale the size of input image, 224x224 -> 112x112.')
arg_parser.add_argument('--focal', type=float, default=None, help='resize the image and change focal length, try 2')
arg_parser.add_argument('--full', action='store_true', help='run over all PMO data, otherwise run demo')

args = arg_parser.parse_args()
# Restrict the visible CUDA devices to the requested GPU id.
os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)

num_sample_points = 10000
num_views_per_round = 8
sep_dist = 1
# `refine_sim` is not read in the visible lines.
refine_sim = True

# load data
class_id = args.obj_name
# Directory from which load_decoder reads the decoder for this class.
exp_dir = os.path.join('deepsdf/experiments/', args.obj_name)

upper_loader = LoaderMultiOCRTOC(args.data_path, class_id, scale=args.scale, num_points=num_sample_points, focal=args.focal)
# NOTE(review): an `else:` separating the full-run settings from the demo
# settings appears to be missing; as shown, the demo values always overwrite
# the full-run ones.
if args.full:
total_num_instance = 50 # consider 50 instances in total
out_dir = os.path.join('vis/multiview_syn/', args.obj_name)
total_num_instance = len(upper_loader) # demo data
out_dir = os.path.join('vis/demo_multiview_syn/', args.obj_name)

# NOTE(review): the body of this check is not visible (presumably it creates
# out_dir).
if not os.path.exists(out_dir):

# Running sums of the two directed Chamfer distances (scaled by 1000 below).
cf_dist1_total = 0.0
cf_dist2_total = 0.0

for instance_num in range(total_num_instance):
# One output directory per instance, named by its index.
vis_dir = os.path.join(out_dir, '{}'.format(instance_num))
# NOTE(review): body not visible (presumably creates vis_dir).
if not os.path.exists(vis_dir):

instance_name, imgs, masks, cameras, points_gt = upper_loader[instance_num]

# visualize_3d(points_gt, cameras, args.obj_name)
# exit(0)

# RANDOMLY initialize shape code
# The code is drawn from N(0, std_^2), moved to the GPU and made a leaf
# tensor that the optimizer updates.
latent_size = 256
std_ = 0.1
shape_code = torch.ones(1, latent_size).normal_(mean=0, std=std_)
shape_code = shape_code.float().cuda()
shape_code.requires_grad = True

# The decoder is (re)loaded for every instance; only the shape code is
# passed to the optimizer.
decoder = load_decoder(exp_dir, args.checkpoint)
decoder = decoder.module.cuda()
optimizer_latent = torch.optim.Adam([shape_code], lr=LR)

img_h, img_w = imgs[0].shape[0], imgs[0].shape[1]
img_hw = (img_h, img_w)
print('Image size: {0}.'. format(img_hw))
# The renderer is built from the first camera's intrinsics.
# NOTE(review): THRESHOLD is not defined in the visible part of this file;
# presumably it is provided by one of the star imports -- confirm. The
# closing parenthesis of this call is also not visible.
sdf_renderer = SDFRenderer_warp(
decoder, cameras[0].intrinsic, march_step=200, buffer_size=1, threshold=THRESHOLD,
transform_matrix=np.array([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]])
evaluator = Evaluator(decoder)
visualizer = Visualizer(img_hw)

# with torch.no_grad():
# visualize_renderer(
# sdf_renderer, evaluator, shape_code, imgs, cameras,
# points_gt, args.obj_name
# )
# exit(0)

# Loss weights handed to optimize_multi_view.
weight_list = {}
weight_list['color'] = 5.0
weight_list['l2reg'] = 1.0

shape_code, optimizer_latent = optimize_multi_view(sdf_renderer, evaluator, shape_code, optimizer_latent, imgs, cameras, weight_list, num_views_per_round=num_views_per_round, num_iters=50, sep_dist=sep_dist, num_sample_points=num_sample_points, visualizer=visualizer, points_gt=points_gt, vis_dir=vis_dir, vis_flag=args.visualize, full_flag=args.full)

if args.full:
# final evaluation
points_tmp = evaluator.latent_vec_to_points(shape_code, num_points=num_sample_points, silent=True)
dist1, dist2 = evaluator.compute_chamfer_distance(points_gt, points_tmp, separate=True)

# Distances are accumulated after multiplying by 1000.
cf_dist1_total += dist1 * 1000
cf_dist2_total += dist2 * 1000

if args.full:
print('Final Average Chamfer Loss: ', cf_dist1_total / total_num_instance, cf_dist2_total / total_num_instance)
print('Finished. check results {}'.format(out_dir))

# Debug helper: open an Open3D viewer showing the ground-truth points, a mesh
# loaded from the demo output directory of `obj_name`, and the coordinate
# frame of the first camera.
# NOTE(review): parts of the list comprehension and of the mesh-loading call
# are not visible in this view.
def visualize_3d(points_gt, cameras, obj_name):
import open3d as o3d
pcd = o3d.geometry.PointCloud()
pcd.points = o3d.utility.Vector3dVector(points_gt)
# One small coordinate frame per camera. Each camera.extrinsic is extended
# with a [0, 0, 0, 1] row and inverted, so it must be a 3x4 matrix; the
# inverse is presumably the camera pose in world coordinates -- confirm.
axes = [
.create_coordinate_frame(size=0.1, origin=[0, 0, 0])
np.linalg.inv(np.concatenate((camera.extrinsic, [[0, 0, 0, 1]]), axis=0))
for camera in cameras
# Mesh path is built from obj_name and always points at instance 0.
mesh ='vis/demo_multiview_syn/{obj_name}/0/mesh_initial.ply')

# Only the first camera frame is drawn.
o3d.visualization.draw_geometries([pcd, mesh] + axes[:1])

# Convert a rendered depth map into an Open3D point cloud using the pinhole
# parameters (fx, fy, cx, cy) read from camera.intrinsic.
# NOTE(review): several closing parentheses and some call arguments are not
# visible in this view.
def depth_to_point_cloud(depth, camera):
# Append a [0, 0, 0, 1] row to the extrinsic and invert the resulting matrix.
T_world_camUEN = np.linalg.inv(np.concatenate(
(camera.extrinsic, [[0, 0, 0, 1]]), axis=0

# DIST has weird pose convention (Up-East-North). Convert to East-Down-North
R_EDN_UEN = np.asarray([
0, 1, 0, -1, 0, 0, 0, 0, 1
]).reshape((3, 3))

# Same matrix as above with its rotation part re-expressed via R_EDN_UEN.
# NOTE(review): no use of T_world_camEDN is visible in this view.
T_world_camEDN = T_world_camUEN.copy()
T_world_camEDN[:3, :3] = T_world_camUEN[:3, :3] @ R_EDN_UEN.T

# depth is unpacked as a 2-D (height, width) array.
h, w = depth.shape

depth_pcd = o3d.geometry.PointCloud.create_from_depth_image(
width=w, height=h,
fx=camera.intrinsic[0, 0], fy=camera.intrinsic[1, 1],
cx=camera.intrinsic[0, 2], cy=camera.intrinsic[1, 2],

return depth_pcd

# Debug helper: render a depth map of `shape_code` from every camera, stack
# each depth map next to its input image, write the result to
# '/tmp/viz_imgs.png', and prepare Open3D geometry (camera frame, mesh) for
# display. When `sim3` is given, the camera poses are first composed with the
# similarity transform built from `sim3` and `sim3_init`.
# NOTE(review): many lines of this function (closing parentheses, an `else:`,
# list appends, the mesh export and the final draw call) are not visible in
# this view.
def visualize_renderer(
sdf_renderer, evaluator, shape_code, imgs, cameras, points_gt, obj_name,
sim3=None, sim3_init=None
viz_imgs = []
depth_pcds = []
# All images are taken to share the size of the first one.
h, w = imgs[0].shape[:2]

from core.utils.train_utils import params_to_mtrx
# Compose the optimised similarity parameters with the initial transform:
# translation first (it uses the not-yet-updated rotation block), then the
# rotation block; the scale is the norm of that block divided by sqrt(3).
if sim3 is not None:
sim_mtrx = params_to_mtrx(sim3).clone()
sim_mtrx[:, 3] = torch.matmul(sim_mtrx[:3, :3].clone(), sim3_init[:, 3]) + sim_mtrx[:, 3].clone()
sim_mtrx[:3, :3] = torch.matmul(sim_mtrx[:3, :3].clone(), sim3_init[:3, :3])
sim3_scale = torch.norm(sim_mtrx[:3, :3].clone())/np.sqrt(3)
# NOTE(review): an `else:` appears to be missing before the two resets below.
sim_mtrx = None
sim3_scale = None

# NOTE(review): no append to `extrinsics` is visible in this view.
extrinsics = []
for idx, (camera, img) in enumerate(zip(cameras, imgs)):
# Split the extrinsic into its first three columns (R) and last column (T)
# and move both to the GPU as float tensors.
R, T = camera.extrinsic[:,:3], camera.extrinsic[:,3]
R, T = torch.from_numpy(R).float().cuda(), torch.from_numpy(T).float().cuda()

# Apply the similarity transform to the camera pose and divide out its scale.
if sim3 is not None:
T = torch.matmul(R, sim_mtrx[:, 3]) + T
R = torch.matmul(R, sim_mtrx[:3, :3])
R = R / sim3_scale
T = T / sim3_scale

extrinsic = torch.from_numpy(camera.extrinsic).float().cuda()
extrinsic[:3, :3] = R
extrinsic[:3, 3] = T

depth, mask, _ = sdf_renderer.render_depth(
shape_code, R, T, no_grad=True
# Pixels outside the rendered mask get depth 0.
depth[~mask] = 0
depth = depth.reshape(h, w).contiguous()

depth_colored = depth.unsqueeze(-1).expand(-1, -1, 3) \
rgb = img.detach().cpu().numpy()
# Depth (replicated to 3 channels) and the input image side by side,
# converted to 8-bit.
viz_img = np.concatenate((depth_colored, rgb), axis=1)
viz_img = (viz_img * 255).astype(np.uint8)
depth_pcds.append(depth_to_point_cloud(depth, camera))

# Stack the per-view rows vertically and write them to disk.
# NOTE(review): no append to `viz_imgs` is visible in this view.
viz_imgs = np.concatenate(viz_imgs, axis=0)
cv2.imwrite('/tmp/viz_imgs.png', viz_imgs)

# gt_pcd = o3d.geometry.PointCloud()
# gt_pcd.points = o3d.utility.Vector3dVector(points_gt)

T_world_camsUEN = [
(extrinsic.detach().cpu().numpy(), [[0, 0, 0, 1]]), axis=0
for extrinsic in extrinsics
axes_UEN = [
.create_coordinate_frame(size=0.1, origin=[0, 0, 0])
for T_world_cam in T_world_camsUEN

shape_code, num_points=30000, fname='/tmp/mesh_initial.ply', silent=False
mesh ='/tmp/mesh_initial.ply')

# *depth_pcds,
*axes_UEN[0:1], # gt_pcd,
o3d.geometry.TriangleMesh.create_coordinate_frame(size=1, origin=[0, 0, 0])

# Script entry point: run the optimisation pipeline only when this file is
# executed directly, not when it is imported.
if __name__ == '__main__':
    # Uncomment to fix the random seeds for reproducible runs.
    # seed = 123
    # torch.manual_seed(seed)
    # np.random.seed(seed)
    # import random
    # random.seed(seed)
    main()

0 comments on commit 57d8424

Please sign in to comment.