import gradio as gr import numpy as np import cv2 from tqdm import tqdm import torch from pytorch3d.io.obj_io import load_obj import tempfile import main_mcc import mcc_model import util.misc as misc from engine_mcc import prepare_data from plyfile import PlyData, PlyElement def run_inference(model, samples, device, temperature, args): model.eval() seen_xyz, valid_seen_xyz, unseen_xyz, unseen_rgb, labels, seen_images = prepare_data( samples, device, is_train=False, args=args, is_viz=True ) pred_occupy = [] pred_colors = [] max_n_unseen_fwd = 2000 model.cached_enc_feat = None num_passes = int(np.ceil(unseen_xyz.shape[1] / max_n_unseen_fwd)) for p_idx in range(num_passes): p_start = p_idx * max_n_unseen_fwd p_end = (p_idx + 1) * max_n_unseen_fwd cur_unseen_xyz = unseen_xyz[:, p_start:p_end] cur_unseen_rgb = unseen_rgb[:, p_start:p_end].zero_() cur_labels = labels[:, p_start:p_end].zero_() with torch.no_grad(): _, pred = model( seen_images=seen_images, seen_xyz=seen_xyz, unseen_xyz=cur_unseen_xyz, unseen_rgb=cur_unseen_rgb, unseen_occupy=cur_labels, cache_enc=True, valid_seen_xyz=valid_seen_xyz, ) if device == "cuda": pred_occupy.append(pred[..., 0].cuda()) else: pred_occupy.append(pred[..., 0].cpu()) if args.regress_color: pred_colors.append(pred[..., 1:].reshape((-1, 3))) else: pred_colors.append( ( torch.nn.Softmax(dim=2)( pred[..., 1:].reshape((-1, 3, 256)) / temperature ) * torch.linspace(0, 1, 256, device=pred.device) ).sum(axis=2) ) pred_occupy = torch.cat(pred_occupy, dim=1) pred_occupy = torch.nn.Sigmoid()(pred_occupy) return torch.cat(pred_colors, dim=0).cpu().numpy(), pred_occupy.cpu().numpy(), unseen_xyz.cpu().numpy() def pad_image(im, value): if im.shape[0] > im.shape[1]: diff = im.shape[0] - im.shape[1] return torch.cat([im, (torch.zeros((im.shape[0], diff, im.shape[2])) + value)], dim=1) else: diff = im.shape[1] - im.shape[0] return torch.cat([im, (torch.zeros((diff, im.shape[1], im.shape[2])) + value)], dim=0) def backproject_depth_to_pointcloud(depth, rotation=np.eye(3), translation=np.zeros(3)): # Calculate the principal point as the center of the image principal_point = [depth.shape[1] / 2, depth.shape[0] / 2] intrinsics = get_intrinsics(depth.shape[0], depth.shape[1], principal_point) intrinsics = get_intrinsics(depth.shape[0], depth.shape[1], principal_point) # Get the depth map shape height, width = depth.shape # Create a matrix of pixel coordinates u, v = np.meshgrid(np.arange(width), np.arange(height)) uv_homogeneous = np.stack((u, v, np.ones_like(u)), axis=-1).reshape(-1, 3) # Invert the intrinsic matrix inv_intrinsics = np.linalg.inv(intrinsics) # Convert depth to the camera coordinate system points_cam_homogeneous = np.dot(uv_homogeneous, inv_intrinsics.T) * depth.flatten()[:, np.newaxis] # Convert to 3D homogeneous coordinates points_cam_homogeneous = np.concatenate((points_cam_homogeneous, np.ones((len(points_cam_homogeneous), 1))), axis=1) # Apply the rotation and translation to get the 3D point cloud in the world coordinate system extrinsics = np.hstack((rotation, translation[:, np.newaxis])) pointcloud = np.dot(points_cam_homogeneous, extrinsics.T) pointcloud[:, 1:] *= -1 # Reshape the point cloud back to the original depth map shape pointcloud = pointcloud[:, :3].reshape(height, width, 3) return pointcloud # estimate camera intrinsics def get_intrinsics(H,W, principal_point): """ Intrinsics for a pinhole camera model. Assume fov of 55 degrees and central principal point of bounding box. """ f = 0.5 * W / np.tan(0.5 * 55 * np.pi / 180.0) cx, cy = principal_point return np.array([[f, 0, cx], [0, f, cy], [0, 0, 1]]) def normalize(seen_xyz): seen_xyz = seen_xyz / (seen_xyz[torch.isfinite(seen_xyz.sum(dim=-1))].var(dim=0) ** 0.5).mean() seen_xyz = seen_xyz - seen_xyz[torch.isfinite(seen_xyz.sum(dim=-1))].mean(axis=0) return seen_xyz def infer( image, depth_image, seg, granularity, temperature, ): args.viz_granularity = granularity rgb = image depth_image = cv2.imread(depth_image.name, -1) depth_image = depth_image.astype(np.float32) / 256 seen_xyz = backproject_depth_to_pointcloud(depth_image) seen_rgb = (torch.tensor(rgb).float() / 255)[..., [2, 1, 0]] H, W = seen_rgb.shape[:2] seen_rgb = torch.nn.functional.interpolate( seen_rgb.permute(2, 0, 1)[None], size=[H, W], mode="bilinear", align_corners=False, )[0].permute(1, 2, 0) seg = cv2.imread(seg.name, cv2.IMREAD_UNCHANGED) mask = torch.tensor(cv2.resize(seg, (W, H))).bool() seen_xyz[~mask] = float('inf') seen_xyz = torch.tensor(seen_xyz).float() seen_xyz = normalize(seen_xyz) bottom, right = mask.nonzero().max(dim=0)[0] top, left = mask.nonzero().min(dim=0)[0] bottom = bottom + 40 right = right + 40 top = max(top - 40, 0) left = max(left - 40, 0) seen_xyz = seen_xyz[top:bottom+1, left:right+1] seen_rgb = seen_rgb[top:bottom+1, left:right+1] seen_xyz = pad_image(seen_xyz, float('inf')) seen_rgb = pad_image(seen_rgb, 0) seen_rgb = torch.nn.functional.interpolate( seen_rgb.permute(2, 0, 1)[None], size=[800, 800], mode="bilinear", align_corners=False, ) seen_xyz = torch.nn.functional.interpolate( seen_xyz.permute(2, 0, 1)[None], size=[112, 112], mode="bilinear", align_corners=False, ).permute(0, 2, 3, 1) samples = [ [seen_xyz, seen_rgb], [torch.zeros((20000, 3)), torch.zeros((20000, 3))], ] pred_colors, pred_occupy, unseen_xyz = run_inference(model, samples, device, temperature, args) _masks = pred_occupy > 0.1 unseen_xyz = unseen_xyz[_masks] pred_colors = pred_colors[None, ...][_masks] * 255 # Prepare data for PlyElement vertex = np.core.records.fromarrays(np.hstack((unseen_xyz, pred_colors)).transpose(), names='x, y, z, red, green, blue', formats='f8, f8, f8, u1, u1, u1') # Create PlyElement element = PlyElement.describe(vertex, 'vertex') # Save point cloud data to a temporary file with tempfile.NamedTemporaryFile(suffix=".ply", delete=False) as f: PlyData([element], text=True).write(f) temp_file_name = f.name return temp_file_name if __name__ == '__main__': device = "cuda" if torch.cuda.is_available() else "cpu" parser = main_mcc.get_args_parser() parser.set_defaults(eval=True) args = parser.parse_args() model = mcc_model.get_mcc_model( occupancy_weight=1.0, rgb_weight=0.01, args=args, ) if device == "cuda": model = model.cuda() misc.load_model(args=args, model_without_ddp=model, optimizer=None, loss_scaler=None) demo = gr.Interface(fn=infer, inputs=[gr.Image(label="Input Image"), gr.File(label="Depth Image"), gr.File(label="Segmentation File"), gr.Slider(minimum=0.05, maximum=0.5, step=0.05, value=0.2, label="Grain Size"), gr.Slider(minimum=0, maximum=1.0, step=0.1, value=0.1, label="Color Temperature") ], outputs=[gr.outputs.File(label="Point Cloud")], examples=[["demo/quest2.jpg", "demo/quest2_depth.png", "demo/quest2_seg.png", 0.2, 0.1]], cache_examples=True) demo.launch(server_name="0.0.0.0", server_port=7860)