import gradio as gr import numpy as np from torchvision import transforms import torch from helpers import * import sys import csv from monoscene.monoscene import MonoScene csv.field_size_limit(sys.maxsize) torch.set_grad_enabled(False) model = MonoScene.load_from_checkpoint( "monoscene_kitti.ckpt", dataset="kitti", n_classes=20, feature = 64, project_scale = 4, full_scene_size = (256, 256, 32), ) img_W, img_H = 1220, 370 def predict(img): img = np.array(img, dtype=np.float32, copy=False) / 255.0 normalize_rgb = transforms.Compose( [ transforms.ToTensor(), transforms.Normalize( mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] ), ] ) img = normalize_rgb(img) batch = get_projections(img_W, img_H) batch["img"] = img for k in batch: batch[k] = batch[k].unsqueeze(0)#.cuda() pred = model(batch).squeeze() fig = draw(pred, batch['fov_mask_2']) return fig description = """ MonoScene Demo on SemanticKITTI Validation Set (Sequence 08), which uses the camera parameters of Sequence 08. Due to the CPU-only inference, it might take up to 20s to predict a scene. \n This is a smaller model with half resolution and w/o 3D CRP. You can find the full model at: https://huggingface.co/spaces/CVPR/MonoScene