import os
import subprocess

import cv2
import depth_pro
import gradio as gr
import rerun as rr
import rerun.blueprint as rrb
import spaces
import torch
from gradio_rerun import Rerun

# Run the script to get pretrained models.
if not os.path.exists("checkpoints/depth_pro.pt"):
    print("downloading pretrained model")
    subprocess.run(["bash", "get_pretrained_models.sh"])

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Load model and preprocessing transform.
print("loading model...")
model, transform = depth_pro.create_model_and_transforms()
model = model.to(device)
model.eval()


@spaces.GPU(duration=20)
def estimate_depth(frame):
    """Run DepthPro on a single RGB frame; return the depth map and estimated focal length."""
    image = transform(frame)
    image = image.to(device)
    prediction = model.infer(image)
    depth = prediction["depth"].squeeze().detach().cpu().numpy()
    focal_length = prediction["focallength_px"].item()
    return depth, focal_length


@rr.thread_local_stream("rerun_example_ml_depth_pro")
def run_rerun(path_to_video):
    stream = rr.binary_stream()

    # Lay out the viewer: a 3D view on top, the depth and RGB images below.
    blueprint = rrb.Blueprint(
        rrb.Vertical(
            rrb.Spatial3DView(origin="/"),
            rrb.Horizontal(
                rrb.Spatial2DView(origin="/world/camera/depth"),
                rrb.Spatial2DView(origin="/world/camera/image"),
            ),
        ),
        collapse_panels=True,
    )
    rr.send_blueprint(blueprint)
    yield stream.read()

    print("Loading video from", path_to_video)
    video = cv2.VideoCapture(path_to_video)
    frame_idx = -1
    while True:
        read, frame = video.read()
        if not read:
            break
        frame_idx += 1

        # Only process every 10th frame to keep the demo responsive.
        if frame_idx % 10 != 0:
            continue

        # Downscale and convert from OpenCV's BGR to RGB.
        frame = cv2.resize(frame, (320, 240))
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        rr.set_time_sequence("frame", frame_idx)
        rr.log("world/camera/image", rr.Image(frame))
        yield stream.read()

        depth, focal_length = estimate_depth(frame)

        # Log the camera intrinsics so the depth image back-projects into the 3D view.
        rr.log(
            "world/camera",
            rr.Pinhole(
                width=frame.shape[1],
                height=frame.shape[0],
                focal_length=focal_length,
                principal_point=(frame.shape[1] / 2, frame.shape[0] / 2),
                image_plane_distance=depth.max(),
            ),
        )
        rr.log(
            "world/camera/depth",
            # needs Rerun 0.19 stable for this:
            # rr.DepthImage(depth, meter=1, depth_range=(depth.min(), depth.max())),
            rr.DepthImage(depth, meter=1),
        )
        yield stream.read()


with gr.Blocks() as demo:
    video = gr.Video(interactive=True, include_audio=False, label="Video")
    visualize = gr.Button("Visualize ML Depth Pro")
    with gr.Row():
        viewer = Rerun(streaming=True)
    visualize.click(run_rerun, inputs=[video], outputs=[viewer])


if __name__ == "__main__":
    demo.launch()