kxhit committed on
Commit
5ca3a35
1 Parent(s): d161cfd

cuda reinit?

app.py CHANGED
@@ -268,7 +268,7 @@ from dust3r.viz import add_scene_cam, CAM_COLORS, OPENGL, pts3d_to_trimesh, cat_
 from dust3r.cloud_opt import global_aligner, GlobalAlignerMode
 import math
 
-@spaces.GPU(duration=120)
+# @spaces.GPU(duration=120)
 def _convert_scene_output_to_glb(outdir, imgs, pts3d, mask, focals, cams2world, cam_size=0.05,
                                  cam_color=None, as_pointcloud=False,
                                  transparent_cams=False, silent=False, same_focals=False):
@@ -321,7 +321,7 @@ def _convert_scene_output_to_glb(outdir, imgs, pts3d, mask, focals, cams2world,
     scene.export(file_obj=outfile)
     return outfile
 
-@spaces.GPU(duration=120)
+# @spaces.GPU(duration=120)
 def get_3D_model_from_scene(outdir, silent, scene, min_conf_thr=3, as_pointcloud=False, mask_sky=False,
                             clean_depth=False, transparent_cams=False, cam_size=0.05, same_focals=False):
     """
app_bk.py ADDED
@@ -0,0 +1,786 @@
1
+ import spaces
2
+ import torch
3
+ print("cuda is available: ", torch.cuda.is_available())
4
+
5
+ import gradio as gr
6
+ import os
7
+ import shutil
8
+ import rembg
9
+ import numpy as np
10
+ import math
11
+ import open3d as o3d
12
+ from PIL import Image
13
+ import torchvision
14
+ import trimesh
15
+ from skimage.io import imsave
16
+ import imageio
17
+ import cv2
18
+ import matplotlib.pyplot as pl
19
+ pl.ion()
20
+
21
+ CaPE_TYPE = "6DoF"
22
+ device = 'cuda' #if torch.cuda.is_available() else 'cpu'
23
+ weight_dtype = torch.float16
24
+ torch.backends.cuda.matmul.allow_tf32 = True # for gpu >= Ampere and pytorch >= 1.12
25
+
26
+ # EscherNet
27
+ # create angles in archimedean spiral with N steps
28
+ def get_archimedean_spiral(sphere_radius, num_steps=250):
29
+ # x-z plane, around upper y
30
+ '''
31
+ https://en.wikipedia.org/wiki/Spiral, section "Spherical spiral". c = a / pi
32
+ '''
33
+ a = 40
34
+ r = sphere_radius
35
+
36
+ translations = []
37
+ angles = []
38
+
39
+ # i = a / 2
40
+ i = 0.01
41
+ while i < a:
42
+ theta = i / a * math.pi
43
+ x = r * math.sin(theta) * math.cos(-i)
44
+ z = r * math.sin(-theta + math.pi) * math.sin(-i)
45
+ y = r * - math.cos(theta)
46
+
47
+ # translations.append((x, y, z)) # origin
48
+ translations.append((x, z, -y))
49
+ angles.append([np.rad2deg(-i), np.rad2deg(theta)])
50
+
51
+ # i += a / (2 * num_steps)
52
+ i += a / (1 * num_steps)
53
+
54
+ return np.array(translations), np.stack(angles)
55
+
56
+ def look_at(origin, target, up):
57
+ forward = (target - origin)
58
+ forward = forward / np.linalg.norm(forward)
59
+ right = np.cross(up, forward)
60
+ right = right / np.linalg.norm(right)
61
+ new_up = np.cross(forward, right)
62
+ rotation_matrix = np.column_stack((right, new_up, -forward, target))
63
+ matrix = np.row_stack((rotation_matrix, [0, 0, 0, 1]))
64
+ return matrix
65
+
66
+ import einops
67
+ import sys
68
+
69
+ sys.path.insert(0, "./6DoF/") # TODO change it when deploying
70
+ # use the customized diffusers modules
71
+ from diffusers import DDIMScheduler
72
+ from dataset import get_pose
73
+ from CN_encoder import CN_encoder
74
+ from pipeline_zero1to3 import Zero1to3StableDiffusionPipeline
75
+ from segment_anything import sam_model_registry, SamPredictor
76
+
77
+ # import rembg
78
+ from carvekit.api.high import HiInterface
79
+
80
+
81
+ pretrained_model_name_or_path = "kxic/EscherNet_demo"
82
+ resolution = 256
83
+ h,w = resolution,resolution
84
+ guidance_scale = 3.0
85
+ radius = 2.2
86
+ bg_color = [1., 1., 1., 1.]
87
+ image_transforms = torchvision.transforms.Compose(
88
+ [
89
+ torchvision.transforms.Resize((resolution, resolution)), # 256, 256
90
+ torchvision.transforms.ToTensor(),
91
+ torchvision.transforms.Normalize([0.5], [0.5])
92
+ ]
93
+ )
94
+ xyzs_spiral, angles_spiral = get_archimedean_spiral(1.5, 200)
95
+ # only use the first half of the spiral
96
+ xyzs_spiral = xyzs_spiral[:100]
97
+ angles_spiral = angles_spiral[:100]
98
+
99
+ # Init pipeline
100
+ scheduler = DDIMScheduler.from_pretrained(pretrained_model_name_or_path, subfolder="scheduler", revision=None)
101
+ image_encoder = CN_encoder.from_pretrained(pretrained_model_name_or_path, subfolder="image_encoder", revision=None)
102
+ pipeline = Zero1to3StableDiffusionPipeline.from_pretrained(
103
+ pretrained_model_name_or_path,
104
+ revision=None,
105
+ scheduler=scheduler,
106
+ image_encoder=None,
107
+ safety_checker=None,
108
+ feature_extractor=None,
109
+ torch_dtype=weight_dtype,
110
+ )
111
+ pipeline.image_encoder = image_encoder.to(weight_dtype)
112
+
113
+ pipeline.set_progress_bar_config(disable=False)
114
+
115
+ pipeline = pipeline.to(device)
116
+
117
+ # pipeline.enable_xformers_memory_efficient_attention()
118
+ # enable vae slicing
119
+ pipeline.enable_vae_slicing()
120
+ # pipeline.enable_xformers_memory_efficient_attention()
121
+
122
+
123
+ #### object segmentation
124
+ def sam_init():
125
+ sam_checkpoint = os.path.join("./sam_pt/sam_vit_h_4b8939.pth")
126
+ if os.path.exists(sam_checkpoint) is False:
127
+ os.system("wget https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth -P ./sam_pt/")
128
+ model_type = "vit_h"
129
+
130
+ sam = sam_model_registry[model_type](checkpoint=sam_checkpoint).to(device=device)
131
+ predictor = SamPredictor(sam)
132
+ return predictor
133
+
134
+ def create_carvekit_interface():
135
+ # Check doc strings for more information
136
+ interface = HiInterface(object_type="object", # Can be "object" or "hairs-like".
137
+ batch_size_seg=6,
138
+ batch_size_matting=1,
139
+ device="cpu",
140
+ seg_mask_size=640, # Use 640 for Tracer B7 and 320 for U2Net
141
+ matting_mask_size=2048,
142
+ trimap_prob_threshold=231,
143
+ trimap_dilation=30,
144
+ trimap_erosion_iters=5,
145
+ fp16=True)
146
+
147
+ return interface
148
+
149
+
150
+ # rembg_session = rembg.new_session()
151
+ rembg_session = create_carvekit_interface()
152
+ predictor = sam_init()
153
+
154
+
155
+
156
+ @spaces.GPU(duration=120)
157
+ def run_eschernet(eschernet_input_dict, sample_steps, sample_seed, nvs_num, nvs_mode):
158
+ # set the random seed
159
+ generator = torch.Generator(device=device).manual_seed(sample_seed)
160
+ # generator = None
161
+ T_out = nvs_num
162
+ T_in = len(eschernet_input_dict['imgs'])
163
+ ####### output pose
164
+ # TODO choose T_out number of poses sequentially from the spiral
165
+ xyzs = xyzs_spiral[::(len(xyzs_spiral) // T_out)]
166
+ angles_out = angles_spiral[::(len(xyzs_spiral) // T_out)]
167
+
168
+ ####### input's max radius for translation scaling
169
+ radii = eschernet_input_dict['radii']
170
+ max_t = np.max(radii)
171
+ min_t = np.min(radii)
172
+
173
+ ####### input pose
174
+ pose_in = []
175
+ for T_in_index in range(T_in):
176
+ pose = get_pose(np.linalg.inv(eschernet_input_dict['poses'][T_in_index]))
177
+ pose[1:3, :] *= -1 # coordinate system conversion
178
+ pose[3, 3] *= 1. / max_t * radius # scale radius to [1.5, 2.2]
179
+ pose_in.append(torch.from_numpy(pose))
180
+
181
+ ####### input image
182
+ img = eschernet_input_dict['imgs'] / 255.
183
+ img[img[:, :, :, -1] == 0.] = bg_color
184
+ # TODO batch image_transforms
185
+ input_image = [image_transforms(Image.fromarray(np.uint8(im[:, :, :3] * 255.)).convert("RGB")) for im in img]
186
+
187
+ ####### nvs pose
188
+ pose_out = []
189
+ for T_out_index in range(T_out):
190
+ azimuth, polar = angles_out[T_out_index]
191
+ if CaPE_TYPE == "4DoF":
192
+ pose_out.append(torch.tensor([np.deg2rad(polar), np.deg2rad(azimuth), 0., 0.]))
193
+ elif CaPE_TYPE == "6DoF":
194
+ pose = look_at(origin=np.array([0, 0, 0]), target=xyzs[T_out_index], up=np.array([0, 0, 1]))
195
+ pose = np.linalg.inv(pose)
196
+ pose[2, :] *= -1
197
+ pose_out.append(torch.from_numpy(get_pose(pose)))
198
+
199
+
200
+
201
+ # [B, T, C, H, W]
202
+ input_image = torch.stack(input_image, dim=0).to(device).to(weight_dtype).unsqueeze(0)
203
+ # [B, T, 4]
204
+ pose_in = np.stack(pose_in)
205
+ pose_out = np.stack(pose_out)
206
+
207
+ if CaPE_TYPE == "6DoF":
208
+ pose_in_inv = np.linalg.inv(pose_in).transpose([0, 2, 1])
209
+ pose_out_inv = np.linalg.inv(pose_out).transpose([0, 2, 1])
210
+ pose_in_inv = torch.from_numpy(pose_in_inv).to(device).to(weight_dtype).unsqueeze(0)
211
+ pose_out_inv = torch.from_numpy(pose_out_inv).to(device).to(weight_dtype).unsqueeze(0)
212
+
213
+ pose_in = torch.from_numpy(pose_in).to(device).to(weight_dtype).unsqueeze(0)
214
+ pose_out = torch.from_numpy(pose_out).to(device).to(weight_dtype).unsqueeze(0)
215
+
216
+ input_image = einops.rearrange(input_image, "b t c h w -> (b t) c h w")
217
+ assert T_in == input_image.shape[0]
218
+ assert T_in == pose_in.shape[1]
219
+ assert T_out == pose_out.shape[1]
220
+
221
+ # run inference
222
+ # pipeline.to(device)
223
+ pipeline.enable_xformers_memory_efficient_attention()
224
+ image = pipeline(input_imgs=input_image, prompt_imgs=input_image,
225
+ poses=[[pose_out, pose_out_inv], [pose_in, pose_in_inv]],
226
+ height=h, width=w, T_in=T_in, T_out=T_out,
227
+ guidance_scale=guidance_scale, num_inference_steps=50, generator=generator,
228
+ output_type="numpy").images
229
+
230
+ # save output image
231
+ output_dir = os.path.join(tmpdirname, "eschernet")
232
+ if os.path.exists(output_dir):
233
+ shutil.rmtree(output_dir)
234
+ os.makedirs(output_dir, exist_ok=True)
235
+ # # save to N imgs
236
+ # for i in range(T_out):
237
+ # imsave(os.path.join(output_dir, f'{i}.png'), (image[i] * 255).astype(np.uint8))
238
+ # make a gif
239
+ frames = [Image.fromarray((image[i] * 255).astype(np.uint8)) for i in range(T_out)]
240
+ # frame_one = frames[0]
241
+ # frame_one.save(os.path.join(output_dir, "output.gif"), format="GIF", append_images=frames,
242
+ # save_all=True, duration=50, loop=1)
243
+
244
+ # get a video
245
+ video_path = os.path.join(output_dir, "output.mp4")
246
+ imageio.mimwrite(video_path, np.stack(frames), fps=10, codec='h264')
247
+
248
+
249
+ return video_path
250
+
251
+ # TODO mesh it
252
+ @spaces.GPU(duration=120)
253
+ def make3d():
254
+ pass
255
+
256
+
257
+
258
+ ############################ Dust3r as Pose Estimation ############################
259
+ from scipy.spatial.transform import Rotation
260
+ import copy
261
+
262
+ from dust3r.inference import inference
263
+ from dust3r.model import AsymmetricCroCo3DStereo
264
+ from dust3r.image_pairs import make_pairs
265
+ from dust3r.utils.image import load_images, rgb
266
+ from dust3r.utils.device import to_numpy
267
+ from dust3r.viz import add_scene_cam, CAM_COLORS, OPENGL, pts3d_to_trimesh, cat_meshes
268
+ from dust3r.cloud_opt import global_aligner, GlobalAlignerMode
269
+ import math
270
+
271
+ @spaces.GPU(duration=120)
272
+ def _convert_scene_output_to_glb(outdir, imgs, pts3d, mask, focals, cams2world, cam_size=0.05,
273
+ cam_color=None, as_pointcloud=False,
274
+ transparent_cams=False, silent=False, same_focals=False):
275
+ assert len(pts3d) == len(mask) <= len(imgs) <= len(cams2world)
276
+ if not same_focals:
277
+ assert (len(cams2world) == len(focals))
278
+ pts3d = to_numpy(pts3d)
279
+ imgs = to_numpy(imgs)
280
+ focals = to_numpy(focals)
281
+ cams2world = to_numpy(cams2world)
282
+
283
+ scene = trimesh.Scene()
284
+
285
+ # add axes
286
+ scene.add_geometry(trimesh.creation.axis(axis_length=0.5, axis_radius=0.001))
287
+
288
+ # full pointcloud
289
+ if as_pointcloud:
290
+ pts = np.concatenate([p[m] for p, m in zip(pts3d, mask)])
291
+ col = np.concatenate([p[m] for p, m in zip(imgs, mask)])
292
+ pct = trimesh.PointCloud(pts.reshape(-1, 3), colors=col.reshape(-1, 3))
293
+ scene.add_geometry(pct)
294
+ else:
295
+ meshes = []
296
+ for i in range(len(imgs)):
297
+ meshes.append(pts3d_to_trimesh(imgs[i], pts3d[i], mask[i]))
298
+ mesh = trimesh.Trimesh(**cat_meshes(meshes))
299
+ scene.add_geometry(mesh)
300
+
301
+ # add each camera
302
+ for i, pose_c2w in enumerate(cams2world):
303
+ if isinstance(cam_color, list):
304
+ camera_edge_color = cam_color[i]
305
+ else:
306
+ camera_edge_color = cam_color or CAM_COLORS[i % len(CAM_COLORS)]
307
+ if same_focals:
308
+ focal = focals[0]
309
+ else:
310
+ focal = focals[i]
311
+ add_scene_cam(scene, pose_c2w, camera_edge_color,
312
+ None if transparent_cams else imgs[i], focal,
313
+ imsize=imgs[i].shape[1::-1], screen_width=cam_size)
314
+
315
+ rot = np.eye(4)
316
+ rot[:3, :3] = Rotation.from_euler('y', np.deg2rad(180)).as_matrix()
317
+ scene.apply_transform(np.linalg.inv(cams2world[0] @ OPENGL @ rot))
318
+ outfile = os.path.join(outdir, 'scene.glb')
319
+ if not silent:
320
+ print('(exporting 3D scene to', outfile, ')')
321
+ scene.export(file_obj=outfile)
322
+ return outfile
323
+
324
+ @spaces.GPU(duration=120)
325
+ def get_3D_model_from_scene(outdir, silent, scene, min_conf_thr=3, as_pointcloud=False, mask_sky=False,
326
+ clean_depth=False, transparent_cams=False, cam_size=0.05, same_focals=False):
327
+ """
328
+ extract 3D_model (glb file) from a reconstructed scene
329
+ """
330
+ if scene is None:
331
+ return None
332
+ # post processes
333
+ if clean_depth:
334
+ scene = scene.clean_pointcloud()
335
+ if mask_sky:
336
+ scene = scene.mask_sky()
337
+
338
+ # get optimized values from scene
339
+ rgbimg = to_numpy(scene.imgs)
340
+ focals = to_numpy(scene.get_focals().cpu())
341
+ # cams2world = to_numpy(scene.get_im_poses().cpu())
342
+ # TODO use the vis_poses
343
+ cams2world = scene.vis_poses
344
+
345
+ # 3D pointcloud from depthmap, poses and intrinsics
346
+ # pts3d = to_numpy(scene.get_pts3d())
347
+ # TODO use the vis_poses
348
+ pts3d = scene.vis_pts3d
349
+ scene.min_conf_thr = float(scene.conf_trf(torch.tensor(min_conf_thr)))
350
+ msk = to_numpy(scene.get_masks())
351
+
352
+ return _convert_scene_output_to_glb(outdir, rgbimg, pts3d, msk, focals, cams2world, as_pointcloud=as_pointcloud,
353
+ transparent_cams=transparent_cams, cam_size=cam_size, silent=silent,
354
+ same_focals=same_focals)
355
+
356
+ @spaces.GPU(duration=120)
357
+ def get_reconstructed_scene(filelist, schedule, niter, min_conf_thr,
358
+ as_pointcloud, mask_sky, clean_depth, transparent_cams, cam_size,
359
+ scenegraph_type, winsize, refid, same_focals):
360
+ """
361
+ from a list of images, run dust3r inference, global aligner.
362
+ then run get_3D_model_from_scene
363
+ """
364
+ silent = False
365
+ image_size = 224
366
+ # remove the directory if it already exists
367
+ outdir = tmpdirname
368
+ if os.path.exists(outdir):
369
+ shutil.rmtree(outdir)
370
+ os.makedirs(outdir, exist_ok=True)
371
+ imgs, imgs_rgba = load_images(filelist, size=image_size, verbose=not silent, do_remove_background=True, rembg_session=rembg_session, predictor=predictor)
372
+ if len(imgs) == 1:
373
+ imgs = [imgs[0], copy.deepcopy(imgs[0])]
374
+ imgs[1]['idx'] = 1
375
+ if scenegraph_type == "swin":
376
+ scenegraph_type = scenegraph_type + "-" + str(winsize)
377
+ elif scenegraph_type == "oneref":
378
+ scenegraph_type = scenegraph_type + "-" + str(refid)
379
+
380
+ pairs = make_pairs(imgs, scene_graph=scenegraph_type, prefilter=None, symmetrize=True)
381
+ output = inference(pairs, model, device, batch_size=1, verbose=not silent)
382
+
383
+ mode = GlobalAlignerMode.PointCloudOptimizer if len(imgs) > 2 else GlobalAlignerMode.PairViewer
384
+ scene = global_aligner(output, device=device, mode=mode, verbose=not silent, same_focals=same_focals)
385
+ lr = 0.01
386
+
387
+ if mode == GlobalAlignerMode.PointCloudOptimizer:
388
+ loss = scene.compute_global_alignment(init='mst', niter=niter, schedule=schedule, lr=lr)
389
+
390
+ # outfile = get_3D_model_from_scene(outdir, silent, scene, min_conf_thr, as_pointcloud, mask_sky,
391
+ # clean_depth, transparent_cams, cam_size, same_focals=same_focals)
392
+
393
+ # also return rgb, depth and confidence imgs
394
+ # depth is normalized with the max value for all images
395
+ # we apply the jet colormap on the confidence maps
396
+ rgbimg = scene.imgs
397
+ # depths = to_numpy(scene.get_depthmaps())
398
+ # confs = to_numpy([c for c in scene.im_conf])
399
+ # cmap = pl.get_cmap('jet')
400
+ # depths_max = max([d.max() for d in depths])
401
+ # depths = [d / depths_max for d in depths]
402
+ # confs_max = max([d.max() for d in confs])
403
+ # confs = [cmap(d / confs_max) for d in confs]
404
+
405
+ imgs = []
406
+ rgbaimg = []
407
+ for i in range(len(rgbimg)): # when only 1 image, scene.imgs is two
408
+ imgs.append(rgbimg[i])
409
+ # imgs.append(rgb(depths[i]))
410
+ # imgs.append(rgb(confs[i]))
411
+ # imgs.append(imgs_rgba[i])
412
+ if len(imgs_rgba) == 1 and i == 1:
413
+ imgs.append(imgs_rgba[0])
414
+ rgbaimg.append(np.array(imgs_rgba[0]))
415
+ else:
416
+ imgs.append(imgs_rgba[i])
417
+ rgbaimg.append(np.array(imgs_rgba[i]))
418
+
419
+ rgbaimg = np.array(rgbaimg)
420
+
421
+ # for eschernet
422
+ # get optimized values from scene
423
+ rgbimg = to_numpy(scene.imgs)
424
+ # focals = to_numpy(scene.get_focals().cpu())
425
+ cams2world = to_numpy(scene.get_im_poses().cpu())
426
+
427
+ # 3D pointcloud from depthmap, poses and intrinsics
428
+ pts3d = to_numpy(scene.get_pts3d())
429
+ scene.min_conf_thr = float(scene.conf_trf(torch.tensor(min_conf_thr)))
430
+ msk = to_numpy(scene.get_masks())
431
+ obj_mask = rgbaimg[..., 3] > 0
432
+
433
+ # TODO set global coordinate system at the center of the scene, z-axis is up
434
+ pts = np.concatenate([p[m] for p, m in zip(pts3d, msk)]).reshape(-1, 3)
435
+ pts_obj = np.concatenate([p[m&obj_m] for p, m, obj_m in zip(pts3d, msk, obj_mask)]).reshape(-1, 3)
436
+ centroid = np.mean(pts_obj, axis=0) # obj center
437
+ obj2world = np.eye(4)
438
+ obj2world[:3, 3] = -centroid # T_wc
439
+
440
+ # get z_up vector
441
+ # TODO fit a plane and get the normal vector
442
+ pcd = o3d.geometry.PointCloud()
443
+ pcd.points = o3d.utility.Vector3dVector(pts)
444
+ plane_model, inliers = pcd.segment_plane(distance_threshold=0.01, ransac_n=3, num_iterations=1000)
445
+ # get the normalised normal vector dim = 3
446
+ normal = plane_model[:3] / np.linalg.norm(plane_model[:3])
447
+ # the normal direction should be pointing up
448
+ if normal[1] < 0:
449
+ normal = -normal
450
+ # print("normal", normal)
451
+
452
+ # # TODO z-up 180
453
+ # z_up = np.array([[1,0,0,0],
454
+ # [0,-1,0,0],
455
+ # [0,0,-1,0],
456
+ # [0,0,0,1]])
457
+ # obj2world = z_up @ obj2world
458
+
459
+ # # avg the y
460
+ # z_up_avg = cams2world[:,:3,3].sum(0) / np.linalg.norm(cams2world[:,:3,3].sum(0), axis=-1) # average direction in cam coordinate
461
+ # # import pdb; pdb.set_trace()
462
+ # rot_axis = np.cross(np.array([0, 0, 1]), z_up_avg)
463
+ # rot_angle = np.arccos(np.dot(np.array([0, 0, 1]), z_up_avg) / (np.linalg.norm(z_up_avg) + 1e-6))
464
+ # rot = Rotation.from_rotvec(rot_angle * rot_axis)
465
+ # z_up = np.eye(4)
466
+ # z_up[:3, :3] = rot.as_matrix()
467
+
468
+ # get the rotation matrix from normal to z-axis
469
+ z_axis = np.array([0, 0, 1])
470
+ rot_axis = np.cross(normal, z_axis)
471
+ rot_angle = np.arccos(np.dot(normal, z_axis) / (np.linalg.norm(normal) + 1e-6))
472
+ rot = Rotation.from_rotvec(rot_angle * rot_axis)
473
+ z_up = np.eye(4)
474
+ z_up[:3, :3] = rot.as_matrix()
475
+ obj2world = z_up @ obj2world
476
+ # flip 180
477
+ flip_rot = np.array([[1, 0, 0, 0],
478
+ [0, -1, 0, 0],
479
+ [0, 0, -1, 0],
480
+ [0, 0, 0, 1]])
481
+ obj2world = flip_rot @ obj2world
482
+
483
+ # get new cams2obj
484
+ cams2obj = []
485
+ for i, cam2world in enumerate(cams2world):
486
+ cams2obj.append(obj2world @ cam2world)
487
+ # TODO transform pts3d to the new coordinate system
488
+ for i, pts in enumerate(pts3d):
489
+ pts3d[i] = (obj2world @ np.concatenate([pts, np.ones_like(pts)[..., :1]], axis=-1).transpose(2, 0, 1).reshape(4,
490
+ -1)) \
491
+ .reshape(4, pts.shape[0], pts.shape[1]).transpose(1, 2, 0)[..., :3]
492
+ cams2world = np.array(cams2obj)
493
+ # TODO rewrite hack
494
+ scene.vis_poses = cams2world.copy()
495
+ scene.vis_pts3d = pts3d.copy()
496
+
497
+ # TODO save cams2world and rgbimg to each file, file name "000.npy", "001.npy", ... and "000.png", "001.png", ...
498
+ for i, (img, img_rgba, pose) in enumerate(zip(rgbimg, rgbaimg, cams2world)):
499
+ np.save(os.path.join(outdir, f"{i:03d}.npy"), pose)
500
+ pl.imsave(os.path.join(outdir, f"{i:03d}.png"), img)
501
+ pl.imsave(os.path.join(outdir, f"{i:03d}_rgba.png"), img_rgba)
502
+ # np.save(os.path.join(outdir, f"{i:03d}_focal.npy"), to_numpy(focal))
503
+ # save the min/max radius of camera
504
+ radii = np.linalg.norm(np.linalg.inv(cams2world)[..., :3, 3])
505
+ np.save(os.path.join(outdir, "radii.npy"), radii)
506
+
507
+ eschernet_input = {"poses": cams2world,
508
+ "radii": radii,
509
+ "imgs": rgbaimg}
510
+ print("got eschernet input")
511
+ outfile = get_3D_model_from_scene(outdir, silent, scene, min_conf_thr, as_pointcloud, mask_sky,
512
+ clean_depth, transparent_cams, cam_size, same_focals=same_focals)
513
+
514
+ return scene, outfile, imgs, eschernet_input
515
+
516
+
517
+ def set_scenegraph_options(inputfiles, winsize, refid, scenegraph_type):
518
+ num_files = len(inputfiles) if inputfiles is not None else 1
519
+ max_winsize = max(1, math.ceil((num_files - 1) / 2))
520
+ if scenegraph_type == "swin":
521
+ winsize = gr.Slider(label="Scene Graph: Window Size", value=max_winsize,
522
+ minimum=1, maximum=max_winsize, step=1, visible=True)
523
+ refid = gr.Slider(label="Scene Graph: Id", value=0, minimum=0,
524
+ maximum=num_files - 1, step=1, visible=False)
525
+ elif scenegraph_type == "oneref":
526
+ winsize = gr.Slider(label="Scene Graph: Window Size", value=max_winsize,
527
+ minimum=1, maximum=max_winsize, step=1, visible=False)
528
+ refid = gr.Slider(label="Scene Graph: Id", value=0, minimum=0,
529
+ maximum=num_files - 1, step=1, visible=True)
530
+ else:
531
+ winsize = gr.Slider(label="Scene Graph: Window Size", value=max_winsize,
532
+ minimum=1, maximum=max_winsize, step=1, visible=False)
533
+ refid = gr.Slider(label="Scene Graph: Id", value=0, minimum=0,
534
+ maximum=num_files - 1, step=1, visible=False)
535
+ return winsize, refid
536
+
537
+
538
+ def get_examples(path):
539
+ objs = []
540
+ for obj_name in sorted(os.listdir(path)):
541
+ img_files = []
542
+ for img_file in sorted(os.listdir(os.path.join(path, obj_name))):
543
+ img_files.append(os.path.join(path, obj_name, img_file))
544
+ objs.append([img_files])
545
+ print("objs = ", objs)
546
+ return objs
547
+
548
+ def preview_input(inputfiles):
549
+ if inputfiles is None:
550
+ return None
551
+ imgs = []
552
+ for img_file in inputfiles:
553
+ img = pl.imread(img_file)
554
+ imgs.append(img)
555
+ return imgs
556
+
557
+ # def main():
558
+ # dust3r init
559
+ silent = False
560
+ image_size = 224
561
+ weights_path = 'checkpoints/DUSt3R_ViTLarge_BaseDecoder_224_linear.pth'
562
+ model = AsymmetricCroCo3DStereo.from_pretrained(weights_path).to(device)
563
+ # dust3r will write the 3D model inside tmpdirname
564
+ # with tempfile.TemporaryDirectory(suffix='dust3r_gradio_demo') as tmpdirname:
565
+ tmpdirname = os.path.join('logs/user_object')
566
+ # remove the directory if it already exists
567
+ if os.path.exists(tmpdirname):
568
+ shutil.rmtree(tmpdirname)
569
+ os.makedirs(tmpdirname, exist_ok=True)
570
+ if not silent:
571
+ print('Outputting stuff in', tmpdirname)
572
+
573
+ _HEADER_ = '''
574
+ <h2><b>[CVPR'24 Oral] EscherNet: A Generative Model for Scalable View Synthesis</b></h2>
575
+ <b>EscherNet</b> is a multiview diffusion model for scalable generative any-to-any number/pose novel view synthesis.
576
+
577
+ Image views are treated as tokens and the camera pose is encoded by <b>CaPE (Camera Positional Encoding)</b>.
578
+
579
+ <a href='https://kxhit.github.io/EscherNet' target='_blank'>Project</a> <b>|</b>
580
+ <a href='https://github.com/kxhit/EscherNet' target='_blank'>GitHub</a> <b>|</b>
581
+ <a href='https://arxiv.org/abs/2402.03908' target='_blank'>ArXiv</a>
582
+
583
+ <h4><b>Tips:</b></h4>
584
+
585
+ - Our model can take <b>any number of input images</b>. The more images you provide <b>(>=3 for this demo)</b>, the better the results.
586
+
587
+ - Our model can generate novel views at <b>any number and any pose</b>. You can specify the number of views you want to generate. In this demo, we place the novel views on an <b>archimedean spiral</b> for simplicity.
588
+
589
+ - The pose estimation is done using <a href='https://github.com/naver/dust3r' target='_blank'>DUSt3R</a>. You can also provide your own poses or obtain them from any SLAM system.
590
+
591
+ - The current checkpoint supports 6DoF camera pose and is trained on 30k 3D <a href='https://objaverse.allenai.org/' target='_blank'>Objaverse</a> objects for demo. Scaling is on the roadmap!
592
+
593
+ '''
594
+
595
+ _CITE_ = r"""
596
+ 📝 <b>Citation</b>:
597
+ ```bibtex
598
+ @article{kong2024eschernet,
599
+ title={EscherNet: A Generative Model for Scalable View Synthesis},
600
+ author={Kong, Xin and Liu, Shikun and Lyu, Xiaoyang and Taher, Marwan and Qi, Xiaojuan and Davison, Andrew J},
601
+ journal={arXiv preprint arXiv:2402.03908},
602
+ year={2024}
603
+ }
604
+ ```
605
+ """
606
+
607
+ with gr.Blocks() as demo:
608
+ gr.Markdown(_HEADER_)
609
+ # mv_images = gr.State()
610
+ scene = gr.State(None)
611
+ eschernet_input = gr.State(None)
612
+ with gr.Row(variant="panel"):
613
+ # left column
614
+ with gr.Column():
615
+ with gr.Row():
616
+ input_image = gr.File(file_count="multiple")
617
+ with gr.Row():
618
+ run_dust3r = gr.Button("Get Pose!", elem_id="dust3r")
619
+ with gr.Row():
620
+ processed_image = gr.Gallery(label='Input Views', columns=2, height="100%")
621
+ with gr.Row(variant="panel"):
622
+ # input examples under "examples" folder
623
+ gr.Examples(
624
+ examples=get_examples('examples'),
625
+ inputs=[input_image],
626
+ label="Examples (click one set of images to start!)",
627
+ examples_per_page=20
628
+ )
629
+
630
+
631
+
632
+
633
+
634
+ # right column
635
+ with gr.Column():
636
+
637
+ with gr.Row():
638
+ outmodel = gr.Model3D()
639
+
640
+ with gr.Row():
641
+ gr.Markdown('''
642
+ <h4><b>Check that the pose (the blue axis is the estimated z-up direction) and the segmentation look correct. If not, remove the incorrect images and try again.</b></h4>
643
+ ''')
644
+
645
+ with gr.Row():
646
+ with gr.Group():
647
+ do_remove_background = gr.Checkbox(
648
+ label="Remove Background", value=True
649
+ )
650
+ sample_seed = gr.Number(value=42, label="Seed Value", precision=0)
651
+
652
+ sample_steps = gr.Slider(
653
+ label="Sample Steps",
654
+ minimum=30,
655
+ maximum=75,
656
+ value=50,
657
+ step=5,
658
+ visible=False
659
+ )
660
+
661
+ nvs_num = gr.Slider(
662
+ label="Number of Novel Views",
663
+ minimum=5,
664
+ maximum=100,
665
+ value=30,
666
+ step=1
667
+ )
668
+
669
+ nvs_mode = gr.Dropdown(["archimedes circle"], # "fixed 4 views", "fixed 8 views"
670
+ value="archimedes circle", label="Novel Views Pose Chosen", visible=True)
671
+
672
+ with gr.Row():
673
+ gr.Markdown('''
674
+ <h4><b>Choose the number of novel view poses you want and generate! The more output images, the longer it takes.</b></h4>
675
+ ''')
676
+
677
+ with gr.Row():
678
+ submit = gr.Button("Submit", elem_id="eschernet", variant="primary")
679
+
680
+ with gr.Row():
681
+ with gr.Column():
682
+ output_video = gr.Video(
683
+ label="video", format="mp4",
684
+ width=379,
685
+ autoplay=True,
686
+ interactive=False
687
+ )
688
+
689
+ with gr.Row():
690
+ gr.Markdown('''
691
+ <h4><b>The novel views are generated on an archimedean spiral (rotating around the z-up axis and looking at the object center). You can download the video.</b></h4>
692
+ ''')
693
+
694
+ gr.Markdown(_CITE_)
695
+
696
+ # set dust3r parameter invisible to be clean
697
+ with gr.Column():
698
+ with gr.Row():
699
+ schedule = gr.Dropdown(["linear", "cosine"],
700
+ value='linear', label="schedule", info="For global alignment!", visible=False)
701
+ niter = gr.Number(value=300, precision=0, minimum=0, maximum=5000,
702
+ label="num_iterations", info="For global alignment!", visible=False)
703
+ scenegraph_type = gr.Dropdown(["complete", "swin", "oneref"],
704
+ value='complete', label="Scenegraph",
705
+ info="Define how to make pairs",
706
+ interactive=True, visible=False)
707
+ same_focals = gr.Checkbox(value=True, label="Focal", info="Use the same focal for all cameras", visible=False)
708
+ winsize = gr.Slider(label="Scene Graph: Window Size", value=1,
709
+ minimum=1, maximum=1, step=1, visible=False)
710
+ refid = gr.Slider(label="Scene Graph: Id", value=0, minimum=0, maximum=0, step=1, visible=False)
711
+
712
+ with gr.Row():
713
+ # adjust the confidence threshold
714
+ min_conf_thr = gr.Slider(label="min_conf_thr", value=3.0, minimum=1.0, maximum=20, step=0.1, visible=False)
715
+ # adjust the camera size in the output pointcloud
716
+ cam_size = gr.Slider(label="cam_size", value=0.05, minimum=0.01, maximum=0.5, step=0.001, visible=False)
717
+ with gr.Row():
718
+ as_pointcloud = gr.Checkbox(value=False, label="As pointcloud", visible=False)
719
+ # two post process implemented
720
+ mask_sky = gr.Checkbox(value=False, label="Mask sky", visible=False)
721
+ clean_depth = gr.Checkbox(value=True, label="Clean-up depthmaps", visible=False)
722
+ transparent_cams = gr.Checkbox(value=False, label="Transparent cameras", visible=False)
723
+
724
+ # events
725
+ # scenegraph_type.change(set_scenegraph_options,
726
+ # inputs=[input_image, winsize, refid, scenegraph_type],
727
+ # outputs=[winsize, refid])
728
+ # min_conf_thr.release(fn=model_from_scene_fun,
729
+ # inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
730
+ # clean_depth, transparent_cams, cam_size, same_focals],
731
+ # outputs=outmodel)
732
+ # cam_size.change(fn=model_from_scene_fun,
733
+ # inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
734
+ # clean_depth, transparent_cams, cam_size, same_focals],
735
+ # outputs=outmodel)
736
+ # as_pointcloud.change(fn=model_from_scene_fun,
737
+ # inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
738
+ # clean_depth, transparent_cams, cam_size, same_focals],
739
+ # outputs=outmodel)
740
+ # mask_sky.change(fn=model_from_scene_fun,
741
+ # inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
742
+ # clean_depth, transparent_cams, cam_size, same_focals],
743
+ # outputs=outmodel)
744
+ # clean_depth.change(fn=model_from_scene_fun,
745
+ # inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
746
+ # clean_depth, transparent_cams, cam_size, same_focals],
747
+ # outputs=outmodel)
748
+ # transparent_cams.change(model_from_scene_fun,
749
+ # inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
750
+ # clean_depth, transparent_cams, cam_size, same_focals],
751
+ # outputs=outmodel)
752
+ # run_dust3r.click(fn=recon_fun,
753
+ # inputs=[input_image, schedule, niter, min_conf_thr, as_pointcloud,
754
+ # mask_sky, clean_depth, transparent_cams, cam_size,
755
+ # scenegraph_type, winsize, refid, same_focals],
756
+ # outputs=[scene, outmodel, processed_image, eschernet_input])
757
+
758
+ # events
759
+ input_image.change(set_scenegraph_options,
760
+ inputs=[input_image, winsize, refid, scenegraph_type],
761
+ outputs=[winsize, refid])
762
+ run_dust3r.click(fn=get_reconstructed_scene,
763
+ inputs=[input_image, schedule, niter, min_conf_thr, as_pointcloud,
764
+ mask_sky, clean_depth, transparent_cams, cam_size,
765
+ scenegraph_type, winsize, refid, same_focals],
766
+ outputs=[scene, outmodel, processed_image, eschernet_input])
767
+
768
+
769
+ # events
770
+ input_image.change(fn=preview_input,
771
+ inputs=[input_image],
772
+ outputs=[processed_image])
773
+
774
+ submit.click(fn=run_eschernet,
775
+ inputs=[eschernet_input, sample_steps, sample_seed,
776
+ nvs_num, nvs_mode],
777
+ outputs=[output_video])
778
+
779
+
780
+
781
+ # demo.queue(max_size=10)
782
+ # demo.launch(share=True, server_name="0.0.0.0", server_port=None)
783
+ demo.queue(max_size=10).launch()
784
+
785
+ # if __name__ == '__main__':
786
+ # main()
mini_dust3r/__init__.py ADDED
File without changes
mini_dust3r/api/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from .inference import inferece_dust3r, OptimizedResult, log_optimized_result
2
+
3
+ __all__ = ["inferece_dust3r", "OptimizedResult", "log_optimized_result"]
mini_dust3r/api/inference.py ADDED
@@ -0,0 +1,225 @@
1
+ import rerun as rr
2
+ from pathlib import Path
3
+ from typing import Literal
4
+ import copy
5
+ import torch
6
+ import numpy as np
7
+ from jaxtyping import Float32, Bool
8
+ import trimesh
9
+ from tqdm import tqdm
10
+
11
+ from mini_dust3r.utils.image import load_images, ImageDict
12
+ from mini_dust3r.inference import inference, Dust3rResult
13
+ from mini_dust3r.model import AsymmetricCroCo3DStereo
14
+ from mini_dust3r.image_pairs import make_pairs
15
+ from mini_dust3r.cloud_opt import global_aligner, GlobalAlignerMode
16
+ from mini_dust3r.cloud_opt.base_opt import BasePCOptimizer
17
+ from mini_dust3r.viz import pts3d_to_trimesh, cat_meshes
18
+ from dataclasses import dataclass
19
+
20
+
21
+ @dataclass
22
+ class OptimizedResult:
23
+ K_b33: Float32[np.ndarray, "b 3 3"]
24
+ world_T_cam_b44: Float32[np.ndarray, "b 4 4"]
25
+ rgb_hw3_list: list[Float32[np.ndarray, "h w 3"]]
26
+ depth_hw_list: list[Float32[np.ndarray, "h w"]]
27
+ conf_hw_list: list[Float32[np.ndarray, "h w"]]
28
+ masks_list: Bool[np.ndarray, "h w"]
29
+ point_cloud: trimesh.PointCloud
30
+ mesh: trimesh.Trimesh
31
+
32
+
33
+ def log_optimized_result(
34
+ optimized_result: OptimizedResult, parent_log_path: Path
35
+ ) -> None:
36
+ rr.log(f"{parent_log_path}", rr.ViewCoordinates.RDF, timeless=True)
37
+ # log pointcloud
38
+ rr.log(
39
+ f"{parent_log_path}/pointcloud",
40
+ rr.Points3D(
41
+ positions=optimized_result.point_cloud.vertices,
42
+ colors=optimized_result.point_cloud.colors,
43
+ ),
44
+ timeless=True,
45
+ )
46
+
47
+ mesh = optimized_result.mesh
48
+ rr.log(
49
+ f"{parent_log_path}/mesh",
50
+ rr.Mesh3D(
51
+ vertex_positions=mesh.vertices,
52
+ vertex_colors=mesh.visual.vertex_colors,
53
+ indices=mesh.faces,
54
+ ),
55
+ timeless=True,
56
+ )
57
+ pbar = tqdm(
58
+ zip(
59
+ optimized_result.rgb_hw3_list,
60
+ optimized_result.depth_hw_list,
61
+ optimized_result.K_b33,
62
+ optimized_result.world_T_cam_b44,
63
+ ),
64
+ total=len(optimized_result.rgb_hw3_list),
65
+ )
66
+ for i, (rgb_hw3, depth_hw, k_33, world_T_cam_44) in enumerate(pbar):
67
+ camera_log_path = f"{parent_log_path}/camera_{i}"
68
+ height, width, _ = rgb_hw3.shape
69
+ rr.log(
70
+ f"{camera_log_path}",
71
+ rr.Transform3D(
72
+ translation=world_T_cam_44[:3, 3],
73
+ mat3x3=world_T_cam_44[:3, :3],
74
+ from_parent=False,
75
+ ),
76
+ )
77
+ rr.log(
78
+ f"{camera_log_path}/pinhole",
79
+ rr.Pinhole(
80
+ image_from_camera=k_33,
81
+ height=height,
82
+ width=width,
83
+ camera_xyz=rr.ViewCoordinates.RDF,
84
+ ),
85
+ )
86
+ rr.log(
87
+ f"{camera_log_path}/pinhole/rgb",
88
+ rr.Image(rgb_hw3),
89
+ )
90
+ rr.log(
91
+ f"{camera_log_path}/pinhole/depth",
92
+ rr.DepthImage(depth_hw),
93
+ )
94
+
95
+
96
+ def scene_to_results(scene: BasePCOptimizer, min_conf_thr: int) -> OptimizedResult:
97
+ ### get camera parameters K and T
98
+ K_b33: Float32[np.ndarray, "b 3 3"] = scene.get_intrinsics().numpy(force=True)
99
+ world_T_cam_b44: Float32[np.ndarray, "b 4 4"] = scene.get_im_poses().numpy(
100
+ force=True
101
+ )
102
+ ### image, confidence, depths
103
+ rgb_hw3_list: list[Float32[np.ndarray, "h w 3"]] = scene.imgs
104
+ depth_hw_list: list[Float32[np.ndarray, "h w"]] = [
105
+ depth.numpy(force=True) for depth in scene.get_depthmaps()
106
+ ]
107
+ # normalized depth
108
+ # depth_hw_list = [depth_hw / depth_hw.max() for depth_hw in depth_hw_list]
109
+
110
+ conf_hw_list: list[Float32[np.ndarray, "h w"]] = [
111
+ c.numpy(force=True) for c in scene.im_conf
112
+ ]
113
+ # normalize confidence
114
+ # conf_hw_list = [conf_hw / conf_hw.max() for conf_hw in conf_hw_list]
115
+
116
+ # point cloud, mesh
117
+ pts3d_list: list[Float32[np.ndarray, "h w 3"]] = [
118
+ pt3d.numpy(force=True) for pt3d in scene.get_pts3d()
119
+ ]
120
+ # get log confidence
121
+ log_conf_trf: Float32[torch.Tensor, ""] = scene.conf_trf(torch.tensor(min_conf_thr))
122
+ # set the minimum confidence threshold
123
+ scene.min_conf_thr = float(log_conf_trf)
124
+ masks_list: Bool[np.ndarray, "h w"] = [
125
+ mask.numpy(force=True) for mask in scene.get_masks()
126
+ ]
127
+
128
+ point_cloud: Float32[np.ndarray, "num_points 3"] = np.concatenate(
129
+ [p[m] for p, m in zip(pts3d_list, masks_list)]
130
+ )
131
+ colors: Float32[np.ndarray, "num_points 3"] = np.concatenate(
132
+ [p[m] for p, m in zip(rgb_hw3_list, masks_list)]
133
+ )
134
+ point_cloud = trimesh.PointCloud(
135
+ point_cloud.reshape(-1, 3), colors=colors.reshape(-1, 3)
136
+ )
137
+
138
+ meshes = []
139
+ pbar = tqdm(zip(rgb_hw3_list, pts3d_list, masks_list), total=len(rgb_hw3_list))
140
+ for rgb_hw3, pts3d, mask in pbar:
141
+ meshes.append(pts3d_to_trimesh(rgb_hw3, pts3d, mask))
142
+
143
+ mesh = trimesh.Trimesh(**cat_meshes(meshes))
144
+ optimised_result = OptimizedResult(
145
+ K_b33=K_b33,
146
+ world_T_cam_b44=world_T_cam_b44,
147
+ rgb_hw3_list=rgb_hw3_list,
148
+ depth_hw_list=depth_hw_list,
149
+ conf_hw_list=conf_hw_list,
150
+ masks_list=masks_list,
151
+ point_cloud=point_cloud,
152
+ mesh=mesh,
153
+ )
154
+ return optimised_result
155
+
156
+
157
+ def inferece_dust3r(
158
+ image_dir_or_list: Path | list[Path],
159
+ model: AsymmetricCroCo3DStereo,
160
+ device: Literal["cpu", "cuda", "mps"],
161
+ batch_size: int = 1,
162
+ image_size: Literal[224, 512] = 512,
163
+ niter: int = 100,
164
+ schedule: Literal["linear", "cosine"] = "linear",
165
+ min_conf_thr: float = 10,
166
+ ) -> OptimizedResult:
167
+ """
168
+ Perform inference using the Dust3r algorithm.
169
+
170
+ Args:
171
+ image_dir_or_list (Union[Path, List[Path]]): Path to the directory containing images or a list of image paths.
172
+ model (AsymmetricCroCo3DStereo): The Dust3r model to use for inference.
173
+ device (Literal["cpu", "cuda", "mps"]): The device to use for inference ("cpu", "cuda", or "mps").
174
+ batch_size (int, optional): The batch size for inference. Defaults to 1.
175
+ image_size (Literal[224, 512], optional): The size of the input images. Defaults to 512.
176
+ niter (int, optional): The number of iterations for the global alignment optimization. Defaults to 100.
177
+ schedule (Literal["linear", "cosine"], optional): The learning rate schedule for the global alignment optimization. Defaults to "linear".
178
+ min_conf_thr (float, optional): The minimum confidence threshold for the optimized result. Defaults to 10.
179
+
180
+ Returns:
181
+ OptimizedResult: The optimized result containing the RGB, depth, and confidence images.
182
+
183
+ Raises:
184
+ ValueError: If `image_dir_or_list` is neither a list of paths nor a path.
185
+ """
186
+ if isinstance(image_dir_or_list, list):
187
+ imgs: list[ImageDict] = load_images(
188
+ folder_or_list=image_dir_or_list, size=image_size, verbose=True
189
+ )
190
+ elif isinstance(image_dir_or_list, Path):
191
+ imgs: list[ImageDict] = load_images(
192
+ folder_or_list=str(image_dir_or_list), size=image_size, verbose=True
193
+ )
194
+ else:
195
+ raise ValueError("image_dir_or_list should be a list of paths or a path")
196
+
197
+ # if only one image was loaded, duplicate it to feed into stereo network
198
+ if len(imgs) == 1:
199
+ imgs = [imgs[0], copy.deepcopy(imgs[0])]
200
+ imgs[1]["idx"] = 1
201
+
202
+ pairs: list[tuple[ImageDict, ImageDict]] = make_pairs(
203
+ imgs, scene_graph="complete", prefilter=None, symmetrize=True
204
+ )
205
+ output: Dust3rResult = inference(pairs, model, device, batch_size=batch_size)
206
+
207
+ mode = (
208
+ GlobalAlignerMode.PointCloudOptimizer
209
+ if len(imgs) > 2
210
+ else GlobalAlignerMode.PairViewer
211
+ )
212
+ scene: BasePCOptimizer = global_aligner(
213
+ dust3r_output=output, device=device, mode=mode
214
+ )
215
+
216
+ lr = 0.01
217
+
218
+ if mode == GlobalAlignerMode.PointCloudOptimizer:
219
+ loss = scene.compute_global_alignment(
220
+ init="mst", niter=niter, schedule=schedule, lr=lr
221
+ )
222
+
223
+ # get the optimized result from the scene
224
+ optimized_result: OptimizedResult = scene_to_results(scene, min_conf_thr)
225
+ return optimized_result
mini_dust3r/cloud_opt/__init__.py ADDED
@@ -0,0 +1,44 @@
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+ #
4
+ # --------------------------------------------------------
5
+ # global alignment optimization wrapper function
6
+ # --------------------------------------------------------
7
+ from enum import Enum
8
+
9
+ from .optimizer import PointCloudOptimizer
10
+ from .modular_optimizer import ModularPointCloudOptimizer
11
+ from .pair_viewer import PairViewer
12
+ from mini_dust3r.inference import Dust3rResult
13
+ from typing import Literal
14
+
15
+
16
+ class GlobalAlignerMode(Enum):
17
+ PointCloudOptimizer = "PointCloudOptimizer"
18
+ ModularPointCloudOptimizer = "ModularPointCloudOptimizer"
19
+ PairViewer = "PairViewer"
20
+
21
+
22
+ def global_aligner(
23
+ dust3r_output: Dust3rResult,
24
+ device: Literal["cpu", "cuda", "mps"],
25
+ mode: GlobalAlignerMode = GlobalAlignerMode.PointCloudOptimizer,
26
+ **optim_kw,
27
+ ):
28
+ # extract all inputs
29
+ view1, view2, pred1, pred2 = [
30
+ dust3r_output[k] for k in "view1 view2 pred1 pred2".split()
31
+ ]
32
+ # build the optimizer
33
+ if mode == GlobalAlignerMode.PointCloudOptimizer:
34
+ net = PointCloudOptimizer(view1, view2, pred1, pred2, **optim_kw).to(device)
35
+ elif mode == GlobalAlignerMode.ModularPointCloudOptimizer:
36
+ net = ModularPointCloudOptimizer(view1, view2, pred1, pred2, **optim_kw).to(
37
+ device
38
+ )
39
+ elif mode == GlobalAlignerMode.PairViewer:
40
+ net = PairViewer(view1, view2, pred1, pred2, **optim_kw).to(device)
41
+ else:
42
+ raise NotImplementedError(f"Unknown mode {mode}")
43
+
44
+ return net
mini_dust3r/cloud_opt/base_opt.py ADDED
@@ -0,0 +1,390 @@
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+ #
4
+ # --------------------------------------------------------
5
+ # Base class for the global alignment procedure
6
+ # --------------------------------------------------------
7
+ from copy import deepcopy
8
+
9
+ import numpy as np
10
+ import torch
11
+ import torch.nn as nn
12
+ import roma
13
+ from copy import deepcopy
14
+ import tqdm
15
+
16
+ from mini_dust3r.utils.geometry import inv, geotrf
17
+ from mini_dust3r.utils.device import to_numpy
18
+ from mini_dust3r.utils.image import rgb
19
+ from mini_dust3r.viz import SceneViz, segment_sky, auto_cam_size
20
+ from mini_dust3r.optim_factory import adjust_learning_rate_by_lr
21
+
22
+ from mini_dust3r.cloud_opt.commons import (edge_str, ALL_DISTS, NoGradParamDict, get_imshapes, signed_expm1, signed_log1p,
23
+ cosine_schedule, linear_schedule, get_conf_trf)
24
+ import mini_dust3r.cloud_opt.init_im_poses as init_fun
25
+
26
+
27
+ class BasePCOptimizer (nn.Module):
28
+ """ Optimize a global scene, given a list of pairwise observations.
29
+ Graph node: images
30
+ Graph edges: observations = (pred1, pred2)
31
+ """
32
+
33
+ def __init__(self, *args, **kwargs):
34
+ if len(args) == 1 and len(kwargs) == 0:
35
+ other = deepcopy(args[0])
36
+ attrs = '''edges is_symmetrized dist n_imgs pred_i pred_j imshapes
37
+ min_conf_thr conf_thr conf_i conf_j im_conf
38
+ base_scale norm_pw_scale POSE_DIM pw_poses
39
+ pw_adaptors pw_adaptors has_im_poses rand_pose imgs verbose'''.split()
40
+ self.__dict__.update({k: other[k] for k in attrs})
41
+ else:
42
+ self._init_from_views(*args, **kwargs)
43
+
44
+ def _init_from_views(self, view1, view2, pred1, pred2,
45
+ dist='l1',
46
+ conf='log',
47
+ min_conf_thr=3,
48
+ base_scale=0.5,
49
+ allow_pw_adaptors=False,
50
+ pw_break=20,
51
+ rand_pose=torch.randn,
52
+ iterationsCount=None,
53
+ verbose=True):
54
+ super().__init__()
55
+ if not isinstance(view1['idx'], list):
56
+ view1['idx'] = view1['idx'].tolist()
57
+ if not isinstance(view2['idx'], list):
58
+ view2['idx'] = view2['idx'].tolist()
59
+ self.edges = [(int(i), int(j)) for i, j in zip(view1['idx'], view2['idx'])]
60
+ self.is_symmetrized = set(self.edges) == {(j, i) for i, j in self.edges}
61
+ self.dist = ALL_DISTS[dist]
62
+ self.verbose = verbose
63
+
64
+ self.n_imgs = self._check_edges()
65
+
66
+ # input data
67
+ pred1_pts = pred1['pts3d']
68
+ pred2_pts = pred2['pts3d_in_other_view']
69
+ self.pred_i = NoGradParamDict({ij: pred1_pts[n] for n, ij in enumerate(self.str_edges)})
70
+ self.pred_j = NoGradParamDict({ij: pred2_pts[n] for n, ij in enumerate(self.str_edges)})
71
+ self.imshapes = get_imshapes(self.edges, pred1_pts, pred2_pts)
72
+
73
+ # work in log-scale with conf
74
+ pred1_conf = pred1['conf']
75
+ pred2_conf = pred2['conf']
76
+ self.min_conf_thr = min_conf_thr
77
+ self.conf_trf = get_conf_trf(conf)
78
+
79
+ self.conf_i = NoGradParamDict({ij: pred1_conf[n] for n, ij in enumerate(self.str_edges)})
80
+ self.conf_j = NoGradParamDict({ij: pred2_conf[n] for n, ij in enumerate(self.str_edges)})
81
+ self.im_conf = self._compute_img_conf(pred1_conf, pred2_conf)
82
+
83
+ # pairwise pose parameters
84
+ self.base_scale = base_scale
85
+ self.norm_pw_scale = True
86
+ self.pw_break = pw_break
87
+ self.POSE_DIM = 7
88
+ self.pw_poses = nn.Parameter(rand_pose((self.n_edges, 1+self.POSE_DIM))) # pairwise poses
89
+ self.pw_adaptors = nn.Parameter(torch.zeros((self.n_edges, 2))) # slight xy/z adaptation
90
+ self.pw_adaptors.requires_grad_(allow_pw_adaptors)
91
+ self.has_im_poses = False
92
+ self.rand_pose = rand_pose
93
+
94
+ # possibly store images for show_pointcloud
95
+ self.imgs = None
96
+ if 'img' in view1 and 'img' in view2:
97
+ imgs = [torch.zeros((3,)+hw) for hw in self.imshapes]
98
+ for v in range(len(self.edges)):
99
+ idx = view1['idx'][v]
100
+ imgs[idx] = view1['img'][v]
101
+ idx = view2['idx'][v]
102
+ imgs[idx] = view2['img'][v]
103
+ self.imgs = rgb(imgs)
104
+
105
+ @property
106
+ def n_edges(self):
107
+ return len(self.edges)
108
+
109
+ @property
110
+ def str_edges(self):
111
+ return [edge_str(i, j) for i, j in self.edges]
112
+
113
+ @property
114
+ def imsizes(self):
115
+ return [(w, h) for h, w in self.imshapes]
116
+
117
+ @property
118
+ def device(self):
119
+ return next(iter(self.parameters())).device
120
+
121
+ def state_dict(self, trainable=True):
122
+ all_params = super().state_dict()
123
+ return {k: v for k, v in all_params.items() if k.startswith(('_', 'pred_i.', 'pred_j.', 'conf_i.', 'conf_j.')) != trainable}
124
+
125
+ def load_state_dict(self, data):
126
+ return super().load_state_dict(self.state_dict(trainable=False) | data)
127
+
128
+ def _check_edges(self):
129
+ indices = sorted({i for edge in self.edges for i in edge})
130
+ assert indices == list(range(len(indices))), 'bad pair indices: missing values '
131
+ return len(indices)
132
+
133
+ @torch.no_grad()
134
+ def _compute_img_conf(self, pred1_conf, pred2_conf):
135
+ im_conf = nn.ParameterList([torch.zeros(hw, device=self.device) for hw in self.imshapes])
136
+ for e, (i, j) in enumerate(self.edges):
137
+ im_conf[i] = torch.maximum(im_conf[i], pred1_conf[e])
138
+ im_conf[j] = torch.maximum(im_conf[j], pred2_conf[e])
139
+ return im_conf
140
+
141
+ def get_adaptors(self):
142
+ adapt = self.pw_adaptors
143
+ adapt = torch.cat((adapt[:, 0:1], adapt), dim=-1) # (scale_xy, scale_xy, scale_z)
144
+ if self.norm_pw_scale: # normalize so that the product == 1
145
+ adapt = adapt - adapt.mean(dim=1, keepdim=True)
146
+ return (adapt / self.pw_break).exp()
147
+
148
+ def _get_poses(self, poses):
149
+ # normalize rotation
150
+ Q = poses[:, :4]
151
+ T = signed_expm1(poses[:, 4:7])
152
+ RT = roma.RigidUnitQuat(Q, T).normalize().to_homogeneous()
153
+ return RT
154
+
155
+ def _set_pose(self, poses, idx, R, T=None, scale=None, force=False):
156
+ # all poses == cam-to-world
157
+ pose = poses[idx]
158
+ if not (pose.requires_grad or force):
159
+ return pose
160
+
161
+ if R.shape == (4, 4):
162
+ assert T is None
163
+ T = R[:3, 3]
164
+ R = R[:3, :3]
165
+
166
+ if R is not None:
167
+ pose.data[0:4] = roma.rotmat_to_unitquat(R)
168
+ if T is not None:
169
+ pose.data[4:7] = signed_log1p(T / (scale or 1)) # translation is function of scale
170
+
171
+ if scale is not None:
172
+ assert poses.shape[-1] in (8, 13)
173
+ pose.data[-1] = np.log(float(scale))
174
+ return pose
175
+
176
+ def get_pw_norm_scale_factor(self):
177
+ if self.norm_pw_scale:
178
+ # normalize scales so that things cannot go south
179
+ # we want that exp(scale) ~= self.base_scale
180
+ return (np.log(self.base_scale) - self.pw_poses[:, -1].mean()).exp()
181
+ else:
182
+ return 1 # don't norm scale for known poses
183
+
184
+ def get_pw_scale(self):
185
+ scale = self.pw_poses[:, -1].exp() # (n_edges,)
186
+ scale = scale * self.get_pw_norm_scale_factor()
187
+ return scale
188
+
189
+ def get_pw_poses(self): # cam to world
190
+ RT = self._get_poses(self.pw_poses)
191
+ scaled_RT = RT.clone()
192
+ scaled_RT[:, :3] *= self.get_pw_scale().view(-1, 1, 1) # scale the rotation AND translation
193
+ return scaled_RT
194
+
195
+ def get_masks(self):
196
+ return [(conf > self.min_conf_thr) for conf in self.im_conf]
197
+
198
+ def depth_to_pts3d(self):
199
+ raise NotImplementedError()
200
+
201
+ def get_pts3d(self, raw=False):
202
+ res = self.depth_to_pts3d()
203
+ if not raw:
204
+ res = [dm[:h*w].view(h, w, 3) for dm, (h, w) in zip(res, self.imshapes)]
205
+ return res
206
+
207
+ def _set_focal(self, idx, focal, force=False):
208
+ raise NotImplementedError()
209
+
210
+ def get_focals(self):
211
+ raise NotImplementedError()
212
+
213
+ def get_known_focal_mask(self):
214
+ raise NotImplementedError()
215
+
216
+ def get_principal_points(self):
217
+ raise NotImplementedError()
218
+
219
+ def get_conf(self, mode=None):
220
+ trf = self.conf_trf if mode is None else get_conf_trf(mode)
221
+ return [trf(c) for c in self.im_conf]
222
+
223
+ def get_im_poses(self):
224
+ raise NotImplementedError()
225
+
226
+ def _set_depthmap(self, idx, depth, force=False):
227
+ raise NotImplementedError()
228
+
229
+ def get_depthmaps(self, raw=False):
230
+ raise NotImplementedError()
231
+
232
+ @torch.no_grad()
233
+ def clean_pointcloud(self, tol=0.001, max_bad_conf=0):
234
+ """ Method:
235
+ 1) express all 3d points in each camera coordinate frame
236
+ 2) if they're in front of a depthmap --> then lower their confidence
237
+ """
238
+ assert 0 <= tol < 1
239
+ cams = inv(self.get_im_poses())
240
+ K = self.get_intrinsics()
241
+ depthmaps = self.get_depthmaps()
242
+ res = deepcopy(self)
243
+
244
+ for i, pts3d in enumerate(self.depth_to_pts3d()):
245
+ for j in range(self.n_imgs):
246
+ if i == j:
247
+ continue
248
+
249
+ # project 3dpts in other view
250
+ Hi, Wi = self.imshapes[i]
251
+ Hj, Wj = self.imshapes[j]
252
+ proj = geotrf(cams[j], pts3d[:Hi*Wi]).reshape(Hi, Wi, 3)
253
+ proj_depth = proj[:, :, 2]
254
+ u, v = geotrf(K[j], proj, norm=1, ncol=2).round().long().unbind(-1)
255
+
256
+ # check which points are actually in the visible cone
257
+ msk_i = (proj_depth > 0) & (0 <= u) & (u < Wj) & (0 <= v) & (v < Hj)
258
+ msk_j = v[msk_i], u[msk_i]
259
+
260
+ # find bad points = those in front but less confident
261
+ bad_points = (proj_depth[msk_i] < (1-tol) * depthmaps[j][msk_j]
262
+ ) & (res.im_conf[i][msk_i] < res.im_conf[j][msk_j])
263
+
264
+ bad_msk_i = msk_i.clone()
265
+ bad_msk_i[msk_i] = bad_points
266
+ res.im_conf[i][bad_msk_i] = res.im_conf[i][bad_msk_i].clip_(max=max_bad_conf)
267
+
268
+ return res
269
+
270
+ def forward(self, ret_details=False):
271
+ pw_poses = self.get_pw_poses() # cam-to-world
272
+ pw_adapt = self.get_adaptors()
273
+ proj_pts3d = self.get_pts3d()
274
+ # pre-compute pixel weights
275
+ weight_i = {i_j: self.conf_trf(c) for i_j, c in self.conf_i.items()}
276
+ weight_j = {i_j: self.conf_trf(c) for i_j, c in self.conf_j.items()}
277
+
278
+ loss = 0
279
+ if ret_details:
280
+ details = -torch.ones((self.n_imgs, self.n_imgs))
281
+
282
+ for e, (i, j) in enumerate(self.edges):
283
+ i_j = edge_str(i, j)
284
+ # distance in image i and j
285
+ aligned_pred_i = geotrf(pw_poses[e], pw_adapt[e] * self.pred_i[i_j])
286
+ aligned_pred_j = geotrf(pw_poses[e], pw_adapt[e] * self.pred_j[i_j])
287
+ li = self.dist(proj_pts3d[i], aligned_pred_i, weight=weight_i[i_j]).mean()
288
+ lj = self.dist(proj_pts3d[j], aligned_pred_j, weight=weight_j[i_j]).mean()
289
+ loss = loss + li + lj
290
+
291
+ if ret_details:
292
+ details[i, j] = li + lj
293
+ loss /= self.n_edges # average over all pairs
294
+
295
+ if ret_details:
296
+ return loss, details
297
+ return loss
298
+
299
+ @torch.cuda.amp.autocast(enabled=False)
300
+ def compute_global_alignment(self, init=None, niter_PnP=10, **kw):
301
+ if init is None:
302
+ pass
303
+ elif init == 'msp' or init == 'mst':
304
+ init_fun.init_minimum_spanning_tree(self, niter_PnP=niter_PnP)
305
+ elif init == 'known_poses':
306
+ init_fun.init_from_known_poses(self, min_conf_thr=self.min_conf_thr,
307
+ niter_PnP=niter_PnP)
308
+ else:
309
+ raise ValueError(f'bad value for {init=}')
310
+
311
+ return global_alignment_loop(self, **kw)
312
+
313
+ @torch.no_grad()
314
+ def mask_sky(self):
315
+ res = deepcopy(self)
316
+ for i in range(self.n_imgs):
317
+ sky = segment_sky(self.imgs[i])
318
+ res.im_conf[i][sky] = 0
319
+ return res
320
+
321
+ def show(self, show_pw_cams=False, show_pw_pts3d=False, cam_size=None, **kw):
322
+ viz = SceneViz()
323
+ if self.imgs is None:
324
+ colors = np.random.randint(0, 256, size=(self.n_imgs, 3))
325
+ colors = list(map(tuple, colors.tolist()))
326
+ for n in range(self.n_imgs):
327
+ viz.add_pointcloud(self.get_pts3d()[n], colors[n], self.get_masks()[n])
328
+ else:
329
+ viz.add_pointcloud(self.get_pts3d(), self.imgs, self.get_masks())
330
+ colors = np.random.randint(256, size=(self.n_imgs, 3))
331
+
332
+ # camera poses
333
+ im_poses = to_numpy(self.get_im_poses())
334
+ if cam_size is None:
335
+ cam_size = auto_cam_size(im_poses)
336
+ viz.add_cameras(im_poses, self.get_focals(), colors=colors,
337
+ images=self.imgs, imsizes=self.imsizes, cam_size=cam_size)
338
+ if show_pw_cams:
339
+ pw_poses = self.get_pw_poses()
340
+ viz.add_cameras(pw_poses, color=(192, 0, 192), cam_size=cam_size)
341
+
342
+ if show_pw_pts3d:
343
+ pts = [geotrf(pw_poses[e], self.pred_i[edge_str(i, j)]) for e, (i, j) in enumerate(self.edges)]
344
+ viz.add_pointcloud(pts, (128, 0, 128))
345
+
346
+ viz.show(**kw)
347
+ return viz
348
+
349
+
350
+ def global_alignment_loop(net, lr=0.01, niter=300, schedule='cosine', lr_min=1e-6):
351
+ params = [p for p in net.parameters() if p.requires_grad]
352
+ if not params:
353
+ return net
354
+
355
+ verbose = net.verbose
356
+ if verbose:
357
+ print('Global alignment - optimizing for:')
358
+ print([name for name, value in net.named_parameters() if value.requires_grad])
359
+
360
+ lr_base = lr
361
+ optimizer = torch.optim.Adam(params, lr=lr, betas=(0.9, 0.9))
362
+
363
+ loss = float('inf')
364
+ if verbose:
365
+ with tqdm.tqdm(total=niter) as bar:
366
+ while bar.n < bar.total:
367
+ loss = global_alignment_iter(net, bar.n, niter, lr_base, lr_min, optimizer, schedule)
368
+ bar.set_postfix_str(f'{lr=:g} loss={loss:g}')
369
+ bar.update()
370
+ else:
371
+ for n in range(niter):
372
+ loss = global_alignment_iter(net, n, niter, lr_base, lr_min, optimizer, schedule)
373
+ return loss
374
+
375
+
376
+ def global_alignment_iter(net, cur_iter, niter, lr_base, lr_min, optimizer, schedule):
377
+ t = cur_iter / niter
378
+ if schedule == 'cosine':
379
+ lr = cosine_schedule(t, lr_base, lr_min)
380
+ elif schedule == 'linear':
381
+ lr = linear_schedule(t, lr_base, lr_min)
382
+ else:
383
+ raise ValueError(f'bad lr {schedule=}')
384
+ adjust_learning_rate_by_lr(optimizer, lr)
385
+ optimizer.zero_grad()
386
+ loss = net()
387
+ loss.backward()
388
+ optimizer.step()
389
+
390
+ return float(loss)
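Note on the loop above: global_alignment_loop is a plain Adam loop whose learning rate is recomputed every iteration from the chosen schedule before stepping. A minimal, self-contained sketch of the same pattern on a toy module (ToyScene, the quadratic loss and the hyper-parameters are illustrative stand-ins, not part of mini_dust3r):

import math
import torch
import torch.nn as nn

def cosine_lr(t, lr_start, lr_end):
    # same formula as cosine_schedule() in cloud_opt/commons.py
    return lr_end + (lr_start - lr_end) * (1 + math.cos(t * math.pi)) / 2

class ToyScene(nn.Module):
    # stand-in for the scene optimizer: calling it returns a scalar loss
    def __init__(self):
        super().__init__()
        self.x = nn.Parameter(torch.randn(10))

    def forward(self):
        return (self.x ** 2).mean()

net = ToyScene()
optimizer = torch.optim.Adam(net.parameters(), lr=0.01, betas=(0.9, 0.9))
niter, lr_base, lr_min = 300, 0.01, 1e-6

for n in range(niter):
    lr = cosine_lr(n / niter, lr_base, lr_min)
    for group in optimizer.param_groups:   # what adjust_learning_rate_by_lr does
        group['lr'] = lr
    optimizer.zero_grad()
    loss = net()
    loss.backward()
    optimizer.step()

print('final loss:', float(loss))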
mini_dust3r/cloud_opt/commons.py ADDED
@@ -0,0 +1,90 @@
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+ #
4
+ # --------------------------------------------------------
5
+ # utility functions for global alignment
6
+ # --------------------------------------------------------
7
+ import torch
8
+ import torch.nn as nn
9
+ import numpy as np
10
+
11
+
12
+ def edge_str(i, j):
13
+ return f'{i}_{j}'
14
+
15
+
16
+ def i_j_ij(ij):
17
+ return edge_str(*ij), ij
18
+
19
+
20
+ def edge_conf(conf_i, conf_j, edge):
21
+ return float(conf_i[edge].mean() * conf_j[edge].mean())
22
+
23
+
24
+ def compute_edge_scores(edges, conf_i, conf_j):
25
+ return {(i, j): edge_conf(conf_i, conf_j, e) for e, (i, j) in edges}
26
+
27
+
28
+ def NoGradParamDict(x):
29
+ assert isinstance(x, dict)
30
+ return nn.ParameterDict(x).requires_grad_(False)
31
+
32
+
33
+ def get_imshapes(edges, pred_i, pred_j):
34
+ n_imgs = max(max(e) for e in edges) + 1
35
+ imshapes = [None] * n_imgs
36
+ for e, (i, j) in enumerate(edges):
37
+ shape_i = tuple(pred_i[e].shape[0:2])
38
+ shape_j = tuple(pred_j[e].shape[0:2])
39
+ if imshapes[i]:
40
+ assert imshapes[i] == shape_i, f'incorrect shape for image {i}'
41
+ if imshapes[j]:
42
+ assert imshapes[j] == shape_j, f'incorrect shape for image {j}'
43
+ imshapes[i] = shape_i
44
+ imshapes[j] = shape_j
45
+ return imshapes
46
+
47
+
48
+ def get_conf_trf(mode):
49
+ if mode == 'log':
50
+ def conf_trf(x): return x.log()
51
+ elif mode == 'sqrt':
52
+ def conf_trf(x): return x.sqrt()
53
+ elif mode == 'm1':
54
+ def conf_trf(x): return x-1
55
+ elif mode in ('id', 'none'):
56
+ def conf_trf(x): return x
57
+ else:
58
+ raise ValueError(f'bad mode for {mode=}')
59
+ return conf_trf
60
+
61
+
62
+ def l2_dist(a, b, weight):
63
+ return ((a - b).square().sum(dim=-1) * weight)
64
+
65
+
66
+ def l1_dist(a, b, weight):
67
+ return ((a - b).norm(dim=-1) * weight)
68
+
69
+
70
+ ALL_DISTS = dict(l1=l1_dist, l2=l2_dist)
71
+
72
+
73
+ def signed_log1p(x):
74
+ sign = torch.sign(x)
75
+ return sign * torch.log1p(torch.abs(x))
76
+
77
+
78
+ def signed_expm1(x):
79
+ sign = torch.sign(x)
80
+ return sign * torch.expm1(torch.abs(x))
81
+
82
+
83
+ def cosine_schedule(t, lr_start, lr_end):
84
+ assert 0 <= t <= 1
85
+ return lr_end + (lr_start - lr_end) * (1+np.cos(t * np.pi))/2
86
+
87
+
88
+ def linear_schedule(t, lr_start, lr_end):
89
+ assert 0 <= t <= 1
90
+ return lr_start + (lr_end - lr_start) * t
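The two schedules above interpolate between lr_start and lr_end as t runs from 0 to 1, and signed_log1p compresses values symmetrically around zero. A quick standalone check (the numeric values are chosen only for illustration):

import numpy as np
import torch

def cosine_schedule(t, lr_start, lr_end):
    return lr_end + (lr_start - lr_end) * (1 + np.cos(t * np.pi)) / 2

def linear_schedule(t, lr_start, lr_end):
    return lr_start + (lr_end - lr_start) * t

for t in (0.0, 0.5, 1.0):
    print(t, cosine_schedule(t, 0.01, 1e-6), linear_schedule(t, 0.01, 1e-6))
# both start at 0.01 and end at 1e-6; the cosine curve decays more slowly at first

x = torch.tensor([-3.0, 0.0, 3.0])
print(torch.sign(x) * torch.log1p(torch.abs(x)))   # signed_log1p: symmetric log compression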
mini_dust3r/cloud_opt/init_im_poses.py ADDED
@@ -0,0 +1,316 @@
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+ #
4
+ # --------------------------------------------------------
5
+ # Initialization functions for global alignment
6
+ # --------------------------------------------------------
7
+ from functools import cache
8
+
9
+ import numpy as np
10
+ import scipy.sparse as sp
11
+ import torch
12
+ import cv2
13
+ import roma
14
+ from tqdm import tqdm
15
+
16
+ from mini_dust3r.utils.geometry import geotrf, inv, get_med_dist_between_poses
17
+ from mini_dust3r.post_process import estimate_focal_knowing_depth
18
+ from mini_dust3r.viz import to_numpy
19
+
20
+ from mini_dust3r.cloud_opt.commons import edge_str, i_j_ij, compute_edge_scores
21
+
22
+
23
+ @torch.no_grad()
24
+ def init_from_known_poses(self, niter_PnP=10, min_conf_thr=3):
25
+ device = self.device
26
+
27
+ # indices of known poses
28
+ nkp, known_poses_msk, known_poses = get_known_poses(self)
29
+ assert nkp == self.n_imgs, 'not all poses are known'
30
+
31
+ # get all focals
32
+ nkf, _, im_focals = get_known_focals(self)
33
+ assert nkf == self.n_imgs
34
+ im_pp = self.get_principal_points()
35
+
36
+ best_depthmaps = {}
37
+ # init all pairwise poses
38
+ for e, (i, j) in enumerate(tqdm(self.edges, disable=not self.verbose)):
39
+ i_j = edge_str(i, j)
40
+
41
+ # find relative pose for this pair
42
+ P1 = torch.eye(4, device=device)
43
+ msk = self.conf_i[i_j] > min(min_conf_thr, self.conf_i[i_j].min() - 0.1)
44
+ _, P2 = fast_pnp(self.pred_j[i_j], float(im_focals[i].mean()),
45
+ pp=im_pp[i], msk=msk, device=device, niter_PnP=niter_PnP)
46
+
47
+ # align the two predicted camera with the two gt cameras
48
+ s, R, T = align_multiple_poses(torch.stack((P1, P2)), known_poses[[i, j]])
49
+ # normally we have known_poses[i] ~= sRT_to_4x4(s,R,T,device) @ P1
50
+ # and geotrf(sRT_to_4x4(1,R,T,device), s*P2[:3,3])
51
+ self._set_pose(self.pw_poses, e, R, T, scale=s)
52
+
53
+ # remember if this is a good depthmap
54
+ score = float(self.conf_i[i_j].mean())
55
+ if score > best_depthmaps.get(i, (0,))[0]:
56
+ best_depthmaps[i] = score, i_j, s
57
+
58
+ # init all image poses
59
+ for n in range(self.n_imgs):
60
+ assert known_poses_msk[n]
61
+ _, i_j, scale = best_depthmaps[n]
62
+ depth = self.pred_i[i_j][:, :, 2]
63
+ self._set_depthmap(n, depth * scale)
64
+
65
+
66
+ @torch.no_grad()
67
+ def init_minimum_spanning_tree(self, **kw):
68
+ """ Init all camera poses (image-wise and pairwise poses) given
69
+ an initial set of pairwise estimations.
70
+ """
71
+ device = self.device
72
+ pts3d, _, im_focals, im_poses = minimum_spanning_tree(self.imshapes, self.edges,
73
+ self.pred_i, self.pred_j, self.conf_i, self.conf_j, self.im_conf, self.min_conf_thr,
74
+ device, has_im_poses=self.has_im_poses, verbose=self.verbose,
75
+ **kw)
76
+
77
+ return init_from_pts3d(self, pts3d, im_focals, im_poses)
78
+
79
+
80
+ def init_from_pts3d(self, pts3d, im_focals, im_poses):
81
+ # init poses
82
+ nkp, known_poses_msk, known_poses = get_known_poses(self)
83
+ if nkp == 1:
84
+ raise NotImplementedError("Would be simpler to just align everything afterwards on the single known pose")
85
+ elif nkp > 1:
86
+ # global rigid SE3 alignment
87
+ s, R, T = align_multiple_poses(im_poses[known_poses_msk], known_poses[known_poses_msk])
88
+ trf = sRT_to_4x4(s, R, T, device=known_poses.device)
89
+
90
+ # rotate everything
91
+ im_poses = trf @ im_poses
92
+ im_poses[:, :3, :3] /= s # undo scaling on the rotation part
93
+ for img_pts3d in pts3d:
94
+ img_pts3d[:] = geotrf(trf, img_pts3d)
95
+
96
+ # set all pairwise poses
97
+ for e, (i, j) in enumerate(self.edges):
98
+ i_j = edge_str(i, j)
99
+ # compute transform that goes from cam to world
100
+ s, R, T = rigid_points_registration(self.pred_i[i_j], pts3d[i], conf=self.conf_i[i_j])
101
+ self._set_pose(self.pw_poses, e, R, T, scale=s)
102
+
103
+ # take into account the scale normalization
104
+ s_factor = self.get_pw_norm_scale_factor()
105
+ im_poses[:, :3, 3] *= s_factor # apply downscaling factor
106
+ for img_pts3d in pts3d:
107
+ img_pts3d *= s_factor
108
+
109
+ # init all image poses
110
+ if self.has_im_poses:
111
+ for i in range(self.n_imgs):
112
+ cam2world = im_poses[i]
113
+ depth = geotrf(inv(cam2world), pts3d[i])[..., 2]
114
+ self._set_depthmap(i, depth)
115
+ self._set_pose(self.im_poses, i, cam2world)
116
+ if im_focals[i] is not None:
117
+ self._set_focal(i, im_focals[i])
118
+
119
+ if self.verbose:
120
+ print(' init loss =', float(self()))
121
+
122
+
123
+ def minimum_spanning_tree(imshapes, edges, pred_i, pred_j, conf_i, conf_j, im_conf, min_conf_thr,
124
+ device, has_im_poses=True, niter_PnP=10, verbose=True):
125
+ n_imgs = len(imshapes)
126
+ sparse_graph = -dict_to_sparse_graph(compute_edge_scores(map(i_j_ij, edges), conf_i, conf_j))
127
+ msp = sp.csgraph.minimum_spanning_tree(sparse_graph).tocoo()
128
+
129
+ # temp variable to store 3d points
130
+ pts3d = [None] * len(imshapes)
131
+
132
+ todo = sorted(zip(-msp.data, msp.row, msp.col)) # sorted edges
133
+ im_poses = [None] * n_imgs
134
+ im_focals = [None] * n_imgs
135
+
136
+ # init with strongest edge
137
+ score, i, j = todo.pop()
138
+ if verbose:
139
+ print(f' init edge ({i}*,{j}*) {score=}')
140
+ i_j = edge_str(i, j)
141
+ pts3d[i] = pred_i[i_j].clone()
142
+ pts3d[j] = pred_j[i_j].clone()
143
+ done = {i, j}
144
+ if has_im_poses:
145
+ im_poses[i] = torch.eye(4, device=device)
146
+ im_focals[i] = estimate_focal(pred_i[i_j])
147
+
148
+ # set initial pointcloud based on pairwise graph
149
+ msp_edges = [(i, j)]
150
+ while todo:
151
+ # each time, predict the next one
152
+ score, i, j = todo.pop()
153
+
154
+ if im_focals[i] is None:
155
+ im_focals[i] = estimate_focal(pred_i[i_j])
156
+
157
+ if i in done:
158
+ if verbose:
159
+ print(f' init edge ({i},{j}*) {score=}')
160
+ assert j not in done
161
+ # align pred[i] with pts3d[i], and then set j accordingly
162
+ i_j = edge_str(i, j)
163
+ s, R, T = rigid_points_registration(pred_i[i_j], pts3d[i], conf=conf_i[i_j])
164
+ trf = sRT_to_4x4(s, R, T, device)
165
+ pts3d[j] = geotrf(trf, pred_j[i_j])
166
+ done.add(j)
167
+ msp_edges.append((i, j))
168
+
169
+ if has_im_poses and im_poses[i] is None:
170
+ im_poses[i] = sRT_to_4x4(1, R, T, device)
171
+
172
+ elif j in done:
173
+ if verbose:
174
+ print(f' init edge ({i}*,{j}) {score=}')
175
+ assert i not in done
176
+ i_j = edge_str(i, j)
177
+ s, R, T = rigid_points_registration(pred_j[i_j], pts3d[j], conf=conf_j[i_j])
178
+ trf = sRT_to_4x4(s, R, T, device)
179
+ pts3d[i] = geotrf(trf, pred_i[i_j])
180
+ done.add(i)
181
+ msp_edges.append((i, j))
182
+
183
+ if has_im_poses and im_poses[i] is None:
184
+ im_poses[i] = sRT_to_4x4(1, R, T, device)
185
+ else:
186
+ # let's try again later
187
+ todo.insert(0, (score, i, j))
188
+
189
+ if has_im_poses:
190
+ # complete all missing information
191
+ pair_scores = list(sparse_graph.values()) # already negative scores: less is best
192
+ edges_from_best_to_worse = np.array(list(sparse_graph.keys()))[np.argsort(pair_scores)]
193
+ for i, j in edges_from_best_to_worse.tolist():
194
+ if im_focals[i] is None:
195
+ im_focals[i] = estimate_focal(pred_i[edge_str(i, j)])
196
+
197
+ for i in range(n_imgs):
198
+ if im_poses[i] is None:
199
+ msk = im_conf[i] > min_conf_thr
200
+ res = fast_pnp(pts3d[i], im_focals[i], msk=msk, device=device, niter_PnP=niter_PnP)
201
+ if res:
202
+ im_focals[i], im_poses[i] = res
203
+ if im_poses[i] is None:
204
+ im_poses[i] = torch.eye(4, device=device)
205
+ im_poses = torch.stack(im_poses)
206
+ else:
207
+ im_poses = im_focals = None
208
+
209
+ return pts3d, msp_edges, im_focals, im_poses
210
+
211
+
212
+ def dict_to_sparse_graph(dic):
213
+ n_imgs = max(max(e) for e in dic) + 1
214
+ res = sp.dok_array((n_imgs, n_imgs))
215
+ for edge, value in dic.items():
216
+ res[edge] = value
217
+ return res
218
+
219
+
220
+ def rigid_points_registration(pts1, pts2, conf):
221
+ R, T, s = roma.rigid_points_registration(
222
+ pts1.reshape(-1, 3), pts2.reshape(-1, 3), weights=conf.ravel(), compute_scaling=True)
223
+ return s, R, T # return un-scaled (R, T)
224
+
225
+
226
+ def sRT_to_4x4(scale, R, T, device):
227
+ trf = torch.eye(4, device=device)
228
+ trf[:3, :3] = R * scale
229
+ trf[:3, 3] = T.ravel() # doesn't need scaling
230
+ return trf
231
+
232
+
233
+ def estimate_focal(pts3d_i, pp=None):
234
+ if pp is None:
235
+ H, W, THREE = pts3d_i.shape
236
+ assert THREE == 3
237
+ pp = torch.tensor((W/2, H/2), device=pts3d_i.device)
238
+ focal = estimate_focal_knowing_depth(pts3d_i.unsqueeze(0), pp.unsqueeze(0), focal_mode='weiszfeld').ravel()
239
+ return float(focal)
240
+
241
+
242
+ @cache
243
+ def pixel_grid(H, W):
244
+ return np.mgrid[:W, :H].T.astype(np.float32)
245
+
246
+
247
+ def fast_pnp(pts3d, focal, msk, device, pp=None, niter_PnP=10):
248
+ # extract camera poses and focals with RANSAC-PnP
249
+ if msk.sum() < 4:
250
+ return None # we need at least 4 points for PnP
251
+ pts3d, msk = map(to_numpy, (pts3d, msk))
252
+
253
+ H, W, THREE = pts3d.shape
254
+ assert THREE == 3
255
+ pixels = pixel_grid(H, W)
256
+
257
+ if focal is None:
258
+ S = max(W, H)
259
+ tentative_focals = np.geomspace(S/2, S*3, 21)
260
+ else:
261
+ tentative_focals = [focal]
262
+
263
+ if pp is None:
264
+ pp = (W/2, H/2)
265
+ else:
266
+ pp = to_numpy(pp)
267
+
268
+ best = 0,
269
+ for focal in tentative_focals:
270
+ K = np.float32([(focal, 0, pp[0]), (0, focal, pp[1]), (0, 0, 1)])
271
+
272
+ success, R, T, inliers = cv2.solvePnPRansac(pts3d[msk], pixels[msk], K, None,
273
+ iterationsCount=niter_PnP, reprojectionError=5, flags=cv2.SOLVEPNP_SQPNP)
274
+ if not success:
275
+ continue
276
+
277
+ score = len(inliers)
278
+ if success and score > best[0]:
279
+ best = score, R, T, focal
280
+
281
+ if not best[0]:
282
+ return None
283
+
284
+ _, R, T, best_focal = best
285
+ R = cv2.Rodrigues(R)[0] # world to cam
286
+ R, T = map(torch.from_numpy, (R, T))
287
+ return best_focal, inv(sRT_to_4x4(1, R, T, device)) # cam to world
288
+
289
+
290
+ def get_known_poses(self):
291
+ if self.has_im_poses:
292
+ known_poses_msk = torch.tensor([not (p.requires_grad) for p in self.im_poses])
293
+ known_poses = self.get_im_poses()
294
+ return known_poses_msk.sum(), known_poses_msk, known_poses
295
+ else:
296
+ return 0, None, None
297
+
298
+
299
+ def get_known_focals(self):
300
+ if self.has_im_poses:
301
+ known_focal_msk = self.get_known_focal_mask()
302
+ known_focals = self.get_focals()
303
+ return known_focal_msk.sum(), known_focal_msk, known_focals
304
+ else:
305
+ return 0, None, None
306
+
307
+
308
+ def align_multiple_poses(src_poses, target_poses):
309
+ N = len(src_poses)
310
+ assert src_poses.shape == target_poses.shape == (N, 4, 4)
311
+
312
+ def center_and_z(poses):
313
+ eps = get_med_dist_between_poses(poses) / 100
314
+ return torch.cat((poses[:, :3, 3], poses[:, :3, 3] + eps*poses[:, :3, 2]))
315
+ R, T, s = roma.rigid_points_registration(center_and_z(src_poses), center_and_z(target_poses), compute_scaling=True)
316
+ return s, R, T
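sRT_to_4x4 above packs a similarity transform (scale folded into the rotation block, unscaled translation) into a single 4x4 matrix. A small standalone check with toy numbers (the rotation, translation and points below are illustrative, not part of the commit):

import torch

def sRT_to_4x4(scale, R, T, device):
    trf = torch.eye(4, device=device)
    trf[:3, :3] = R * scale
    trf[:3, 3] = T.ravel()
    return trf

R = torch.eye(3)                              # identity rotation for the example
T = torch.tensor([1.0, 2.0, 3.0])
trf = sRT_to_4x4(2.0, R, T, device='cpu')

pts = torch.tensor([[0.0, 0.0, 0.0],
                    [1.0, 1.0, 1.0]])
pts_h = torch.cat([pts, torch.ones(len(pts), 1)], dim=1)   # homogeneous coordinates
out = (trf @ pts_h.T).T[:, :3]                             # apply the 4x4 transform to the points
print(out)   # [[1, 2, 3], [3, 4, 5]] = 2 * p + T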
mini_dust3r/cloud_opt/modular_optimizer.py ADDED
@@ -0,0 +1,145 @@
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+ #
4
+ # --------------------------------------------------------
5
+ # Slower implementation of the global alignment that allows to freeze partial poses/intrinsics
6
+ # --------------------------------------------------------
7
+ import numpy as np
8
+ import torch
9
+ import torch.nn as nn
10
+
11
+ from mini_dust3r.cloud_opt.base_opt import BasePCOptimizer
12
+ from mini_dust3r.utils.geometry import geotrf
13
+ from mini_dust3r.utils.device import to_cpu, to_numpy
14
+ from mini_dust3r.utils.geometry import depthmap_to_pts3d
15
+
16
+
17
+ class ModularPointCloudOptimizer (BasePCOptimizer):
18
+ """ Optimize a global scene, given a list of pairwise observations.
19
+ Unlike PointCloudOptimizer, you can fix parts of the optimization process (partial poses/intrinsics)
20
+ Graph node: images
21
+ Graph edges: observations = (pred1, pred2)
22
+ """
23
+
24
+ def __init__(self, *args, optimize_pp=False, fx_and_fy=False, focal_brake=20, **kwargs):
25
+ super().__init__(*args, **kwargs)
26
+ self.has_im_poses = True # by definition of this class
27
+ self.focal_brake = focal_brake
28
+
29
+ # adding thing to optimize
30
+ self.im_depthmaps = nn.ParameterList(torch.randn(H, W)/10-3 for H, W in self.imshapes) # log(depth)
31
+ self.im_poses = nn.ParameterList(self.rand_pose(self.POSE_DIM) for _ in range(self.n_imgs)) # camera poses
32
+ default_focals = [self.focal_brake * np.log(max(H, W)) for H, W in self.imshapes]
33
+ self.im_focals = nn.ParameterList(torch.FloatTensor([f, f] if fx_and_fy else [
34
+ f]) for f in default_focals) # camera intrinsics
35
+ self.im_pp = nn.ParameterList(torch.zeros((2,)) for _ in range(self.n_imgs)) # camera intrinsics
36
+ self.im_pp.requires_grad_(optimize_pp)
37
+
38
+ def preset_pose(self, known_poses, pose_msk=None): # cam-to-world
39
+ if isinstance(known_poses, torch.Tensor) and known_poses.ndim == 2:
40
+ known_poses = [known_poses]
41
+ for idx, pose in zip(self._get_msk_indices(pose_msk), known_poses):
42
+ if self.verbose:
43
+ print(f' (setting pose #{idx} = {pose[:3,3]})')
44
+ self._no_grad(self._set_pose(self.im_poses, idx, torch.tensor(pose), force=True))
45
+
46
+ # normalize scale if there's at most one known pose
47
+ n_known_poses = sum((p.requires_grad is False) for p in self.im_poses)
48
+ self.norm_pw_scale = (n_known_poses <= 1)
49
+
50
+ def preset_intrinsics(self, known_intrinsics, msk=None):
51
+ if isinstance(known_intrinsics, torch.Tensor) and known_intrinsics.ndim == 2:
52
+ known_intrinsics = [known_intrinsics]
53
+ for K in known_intrinsics:
54
+ assert K.shape == (3, 3)
55
+ self.preset_focal([K.diagonal()[:2].mean() for K in known_intrinsics], msk)
56
+ self.preset_principal_point([K[:2, 2] for K in known_intrinsics], msk)
57
+
58
+ def preset_focal(self, known_focals, msk=None):
59
+ for idx, focal in zip(self._get_msk_indices(msk), known_focals):
60
+ if self.verbose:
61
+ print(f' (setting focal #{idx} = {focal})')
62
+ self._no_grad(self._set_focal(idx, focal, force=True))
63
+
64
+ def preset_principal_point(self, known_pp, msk=None):
65
+ for idx, pp in zip(self._get_msk_indices(msk), known_pp):
66
+ if self.verbose:
67
+ print(f' (setting principal point #{idx} = {pp})')
68
+ self._no_grad(self._set_principal_point(idx, pp, force=True))
69
+
70
+ def _no_grad(self, tensor):
71
+ return tensor.requires_grad_(False)
72
+
73
+ def _get_msk_indices(self, msk):
74
+ if msk is None:
75
+ return range(self.n_imgs)
76
+ elif isinstance(msk, int):
77
+ return [msk]
78
+ elif isinstance(msk, (tuple, list)):
79
+ return self._get_msk_indices(np.array(msk))
80
+ elif msk.dtype in (bool, torch.bool, np.bool_):
81
+ assert len(msk) == self.n_imgs
82
+ return np.where(msk)[0]
83
+ elif np.issubdtype(msk.dtype, np.integer):
84
+ return msk
85
+ else:
86
+ raise ValueError(f'bad {msk=}')
87
+
88
+ def _set_focal(self, idx, focal, force=False):
89
+ param = self.im_focals[idx]
90
+ if param.requires_grad or force: # can only init a parameter not already initialized
91
+ param.data[:] = self.focal_brake * np.log(focal)
92
+ return param
93
+
94
+ def get_focals(self):
95
+ log_focals = torch.stack(list(self.im_focals), dim=0)
96
+ return (log_focals / self.focal_brake).exp()
97
+
98
+ def _set_principal_point(self, idx, pp, force=False):
99
+ param = self.im_pp[idx]
100
+ H, W = self.imshapes[idx]
101
+ if param.requires_grad or force: # can only init a parameter not already initialized
102
+ param.data[:] = to_cpu(to_numpy(pp) - (W/2, H/2)) / 10
103
+ return param
104
+
105
+ def get_principal_points(self):
106
+ return torch.stack([pp.new((W/2, H/2))+10*pp for pp, (H, W) in zip(self.im_pp, self.imshapes)])
107
+
108
+ def get_intrinsics(self):
109
+ K = torch.zeros((self.n_imgs, 3, 3), device=self.device)
110
+ focals = self.get_focals().view(self.n_imgs, -1)
111
+ K[:, 0, 0] = focals[:, 0]
112
+ K[:, 1, 1] = focals[:, -1]
113
+ K[:, :2, 2] = self.get_principal_points()
114
+ K[:, 2, 2] = 1
115
+ return K
116
+
117
+ def get_im_poses(self): # cam to world
118
+ cam2world = self._get_poses(torch.stack(list(self.im_poses)))
119
+ return cam2world
120
+
121
+ def _set_depthmap(self, idx, depth, force=False):
122
+ param = self.im_depthmaps[idx]
123
+ if param.requires_grad or force: # can only init a parameter not already initialized
124
+ param.data[:] = depth.log().nan_to_num(neginf=0)
125
+ return param
126
+
127
+ def get_depthmaps(self):
128
+ return [d.exp() for d in self.im_depthmaps]
129
+
130
+ def depth_to_pts3d(self):
131
+ # Get depths and projection params if not provided
132
+ focals = self.get_focals()
133
+ pp = self.get_principal_points()
134
+ im_poses = self.get_im_poses()
135
+ depth = self.get_depthmaps()
136
+
137
+ # convert focal to (1,2,H,W) constant field
138
+ def focal_ex(i): return focals[i][..., None, None].expand(1, *focals[i].shape, *self.imshapes[i])
139
+ # get pointmaps in camera frame
140
+ rel_ptmaps = [depthmap_to_pts3d(depth[i][None], focal_ex(i), pp=pp[i:i+1])[0] for i in range(im_poses.shape[0])]
141
+ # project to world frame
142
+ return [geotrf(pose, ptmap) for pose, ptmap in zip(im_poses, rel_ptmaps)]
143
+
144
+ def get_pts3d(self):
145
+ return self.depth_to_pts3d()
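The focal length is stored as focal_brake * log(f), so the optimizer works on a well-scaled log variable and get_focals() exponentiates it back. The round trip looks like this (image size and focal values are illustrative):

import numpy as np
import torch

focal_brake = 20
H, W = 384, 512
param = torch.tensor([focal_brake * np.log(max(H, W))], dtype=torch.float32)  # default init

print(float((param / focal_brake).exp()))    # 512.0 -> the default focal equals max(H, W)

param.data[:] = focal_brake * np.log(600.0)  # what _set_focal(idx, 600.0) writes
print(float((param / focal_brake).exp()))    # ~600.0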
mini_dust3r/cloud_opt/optimizer.py ADDED
@@ -0,0 +1,248 @@
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+ #
4
+ # --------------------------------------------------------
5
+ # Main class for the implementation of the global alignment
6
+ # --------------------------------------------------------
7
+ import numpy as np
8
+ import torch
9
+ import torch.nn as nn
10
+
11
+ from mini_dust3r.cloud_opt.base_opt import BasePCOptimizer
12
+ from mini_dust3r.utils.geometry import xy_grid, geotrf
13
+ from mini_dust3r.utils.device import to_cpu, to_numpy
14
+
15
+
16
+ class PointCloudOptimizer(BasePCOptimizer):
17
+ """ Optimize a global scene, given a list of pairwise observations.
18
+ Graph node: images
19
+ Graph edges: observations = (pred1, pred2)
20
+ """
21
+
22
+ def __init__(self, *args, optimize_pp=False, focal_break=20, **kwargs):
23
+ super().__init__(*args, **kwargs)
24
+
25
+ self.has_im_poses = True # by definition of this class
26
+ self.focal_break = focal_break
27
+
28
+ # adding thing to optimize
29
+ self.im_depthmaps = nn.ParameterList(torch.randn(H, W)/10-3 for H, W in self.imshapes) # log(depth)
30
+ self.im_poses = nn.ParameterList(self.rand_pose(self.POSE_DIM) for _ in range(self.n_imgs)) # camera poses
31
+ self.im_focals = nn.ParameterList(torch.FloatTensor(
32
+ [self.focal_break*np.log(max(H, W))]) for H, W in self.imshapes) # camera intrinsics
33
+ self.im_pp = nn.ParameterList(torch.zeros((2,)) for _ in range(self.n_imgs)) # camera intrinsics
34
+ self.im_pp.requires_grad_(optimize_pp)
35
+
36
+ self.imshape = self.imshapes[0]
37
+ im_areas = [h*w for h, w in self.imshapes]
38
+ self.max_area = max(im_areas)
39
+
40
+ # adding thing to optimize
41
+ self.im_depthmaps = ParameterStack(self.im_depthmaps, is_param=True, fill=self.max_area)
42
+ self.im_poses = ParameterStack(self.im_poses, is_param=True)
43
+ self.im_focals = ParameterStack(self.im_focals, is_param=True)
44
+ self.im_pp = ParameterStack(self.im_pp, is_param=True)
45
+ self.register_buffer('_pp', torch.tensor([(w/2, h/2) for h, w in self.imshapes]))
46
+ self.register_buffer('_grid', ParameterStack(
47
+ [xy_grid(W, H, device=self.device) for H, W in self.imshapes], fill=self.max_area))
48
+
49
+ # pre-compute pixel weights
50
+ self.register_buffer('_weight_i', ParameterStack(
51
+ [self.conf_trf(self.conf_i[i_j]) for i_j in self.str_edges], fill=self.max_area))
52
+ self.register_buffer('_weight_j', ParameterStack(
53
+ [self.conf_trf(self.conf_j[i_j]) for i_j in self.str_edges], fill=self.max_area))
54
+
55
+ # precompute aa
56
+ self.register_buffer('_stacked_pred_i', ParameterStack(self.pred_i, self.str_edges, fill=self.max_area))
57
+ self.register_buffer('_stacked_pred_j', ParameterStack(self.pred_j, self.str_edges, fill=self.max_area))
58
+ self.register_buffer('_ei', torch.tensor([i for i, j in self.edges]))
59
+ self.register_buffer('_ej', torch.tensor([j for i, j in self.edges]))
60
+ self.total_area_i = sum([im_areas[i] for i, j in self.edges])
61
+ self.total_area_j = sum([im_areas[j] for i, j in self.edges])
62
+
63
+ def _check_all_imgs_are_selected(self, msk):
64
+ assert np.all(self._get_msk_indices(msk) == np.arange(self.n_imgs)), 'incomplete mask!'
65
+
66
+ def preset_pose(self, known_poses, pose_msk=None): # cam-to-world
67
+ self._check_all_imgs_are_selected(pose_msk)
68
+
69
+ if isinstance(known_poses, torch.Tensor) and known_poses.ndim == 2:
70
+ known_poses = [known_poses]
71
+ for idx, pose in zip(self._get_msk_indices(pose_msk), known_poses):
72
+ if self.verbose:
73
+ print(f' (setting pose #{idx} = {pose[:3,3]})')
74
+ self._no_grad(self._set_pose(self.im_poses, idx, torch.tensor(pose)))
75
+
76
+ # normalize scale if there's at most one known pose
77
+ n_known_poses = sum((p.requires_grad is False) for p in self.im_poses)
78
+ self.norm_pw_scale = (n_known_poses <= 1)
79
+
80
+ self.im_poses.requires_grad_(False)
81
+ self.norm_pw_scale = False
82
+
83
+ def preset_focal(self, known_focals, msk=None):
84
+ self._check_all_imgs_are_selected(msk)
85
+
86
+ for idx, focal in zip(self._get_msk_indices(msk), known_focals):
87
+ if self.verbose:
88
+ print(f' (setting focal #{idx} = {focal})')
89
+ self._no_grad(self._set_focal(idx, focal))
90
+
91
+ self.im_focals.requires_grad_(False)
92
+
93
+ def preset_principal_point(self, known_pp, msk=None):
94
+ self._check_all_imgs_are_selected(msk)
95
+
96
+ for idx, pp in zip(self._get_msk_indices(msk), known_pp):
97
+ if self.verbose:
98
+ print(f' (setting principal point #{idx} = {pp})')
99
+ self._no_grad(self._set_principal_point(idx, pp))
100
+
101
+ self.im_pp.requires_grad_(False)
102
+
103
+ def _get_msk_indices(self, msk):
104
+ if msk is None:
105
+ return range(self.n_imgs)
106
+ elif isinstance(msk, int):
107
+ return [msk]
108
+ elif isinstance(msk, (tuple, list)):
109
+ return self._get_msk_indices(np.array(msk))
110
+ elif msk.dtype in (bool, torch.bool, np.bool_):
111
+ assert len(msk) == self.n_imgs
112
+ return np.where(msk)[0]
113
+ elif np.issubdtype(msk.dtype, np.integer):
114
+ return msk
115
+ else:
116
+ raise ValueError(f'bad {msk=}')
117
+
118
+ def _no_grad(self, tensor):
119
+ assert tensor.requires_grad, 'it must be True at this point, otherwise no modification occurs'
120
+
121
+ def _set_focal(self, idx, focal, force=False):
122
+ param = self.im_focals[idx]
123
+ if param.requires_grad or force: # can only init a parameter not already initialized
124
+ param.data[:] = self.focal_break * np.log(focal)
125
+ return param
126
+
127
+ def get_focals(self):
128
+ log_focals = torch.stack(list(self.im_focals), dim=0)
129
+ return (log_focals / self.focal_break).exp()
130
+
131
+ def get_known_focal_mask(self):
132
+ return torch.tensor([not (p.requires_grad) for p in self.im_focals])
133
+
134
+ def _set_principal_point(self, idx, pp, force=False):
135
+ param = self.im_pp[idx]
136
+ H, W = self.imshapes[idx]
137
+ if param.requires_grad or force: # can only init a parameter not already initialized
138
+ param.data[:] = to_cpu(to_numpy(pp) - (W/2, H/2)) / 10
139
+ return param
140
+
141
+ def get_principal_points(self):
142
+ return self._pp + 10 * self.im_pp
143
+
144
+ def get_intrinsics(self):
145
+ K = torch.zeros((self.n_imgs, 3, 3), device=self.device)
146
+ focals = self.get_focals().flatten()
147
+ K[:, 0, 0] = K[:, 1, 1] = focals
148
+ K[:, :2, 2] = self.get_principal_points()
149
+ K[:, 2, 2] = 1
150
+ return K
151
+
152
+ def get_im_poses(self): # cam to world
153
+ cam2world = self._get_poses(self.im_poses)
154
+ return cam2world
155
+
156
+ def _set_depthmap(self, idx, depth, force=False):
157
+ depth = _ravel_hw(depth, self.max_area)
158
+
159
+ param = self.im_depthmaps[idx]
160
+ if param.requires_grad or force: # can only init a parameter not already initialized
161
+ param.data[:] = depth.log().nan_to_num(neginf=0)
162
+ return param
163
+
164
+ def get_depthmaps(self, raw=False):
165
+ res = self.im_depthmaps.exp()
166
+ if not raw:
167
+ res = [dm[:h*w].view(h, w) for dm, (h, w) in zip(res, self.imshapes)]
168
+ return res
169
+
170
+ def depth_to_pts3d(self):
171
+ # Get depths and projection params if not provided
172
+ focals = self.get_focals()
173
+ pp = self.get_principal_points()
174
+ im_poses = self.get_im_poses()
175
+ depth = self.get_depthmaps(raw=True)
176
+
177
+ # get pointmaps in camera frame
178
+ rel_ptmaps = _fast_depthmap_to_pts3d(depth, self._grid, focals, pp=pp)
179
+ # project to world frame
180
+ return geotrf(im_poses, rel_ptmaps)
181
+
182
+ def get_pts3d(self, raw=False):
183
+ res = self.depth_to_pts3d()
184
+ if not raw:
185
+ res = [dm[:h*w].view(h, w, 3) for dm, (h, w) in zip(res, self.imshapes)]
186
+ return res
187
+
188
+ def forward(self):
189
+ pw_poses = self.get_pw_poses() # cam-to-world
190
+ pw_adapt = self.get_adaptors().unsqueeze(1)
191
+ proj_pts3d = self.get_pts3d(raw=True)
192
+
193
+ # rotate pairwise prediction according to pw_poses
194
+ aligned_pred_i = geotrf(pw_poses, pw_adapt * self._stacked_pred_i)
195
+ aligned_pred_j = geotrf(pw_poses, pw_adapt * self._stacked_pred_j)
196
+
197
+ # compute the loss
198
+ li = self.dist(proj_pts3d[self._ei], aligned_pred_i, weight=self._weight_i).sum() / self.total_area_i
199
+ lj = self.dist(proj_pts3d[self._ej], aligned_pred_j, weight=self._weight_j).sum() / self.total_area_j
200
+
201
+ return li + lj
202
+
203
+
204
+ def _fast_depthmap_to_pts3d(depth, pixel_grid, focal, pp):
205
+ pp = pp.unsqueeze(1)
206
+ focal = focal.unsqueeze(1)
207
+ assert focal.shape == (len(depth), 1, 1)
208
+ assert pp.shape == (len(depth), 1, 2)
209
+ assert pixel_grid.shape == depth.shape + (2,)
210
+ depth = depth.unsqueeze(-1)
211
+ return torch.cat((depth * (pixel_grid - pp) / focal, depth), dim=-1)
212
+
213
+
214
+ def ParameterStack(params, keys=None, is_param=None, fill=0):
215
+ if keys is not None:
216
+ params = [params[k] for k in keys]
217
+
218
+ if fill > 0:
219
+ params = [_ravel_hw(p, fill) for p in params]
220
+
221
+ requires_grad = params[0].requires_grad
222
+ assert all(p.requires_grad == requires_grad for p in params)
223
+
224
+ params = torch.stack(list(params)).float().detach()
225
+ if is_param or requires_grad:
226
+ params = nn.Parameter(params)
227
+ params.requires_grad_(requires_grad)
228
+ return params
229
+
230
+
231
+ def _ravel_hw(tensor, fill=0):
232
+ # ravel H,W
233
+ tensor = tensor.view((tensor.shape[0] * tensor.shape[1],) + tensor.shape[2:])
234
+
235
+ if len(tensor) < fill:
236
+ tensor = torch.cat((tensor, tensor.new_zeros((fill - len(tensor),)+tensor.shape[1:])))
237
+ return tensor
238
+
239
+
240
+ def acceptable_focal_range(H, W, minf=0.5, maxf=3.5):
241
+ focal_base = max(H, W) / (2 * np.tan(np.deg2rad(60) / 2)) # size / 1.1547005383792515
242
+ return minf*focal_base, maxf*focal_base
243
+
244
+
245
+ def apply_mask(img, msk):
246
+ img = img.copy()
247
+ img[msk] = 0
248
+ return img
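ParameterStack relies on _ravel_hw to flatten each H x W map and zero-pad it to max_area so that maps of different resolutions can be stacked into one tensor. A standalone illustration with toy shapes:

import torch

def _ravel_hw(tensor, fill=0):
    # flatten H, W and zero-pad up to `fill` rows (same logic as above)
    tensor = tensor.view((tensor.shape[0] * tensor.shape[1],) + tensor.shape[2:])
    if len(tensor) < fill:
        tensor = torch.cat((tensor, tensor.new_zeros((fill - len(tensor),) + tensor.shape[1:])))
    return tensor

a = torch.rand(4, 6, 3)            # a 4x6 pointmap
b = torch.rand(3, 5, 3)            # a smaller 3x5 pointmap
max_area = max(4 * 6, 3 * 5)

stacked = torch.stack([_ravel_hw(a, max_area), _ravel_hw(b, max_area)])
print(stacked.shape)               # torch.Size([2, 24, 3]); the 3x5 map is zero-padded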
mini_dust3r/cloud_opt/pair_viewer.py ADDED
@@ -0,0 +1,127 @@
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+ #
4
+ # --------------------------------------------------------
5
+ # Dummy optimizer for visualizing pairs
6
+ # --------------------------------------------------------
7
+ import numpy as np
8
+ import torch
9
+ import torch.nn as nn
10
+ import cv2
11
+
12
+ from mini_dust3r.cloud_opt.base_opt import BasePCOptimizer
13
+ from mini_dust3r.utils.geometry import inv, geotrf, depthmap_to_absolute_camera_coordinates
14
+ from mini_dust3r.cloud_opt.commons import edge_str
15
+ from mini_dust3r.post_process import estimate_focal_knowing_depth
16
+
17
+
18
+ class PairViewer (BasePCOptimizer):
19
+ """
20
+ This is a dummy optimizer.
21
+ To use only when the goal is to visualize the results for a pair of images (with is_symmetrized)
22
+ """
23
+
24
+ def __init__(self, *args, **kwargs):
25
+ super().__init__(*args, **kwargs)
26
+ assert self.is_symmetrized and self.n_edges == 2
27
+ self.has_im_poses = True
28
+
29
+ # compute all parameters directly from raw input
30
+ self.focals = []
31
+ self.pp = []
32
+ rel_poses = []
33
+ confs = []
34
+ for i in range(self.n_imgs):
35
+ conf = float(self.conf_i[edge_str(i, 1-i)].mean() * self.conf_j[edge_str(i, 1-i)].mean())
36
+ if self.verbose:
37
+ print(f' - {conf=:.3} for edge {i}-{1-i}')
38
+ confs.append(conf)
39
+
40
+ H, W = self.imshapes[i]
41
+ pts3d = self.pred_i[edge_str(i, 1-i)]
42
+ pp = torch.tensor((W/2, H/2))
43
+ focal = float(estimate_focal_knowing_depth(pts3d[None], pp, focal_mode='weiszfeld'))
44
+ self.focals.append(focal)
45
+ self.pp.append(pp)
46
+
47
+ # estimate the pose of pts1 in image 2
48
+ pixels = np.mgrid[:W, :H].T.astype(np.float32)
49
+ pts3d = self.pred_j[edge_str(1-i, i)].numpy()
50
+ assert pts3d.shape[:2] == (H, W)
51
+ msk = self.get_masks()[i].numpy()
52
+ K = np.float32([(focal, 0, pp[0]), (0, focal, pp[1]), (0, 0, 1)])
53
+
54
+ try:
55
+ res = cv2.solvePnPRansac(pts3d[msk], pixels[msk], K, None,
56
+ iterationsCount=100, reprojectionError=5, flags=cv2.SOLVEPNP_SQPNP)
57
+ success, R, T, inliers = res
58
+ assert success
59
+
60
+ R = cv2.Rodrigues(R)[0] # world to cam
61
+ pose = inv(np.r_[np.c_[R, T], [(0, 0, 0, 1)]]) # cam to world
62
+ except:
63
+ pose = np.eye(4)
64
+ rel_poses.append(torch.from_numpy(pose.astype(np.float32)))
65
+
66
+ # let's use the pair with the most confidence
67
+ if confs[0] > confs[1]:
68
+ # ptcloud is expressed in camera1
69
+ self.im_poses = [torch.eye(4), rel_poses[1]] # I, cam2-to-cam1
70
+ self.depth = [self.pred_i['0_1'][..., 2], geotrf(inv(rel_poses[1]), self.pred_j['0_1'])[..., 2]]
71
+ else:
72
+ # ptcloud is expressed in camera2
73
+ self.im_poses = [rel_poses[0], torch.eye(4)] # I, cam1-to-cam2
74
+ self.depth = [geotrf(inv(rel_poses[0]), self.pred_j['1_0'])[..., 2], self.pred_i['1_0'][..., 2]]
75
+
76
+ self.im_poses = nn.Parameter(torch.stack(self.im_poses, dim=0), requires_grad=False)
77
+ self.focals = nn.Parameter(torch.tensor(self.focals), requires_grad=False)
78
+ self.pp = nn.Parameter(torch.stack(self.pp, dim=0), requires_grad=False)
79
+ self.depth = nn.ParameterList(self.depth)
80
+ for p in self.parameters():
81
+ p.requires_grad = False
82
+
83
+ def _set_depthmap(self, idx, depth, force=False):
84
+ if self.verbose:
85
+ print('_set_depthmap is ignored in PairViewer')
86
+ return
87
+
88
+ def get_depthmaps(self, raw=False):
89
+ depth = [d.to(self.device) for d in self.depth]
90
+ return depth
91
+
92
+ def _set_focal(self, idx, focal, force=False):
93
+ self.focals[idx] = focal
94
+
95
+ def get_focals(self):
96
+ return self.focals
97
+
98
+ def get_known_focal_mask(self):
99
+ return torch.tensor([not (p.requires_grad) for p in self.focals])
100
+
101
+ def get_principal_points(self):
102
+ return self.pp
103
+
104
+ def get_intrinsics(self):
105
+ focals = self.get_focals()
106
+ pps = self.get_principal_points()
107
+ K = torch.zeros((len(focals), 3, 3), device=self.device)
108
+ for i in range(len(focals)):
109
+ K[i, 0, 0] = K[i, 1, 1] = focals[i]
110
+ K[i, :2, 2] = pps[i]
111
+ K[i, 2, 2] = 1
112
+ return K
113
+
114
+ def get_im_poses(self):
115
+ return self.im_poses
116
+
117
+ def depth_to_pts3d(self):
118
+ pts3d = []
119
+ for d, intrinsics, im_pose in zip(self.depth, self.get_intrinsics(), self.get_im_poses()):
120
+ pts, _ = depthmap_to_absolute_camera_coordinates(d.cpu().numpy(),
121
+ intrinsics.cpu().numpy(),
122
+ im_pose.cpu().numpy())
123
+ pts3d.append(torch.from_numpy(pts).to(device=self.device))
124
+ return pts3d
125
+
126
+ def forward(self):
127
+ return float('nan')
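depth_to_pts3d above unprojects each depth map with its intrinsics and lifts the result to world coordinates with the camera pose. The same operation for a single pixel in plain NumPy (the intrinsics, pose, pixel and depth are toy values, not part of the commit):

import numpy as np

focal, W, H = 500.0, 640, 480
K = np.array([[focal, 0.0, W / 2],
              [0.0, focal, H / 2],
              [0.0, 0.0, 1.0]])
cam2world = np.eye(4)
cam2world[:3, 3] = [0.0, 0.0, 1.0]        # camera shifted one unit along +z

u, v, depth = 400.0, 300.0, 2.5           # pixel coordinates and its depth
ray = np.linalg.inv(K) @ np.array([u, v, 1.0])
pt_cam = ray * depth                      # 3D point in camera coordinates (z == depth)
pt_world = cam2world[:3, :3] @ pt_cam + cam2world[:3, 3]
print(pt_cam, pt_world)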
mini_dust3r/croco/blocks.py ADDED
@@ -0,0 +1,241 @@
1
+ # Copyright (C) 2022-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+
4
+
5
+ # --------------------------------------------------------
6
+ # Main encoder/decoder blocks
7
+ # --------------------------------------------------------
8
+ # References:
9
+ # timm
10
+ # https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py
11
+ # https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/helpers.py
12
+ # https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/drop.py
13
+ # https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/mlp.py
14
+ # https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/patch_embed.py
15
+
16
+
17
+ import torch
18
+ import torch.nn as nn
19
+
20
+ from itertools import repeat
21
+ import collections.abc
22
+
23
+
24
+ def _ntuple(n):
25
+ def parse(x):
26
+ if isinstance(x, collections.abc.Iterable) and not isinstance(x, str):
27
+ return x
28
+ return tuple(repeat(x, n))
29
+ return parse
30
+ to_2tuple = _ntuple(2)
31
+
32
+ def drop_path(x, drop_prob: float = 0., training: bool = False, scale_by_keep: bool = True):
33
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
34
+ """
35
+ if drop_prob == 0. or not training:
36
+ return x
37
+ keep_prob = 1 - drop_prob
38
+ shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
39
+ random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
40
+ if keep_prob > 0.0 and scale_by_keep:
41
+ random_tensor.div_(keep_prob)
42
+ return x * random_tensor
43
+
44
+ class DropPath(nn.Module):
45
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
46
+ """
47
+ def __init__(self, drop_prob: float = 0., scale_by_keep: bool = True):
48
+ super(DropPath, self).__init__()
49
+ self.drop_prob = drop_prob
50
+ self.scale_by_keep = scale_by_keep
51
+
52
+ def forward(self, x):
53
+ return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)
54
+
55
+ def extra_repr(self):
56
+ return f'drop_prob={round(self.drop_prob,3):0.3f}'
57
+
58
+ class Mlp(nn.Module):
59
+ """ MLP as used in Vision Transformer, MLP-Mixer and related networks"""
60
+ def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, bias=True, drop=0.):
61
+ super().__init__()
62
+ out_features = out_features or in_features
63
+ hidden_features = hidden_features or in_features
64
+ bias = to_2tuple(bias)
65
+ drop_probs = to_2tuple(drop)
66
+
67
+ self.fc1 = nn.Linear(in_features, hidden_features, bias=bias[0])
68
+ self.act = act_layer()
69
+ self.drop1 = nn.Dropout(drop_probs[0])
70
+ self.fc2 = nn.Linear(hidden_features, out_features, bias=bias[1])
71
+ self.drop2 = nn.Dropout(drop_probs[1])
72
+
73
+ def forward(self, x):
74
+ x = self.fc1(x)
75
+ x = self.act(x)
76
+ x = self.drop1(x)
77
+ x = self.fc2(x)
78
+ x = self.drop2(x)
79
+ return x
80
+
81
+ class Attention(nn.Module):
82
+
83
+ def __init__(self, dim, rope=None, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.):
84
+ super().__init__()
85
+ self.num_heads = num_heads
86
+ head_dim = dim // num_heads
87
+ self.scale = head_dim ** -0.5
88
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
89
+ self.attn_drop = nn.Dropout(attn_drop)
90
+ self.proj = nn.Linear(dim, dim)
91
+ self.proj_drop = nn.Dropout(proj_drop)
92
+ self.rope = rope
93
+
94
+ def forward(self, x, xpos):
95
+ B, N, C = x.shape
96
+
97
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).transpose(1,3)
98
+ q, k, v = [qkv[:,:,i] for i in range(3)]
99
+ # q,k,v = qkv.unbind(2) # make torchscript happy (cannot use tensor as tuple)
100
+
101
+ if self.rope is not None:
102
+ q = self.rope(q, xpos)
103
+ k = self.rope(k, xpos)
104
+
105
+ attn = (q @ k.transpose(-2, -1)) * self.scale
106
+ attn = attn.softmax(dim=-1)
107
+ attn = self.attn_drop(attn)
108
+
109
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
110
+ x = self.proj(x)
111
+ x = self.proj_drop(x)
112
+ return x
113
+
114
+ class Block(nn.Module):
115
+
116
+ def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0.,
117
+ drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, rope=None):
118
+ super().__init__()
119
+ self.norm1 = norm_layer(dim)
120
+ self.attn = Attention(dim, rope=rope, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop)
121
+ # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
122
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
123
+ self.norm2 = norm_layer(dim)
124
+ mlp_hidden_dim = int(dim * mlp_ratio)
125
+ self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
126
+
127
+ def forward(self, x, xpos):
128
+ x = x + self.drop_path(self.attn(self.norm1(x), xpos))
129
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
130
+ return x
131
+
132
+ class CrossAttention(nn.Module):
133
+
134
+ def __init__(self, dim, rope=None, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.):
135
+ super().__init__()
136
+ self.num_heads = num_heads
137
+ head_dim = dim // num_heads
138
+ self.scale = head_dim ** -0.5
139
+
140
+ self.projq = nn.Linear(dim, dim, bias=qkv_bias)
141
+ self.projk = nn.Linear(dim, dim, bias=qkv_bias)
142
+ self.projv = nn.Linear(dim, dim, bias=qkv_bias)
143
+ self.attn_drop = nn.Dropout(attn_drop)
144
+ self.proj = nn.Linear(dim, dim)
145
+ self.proj_drop = nn.Dropout(proj_drop)
146
+
147
+ self.rope = rope
148
+
149
+ def forward(self, query, key, value, qpos, kpos):
150
+ B, Nq, C = query.shape
151
+ Nk = key.shape[1]
152
+ Nv = value.shape[1]
153
+
154
+ q = self.projq(query).reshape(B,Nq,self.num_heads, C// self.num_heads).permute(0, 2, 1, 3)
155
+ k = self.projk(key).reshape(B,Nk,self.num_heads, C// self.num_heads).permute(0, 2, 1, 3)
156
+ v = self.projv(value).reshape(B,Nv,self.num_heads, C// self.num_heads).permute(0, 2, 1, 3)
157
+
158
+ if self.rope is not None:
159
+ q = self.rope(q, qpos)
160
+ k = self.rope(k, kpos)
161
+
162
+ attn = (q @ k.transpose(-2, -1)) * self.scale
163
+ attn = attn.softmax(dim=-1)
164
+ attn = self.attn_drop(attn)
165
+
166
+ x = (attn @ v).transpose(1, 2).reshape(B, Nq, C)
167
+ x = self.proj(x)
168
+ x = self.proj_drop(x)
169
+ return x
170
+
171
+ class DecoderBlock(nn.Module):
172
+
173
+ def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0.,
174
+ drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, norm_mem=True, rope=None):
175
+ super().__init__()
176
+ self.norm1 = norm_layer(dim)
177
+ self.attn = Attention(dim, rope=rope, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop)
178
+ self.cross_attn = CrossAttention(dim, rope=rope, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop)
179
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
180
+ self.norm2 = norm_layer(dim)
181
+ self.norm3 = norm_layer(dim)
182
+ mlp_hidden_dim = int(dim * mlp_ratio)
183
+ self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
184
+ self.norm_y = norm_layer(dim) if norm_mem else nn.Identity()
185
+
186
+ def forward(self, x, y, xpos, ypos):
187
+ x = x + self.drop_path(self.attn(self.norm1(x), xpos))
188
+ y_ = self.norm_y(y)
189
+ x = x + self.drop_path(self.cross_attn(self.norm2(x), y_, y_, xpos, ypos))
190
+ x = x + self.drop_path(self.mlp(self.norm3(x)))
191
+ return x, y
192
+
193
+
194
+ # patch embedding
195
+ class PositionGetter(object):
196
+ """ return positions of patches """
197
+
198
+ def __init__(self):
199
+ self.cache_positions = {}
200
+
201
+ def __call__(self, b, h, w, device):
202
+ if not (h,w) in self.cache_positions:
203
+ x = torch.arange(w, device=device)
204
+ y = torch.arange(h, device=device)
205
+ self.cache_positions[h,w] = torch.cartesian_prod(y, x) # (h, w, 2)
206
+ pos = self.cache_positions[h,w].view(1, h*w, 2).expand(b, -1, 2).clone()
207
+ return pos
208
+
209
+ class PatchEmbed(nn.Module):
210
+ """ just adding _init_weights + position getter compared to timm.models.layers.patch_embed.PatchEmbed"""
211
+
212
+ def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, norm_layer=None, flatten=True):
213
+ super().__init__()
214
+ img_size = to_2tuple(img_size)
215
+ patch_size = to_2tuple(patch_size)
216
+ self.img_size = img_size
217
+ self.patch_size = patch_size
218
+ self.grid_size = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])
219
+ self.num_patches = self.grid_size[0] * self.grid_size[1]
220
+ self.flatten = flatten
221
+
222
+ self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
223
+ self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
224
+
225
+ self.position_getter = PositionGetter()
226
+
227
+ def forward(self, x):
228
+ B, C, H, W = x.shape
229
+ torch._assert(H == self.img_size[0], f"Input image height ({H}) doesn't match model ({self.img_size[0]}).")
230
+ torch._assert(W == self.img_size[1], f"Input image width ({W}) doesn't match model ({self.img_size[1]}).")
231
+ x = self.proj(x)
232
+ pos = self.position_getter(B, x.size(2), x.size(3), x.device)
233
+ if self.flatten:
234
+ x = x.flatten(2).transpose(1, 2) # BCHW -> BNC
235
+ x = self.norm(x)
236
+ return x, pos
237
+
238
+ def _init_weights(self):
239
+ w = self.proj.weight.data
240
+ torch.nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
241
+
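Assuming the package is importable as laid out in this commit, the encoder and decoder blocks can be exercised on random tokens; xpos/ypos are only consumed when a RoPE embedding is passed, so None works here (the dimensions are illustrative):

import torch
from mini_dust3r.croco.blocks import Block, DecoderBlock

blk = Block(dim=64, num_heads=4)
x = torch.randn(2, 16, 64)                 # (batch, tokens, dim)
print(blk(x, xpos=None).shape)             # torch.Size([2, 16, 64])

dec = DecoderBlock(dim=64, num_heads=4)
y = torch.randn(2, 16, 64)                 # 'memory' tokens from the other image
out, _ = dec(x, y, xpos=None, ypos=None)
print(out.shape)                           # torch.Size([2, 16, 64])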
mini_dust3r/croco/croco.py ADDED
@@ -0,0 +1,249 @@
1
+ # Copyright (C) 2022-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+
4
+
5
+ # --------------------------------------------------------
6
+ # CroCo model during pretraining
7
+ # --------------------------------------------------------
8
+
9
+
10
+
11
+ import torch
12
+ import torch.nn as nn
13
+ torch.backends.cuda.matmul.allow_tf32 = True # for gpu >= Ampere and pytorch >= 1.12
14
+ from functools import partial
15
+
16
+ from mini_dust3r.croco.blocks import Block, DecoderBlock, PatchEmbed
17
+ from mini_dust3r.croco.pos_embed import get_2d_sincos_pos_embed, RoPE2D
18
+ from mini_dust3r.croco.masking import RandomMask
19
+
20
+
21
+ class CroCoNet(nn.Module):
22
+
23
+ def __init__(self,
24
+ img_size=224, # input image size
25
+ patch_size=16, # patch_size
26
+ mask_ratio=0.9, # ratios of masked tokens
27
+ enc_embed_dim=768, # encoder feature dimension
28
+ enc_depth=12, # encoder depth
29
+ enc_num_heads=12, # encoder number of heads in the transformer block
30
+ dec_embed_dim=512, # decoder feature dimension
31
+ dec_depth=8, # decoder depth
32
+ dec_num_heads=16, # decoder number of heads in the transformer block
33
+ mlp_ratio=4,
34
+ norm_layer=partial(nn.LayerNorm, eps=1e-6),
35
+ norm_im2_in_dec=True, # whether to apply normalization of the 'memory' = (second image) in the decoder
36
+ pos_embed='cosine', # positional embedding (either cosine or RoPE100)
37
+ ):
38
+
39
+ super(CroCoNet, self).__init__()
40
+
41
+ # patch embeddings (with initialization done as in MAE)
42
+ self._set_patch_embed(img_size, patch_size, enc_embed_dim)
43
+
44
+ # mask generations
45
+ self._set_mask_generator(self.patch_embed.num_patches, mask_ratio)
46
+
47
+ self.pos_embed = pos_embed
48
+ if pos_embed=='cosine':
49
+ # positional embedding of the encoder
50
+ enc_pos_embed = get_2d_sincos_pos_embed(enc_embed_dim, int(self.patch_embed.num_patches**.5), n_cls_token=0)
51
+ self.register_buffer('enc_pos_embed', torch.from_numpy(enc_pos_embed).float())
52
+ # positional embedding of the decoder
53
+ dec_pos_embed = get_2d_sincos_pos_embed(dec_embed_dim, int(self.patch_embed.num_patches**.5), n_cls_token=0)
54
+ self.register_buffer('dec_pos_embed', torch.from_numpy(dec_pos_embed).float())
55
+ # pos embedding in each block
56
+ self.rope = None # nothing for cosine
57
+ elif pos_embed.startswith('RoPE'): # eg RoPE100
58
+ self.enc_pos_embed = None # nothing to add in the encoder with RoPE
59
+ self.dec_pos_embed = None # nothing to add in the decoder with RoPE
60
+ if RoPE2D is None: raise ImportError("Cannot find cuRoPE2D, please install it following the README instructions")
61
+ freq = float(pos_embed[len('RoPE'):])
62
+ self.rope = RoPE2D(freq=freq)
63
+ else:
64
+ raise NotImplementedError('Unknown pos_embed '+pos_embed)
65
+
66
+ # transformer for the encoder
67
+ self.enc_depth = enc_depth
68
+ self.enc_embed_dim = enc_embed_dim
69
+ self.enc_blocks = nn.ModuleList([
70
+ Block(enc_embed_dim, enc_num_heads, mlp_ratio, qkv_bias=True, norm_layer=norm_layer, rope=self.rope)
71
+ for i in range(enc_depth)])
72
+ self.enc_norm = norm_layer(enc_embed_dim)
73
+
74
+ # masked tokens
75
+ self._set_mask_token(dec_embed_dim)
76
+
77
+ # decoder
78
+ self._set_decoder(enc_embed_dim, dec_embed_dim, dec_num_heads, dec_depth, mlp_ratio, norm_layer, norm_im2_in_dec)
79
+
80
+ # prediction head
81
+ self._set_prediction_head(dec_embed_dim, patch_size)
82
+
83
+ # initializer weights
84
+ self.initialize_weights()
85
+
86
+ def _set_patch_embed(self, img_size=224, patch_size=16, enc_embed_dim=768):
87
+ self.patch_embed = PatchEmbed(img_size, patch_size, 3, enc_embed_dim)
88
+
89
+ def _set_mask_generator(self, num_patches, mask_ratio):
90
+ self.mask_generator = RandomMask(num_patches, mask_ratio)
91
+
92
+ def _set_mask_token(self, dec_embed_dim):
93
+ self.mask_token = nn.Parameter(torch.zeros(1, 1, dec_embed_dim))
94
+
95
+ def _set_decoder(self, enc_embed_dim, dec_embed_dim, dec_num_heads, dec_depth, mlp_ratio, norm_layer, norm_im2_in_dec):
96
+ self.dec_depth = dec_depth
97
+ self.dec_embed_dim = dec_embed_dim
98
+ # transfer from encoder to decoder
99
+ self.decoder_embed = nn.Linear(enc_embed_dim, dec_embed_dim, bias=True)
100
+ # transformer for the decoder
101
+ self.dec_blocks = nn.ModuleList([
102
+ DecoderBlock(dec_embed_dim, dec_num_heads, mlp_ratio=mlp_ratio, qkv_bias=True, norm_layer=norm_layer, norm_mem=norm_im2_in_dec, rope=self.rope)
103
+ for i in range(dec_depth)])
104
+ # final norm layer
105
+ self.dec_norm = norm_layer(dec_embed_dim)
106
+
107
+ def _set_prediction_head(self, dec_embed_dim, patch_size):
108
+ self.prediction_head = nn.Linear(dec_embed_dim, patch_size**2 * 3, bias=True)
109
+
110
+
111
+ def initialize_weights(self):
112
+ # patch embed
113
+ self.patch_embed._init_weights()
114
+ # mask tokens
115
+ if self.mask_token is not None: torch.nn.init.normal_(self.mask_token, std=.02)
116
+ # linears and layer norms
117
+ self.apply(self._init_weights)
118
+
119
+ def _init_weights(self, m):
120
+ if isinstance(m, nn.Linear):
121
+ # we use xavier_uniform following official JAX ViT:
122
+ torch.nn.init.xavier_uniform_(m.weight)
123
+ if isinstance(m, nn.Linear) and m.bias is not None:
124
+ nn.init.constant_(m.bias, 0)
125
+ elif isinstance(m, nn.LayerNorm):
126
+ nn.init.constant_(m.bias, 0)
127
+ nn.init.constant_(m.weight, 1.0)
128
+
129
+ def _encode_image(self, image, do_mask=False, return_all_blocks=False):
130
+ """
131
+ image has B x 3 x img_size x img_size
132
+ do_mask: whether to perform masking or not
133
+ return_all_blocks: if True, return the features at the end of every block
134
+ instead of just the features from the last block (eg for some prediction heads)
135
+ """
136
+ # embed the image into patches (x has size B x Npatches x C)
137
+ # and get position if each return patch (pos has size B x Npatches x 2)
138
+ x, pos = self.patch_embed(image)
139
+ # add positional embedding without cls token
140
+ if self.enc_pos_embed is not None:
141
+ x = x + self.enc_pos_embed[None,...]
142
+ # apply masking
143
+ B,N,C = x.size()
144
+ if do_mask:
145
+ masks = self.mask_generator(x)
146
+ x = x[~masks].view(B, -1, C)
147
+ posvis = pos[~masks].view(B, -1, 2)
148
+ else:
149
+ B,N,C = x.size()
150
+ masks = torch.zeros((B,N), dtype=bool)
151
+ posvis = pos
152
+ # now apply the transformer encoder and normalization
153
+ if return_all_blocks:
154
+ out = []
155
+ for blk in self.enc_blocks:
156
+ x = blk(x, posvis)
157
+ out.append(x)
158
+ out[-1] = self.enc_norm(out[-1])
159
+ return out, pos, masks
160
+ else:
161
+ for blk in self.enc_blocks:
162
+ x = blk(x, posvis)
163
+ x = self.enc_norm(x)
164
+ return x, pos, masks
165
+
166
+ def _decoder(self, feat1, pos1, masks1, feat2, pos2, return_all_blocks=False):
167
+ """
168
+ return_all_blocks: if True, return the features at the end of every block
169
+ instead of just the features from the last block (eg for some prediction heads)
170
+
171
+ masks1 can be None => assume image1 fully visible
172
+ """
173
+ # encoder to decoder layer
174
+ visf1 = self.decoder_embed(feat1)
175
+ f2 = self.decoder_embed(feat2)
176
+ # append masked tokens to the sequence
177
+ B,Nenc,C = visf1.size()
178
+ if masks1 is None: # downstreams
179
+ f1_ = visf1
180
+ else: # pretraining
181
+ Ntotal = masks1.size(1)
182
+ f1_ = self.mask_token.repeat(B, Ntotal, 1).to(dtype=visf1.dtype)
183
+ f1_[~masks1] = visf1.view(B * Nenc, C)
184
+ # add positional embedding
185
+ if self.dec_pos_embed is not None:
186
+ f1_ = f1_ + self.dec_pos_embed
187
+ f2 = f2 + self.dec_pos_embed
188
+ # apply Transformer blocks
189
+ out = f1_
190
+ out2 = f2
191
+ if return_all_blocks:
192
+ _out, out = out, []
193
+ for blk in self.dec_blocks:
194
+ _out, out2 = blk(_out, out2, pos1, pos2)
195
+ out.append(_out)
196
+ out[-1] = self.dec_norm(out[-1])
197
+ else:
198
+ for blk in self.dec_blocks:
199
+ out, out2 = blk(out, out2, pos1, pos2)
200
+ out = self.dec_norm(out)
201
+ return out
202
+
203
+ def patchify(self, imgs):
204
+ """
205
+ imgs: (B, 3, H, W)
206
+ x: (B, L, patch_size**2 *3)
207
+ """
208
+ p = self.patch_embed.patch_size[0]
209
+ assert imgs.shape[2] == imgs.shape[3] and imgs.shape[2] % p == 0
210
+
211
+ h = w = imgs.shape[2] // p
212
+ x = imgs.reshape(shape=(imgs.shape[0], 3, h, p, w, p))
213
+ x = torch.einsum('nchpwq->nhwpqc', x)
214
+ x = x.reshape(shape=(imgs.shape[0], h * w, p**2 * 3))
215
+
216
+ return x
217
+
218
+ def unpatchify(self, x, channels=3):
219
+ """
220
+ x: (N, L, patch_size**2 *channels)
221
+ imgs: (N, 3, H, W)
222
+ """
223
+ patch_size = self.patch_embed.patch_size[0]
224
+ h = w = int(x.shape[1]**.5)
225
+ assert h * w == x.shape[1]
226
+ x = x.reshape(shape=(x.shape[0], h, w, patch_size, patch_size, channels))
227
+ x = torch.einsum('nhwpqc->nchpwq', x)
228
+ imgs = x.reshape(shape=(x.shape[0], channels, h * patch_size, h * patch_size))
229
+ return imgs
230
+
231
+ def forward(self, img1, img2):
232
+ """
233
+ img1: tensor of size B x 3 x img_size x img_size
234
+ img2: tensor of size B x 3 x img_size x img_size
235
+
236
+ out will be B x N x (3*patch_size*patch_size)
237
+ masks are also returned as B x N just in case
238
+ """
239
+ # encoder of the masked first image
240
+ feat1, pos1, mask1 = self._encode_image(img1, do_mask=True)
241
+ # encoder of the second image
242
+ feat2, pos2, _ = self._encode_image(img2, do_mask=False)
243
+ # decoder
244
+ decfeat = self._decoder(feat1, pos1, mask1, feat2, pos2)
245
+ # prediction head
246
+ out = self.prediction_head(decfeat)
247
+ # get target
248
+ target = self.patchify(img1)
249
+ return out, mask1, target
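Editorial note: a minimal standalone sketch (not part of this commit) that mirrors the patchify/unpatchify pair defined above; it assumes only torch and checks that the two reshapes are exact inverses for square images whose side is a multiple of the patch size.

import torch

def patchify(imgs, p):
    # imgs: (B, 3, H, W) with H == W and H % p == 0, mirroring the patchify method above
    h = w = imgs.shape[2] // p
    x = imgs.reshape(imgs.shape[0], 3, h, p, w, p)
    x = torch.einsum('nchpwq->nhwpqc', x)
    return x.reshape(imgs.shape[0], h * w, p**2 * 3)

def unpatchify(x, p, channels=3):
    # x: (B, L, p*p*channels) -> (B, channels, H, W), mirroring the unpatchify method above
    h = w = int(x.shape[1] ** .5)
    x = x.reshape(x.shape[0], h, w, p, p, channels)
    x = torch.einsum('nhwpqc->nchpwq', x)
    return x.reshape(x.shape[0], channels, h * p, h * p)

imgs = torch.randn(2, 3, 224, 224)
tokens = patchify(imgs, p=16)                       # (2, 196, 768)
assert torch.equal(unpatchify(tokens, p=16), imgs)  # lossless round trip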
mini_dust3r/croco/dpt_block.py ADDED
@@ -0,0 +1,450 @@
1
+ # Copyright (C) 2022-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+
4
+ # --------------------------------------------------------
5
+ # DPT head for ViTs
6
+ # --------------------------------------------------------
7
+ # References:
8
+ # https://github.com/isl-org/DPT
9
+ # https://github.com/EPFL-VILAB/MultiMAE/blob/main/multimae/output_adapters.py
10
+
11
+ import torch
12
+ import torch.nn as nn
13
+ import torch.nn.functional as F
14
+ from einops import rearrange, repeat
15
+ from typing import Union, Tuple, Iterable, List, Optional, Dict
16
+
17
+ def pair(t):
18
+ return t if isinstance(t, tuple) else (t, t)
19
+
20
+ def make_scratch(in_shape, out_shape, groups=1, expand=False):
21
+ scratch = nn.Module()
22
+
23
+ out_shape1 = out_shape
24
+ out_shape2 = out_shape
25
+ out_shape3 = out_shape
26
+ out_shape4 = out_shape
27
+ if expand == True:
28
+ out_shape1 = out_shape
29
+ out_shape2 = out_shape * 2
30
+ out_shape3 = out_shape * 4
31
+ out_shape4 = out_shape * 8
32
+
33
+ scratch.layer1_rn = nn.Conv2d(
34
+ in_shape[0],
35
+ out_shape1,
36
+ kernel_size=3,
37
+ stride=1,
38
+ padding=1,
39
+ bias=False,
40
+ groups=groups,
41
+ )
42
+ scratch.layer2_rn = nn.Conv2d(
43
+ in_shape[1],
44
+ out_shape2,
45
+ kernel_size=3,
46
+ stride=1,
47
+ padding=1,
48
+ bias=False,
49
+ groups=groups,
50
+ )
51
+ scratch.layer3_rn = nn.Conv2d(
52
+ in_shape[2],
53
+ out_shape3,
54
+ kernel_size=3,
55
+ stride=1,
56
+ padding=1,
57
+ bias=False,
58
+ groups=groups,
59
+ )
60
+ scratch.layer4_rn = nn.Conv2d(
61
+ in_shape[3],
62
+ out_shape4,
63
+ kernel_size=3,
64
+ stride=1,
65
+ padding=1,
66
+ bias=False,
67
+ groups=groups,
68
+ )
69
+
70
+ scratch.layer_rn = nn.ModuleList([
71
+ scratch.layer1_rn,
72
+ scratch.layer2_rn,
73
+ scratch.layer3_rn,
74
+ scratch.layer4_rn,
75
+ ])
76
+
77
+ return scratch
78
+
79
+ class ResidualConvUnit_custom(nn.Module):
80
+ """Residual convolution module."""
81
+
82
+ def __init__(self, features, activation, bn):
83
+ """Init.
84
+ Args:
85
+ features (int): number of features
86
+ """
87
+ super().__init__()
88
+
89
+ self.bn = bn
90
+
91
+ self.groups = 1
92
+
93
+ self.conv1 = nn.Conv2d(
94
+ features,
95
+ features,
96
+ kernel_size=3,
97
+ stride=1,
98
+ padding=1,
99
+ bias=not self.bn,
100
+ groups=self.groups,
101
+ )
102
+
103
+ self.conv2 = nn.Conv2d(
104
+ features,
105
+ features,
106
+ kernel_size=3,
107
+ stride=1,
108
+ padding=1,
109
+ bias=not self.bn,
110
+ groups=self.groups,
111
+ )
112
+
113
+ if self.bn == True:
114
+ self.bn1 = nn.BatchNorm2d(features)
115
+ self.bn2 = nn.BatchNorm2d(features)
116
+
117
+ self.activation = activation
118
+
119
+ self.skip_add = nn.quantized.FloatFunctional()
120
+
121
+ def forward(self, x):
122
+ """Forward pass.
123
+ Args:
124
+ x (tensor): input
125
+ Returns:
126
+ tensor: output
127
+ """
128
+
129
+ out = self.activation(x)
130
+ out = self.conv1(out)
131
+ if self.bn == True:
132
+ out = self.bn1(out)
133
+
134
+ out = self.activation(out)
135
+ out = self.conv2(out)
136
+ if self.bn == True:
137
+ out = self.bn2(out)
138
+
139
+ if self.groups > 1:
140
+ out = self.conv_merge(out)
141
+
142
+ return self.skip_add.add(out, x)
143
+
144
+ class FeatureFusionBlock_custom(nn.Module):
145
+ """Feature fusion block."""
146
+
147
+ def __init__(
148
+ self,
149
+ features,
150
+ activation,
151
+ deconv=False,
152
+ bn=False,
153
+ expand=False,
154
+ align_corners=True,
155
+ width_ratio=1,
156
+ ):
157
+ """Init.
158
+ Args:
159
+ features (int): number of features
160
+ """
161
+ super(FeatureFusionBlock_custom, self).__init__()
162
+ self.width_ratio = width_ratio
163
+
164
+ self.deconv = deconv
165
+ self.align_corners = align_corners
166
+
167
+ self.groups = 1
168
+
169
+ self.expand = expand
170
+ out_features = features
171
+ if self.expand == True:
172
+ out_features = features // 2
173
+
174
+ self.out_conv = nn.Conv2d(
175
+ features,
176
+ out_features,
177
+ kernel_size=1,
178
+ stride=1,
179
+ padding=0,
180
+ bias=True,
181
+ groups=1,
182
+ )
183
+
184
+ self.resConfUnit1 = ResidualConvUnit_custom(features, activation, bn)
185
+ self.resConfUnit2 = ResidualConvUnit_custom(features, activation, bn)
186
+
187
+ self.skip_add = nn.quantized.FloatFunctional()
188
+
189
+ def forward(self, *xs):
190
+ """Forward pass.
191
+ Returns:
192
+ tensor: output
193
+ """
194
+ output = xs[0]
195
+
196
+ if len(xs) == 2:
197
+ res = self.resConfUnit1(xs[1])
198
+ if self.width_ratio != 1:
199
+ res = F.interpolate(res, size=(output.shape[2], output.shape[3]), mode='bilinear')
200
+
201
+ output = self.skip_add.add(output, res)
202
+ # output += res
203
+
204
+ output = self.resConfUnit2(output)
205
+
206
+ if self.width_ratio != 1:
207
+ # and output.shape[3] < self.width_ratio * output.shape[2]
208
+ #size=(image.shape[])
209
+ if (output.shape[3] / output.shape[2]) < (2 / 3) * self.width_ratio:
210
+ shape = 3 * output.shape[3]
211
+ else:
212
+ shape = int(self.width_ratio * 2 * output.shape[2])
213
+ output = F.interpolate(output, size=(2* output.shape[2], shape), mode='bilinear')
214
+ else:
215
+ output = nn.functional.interpolate(output, scale_factor=2,
216
+ mode="bilinear", align_corners=self.align_corners)
217
+ output = self.out_conv(output)
218
+ return output
219
+
220
+ def make_fusion_block(features, use_bn, width_ratio=1):
221
+ return FeatureFusionBlock_custom(
222
+ features,
223
+ nn.ReLU(False),
224
+ deconv=False,
225
+ bn=use_bn,
226
+ expand=False,
227
+ align_corners=True,
228
+ width_ratio=width_ratio,
229
+ )
230
+
231
+ class Interpolate(nn.Module):
232
+ """Interpolation module."""
233
+
234
+ def __init__(self, scale_factor, mode, align_corners=False):
235
+ """Init.
236
+ Args:
237
+ scale_factor (float): scaling
238
+ mode (str): interpolation mode
239
+ """
240
+ super(Interpolate, self).__init__()
241
+
242
+ self.interp = nn.functional.interpolate
243
+ self.scale_factor = scale_factor
244
+ self.mode = mode
245
+ self.align_corners = align_corners
246
+
247
+ def forward(self, x):
248
+ """Forward pass.
249
+ Args:
250
+ x (tensor): input
251
+ Returns:
252
+ tensor: interpolated data
253
+ """
254
+
255
+ x = self.interp(
256
+ x,
257
+ scale_factor=self.scale_factor,
258
+ mode=self.mode,
259
+ align_corners=self.align_corners,
260
+ )
261
+
262
+ return x
263
+
264
+ class DPTOutputAdapter(nn.Module):
265
+ """DPT output adapter.
266
+
267
+ :param num_channels: Number of output channels
268
+ :param stride_level: Stride level compared to the full-sized image.
269
+ E.g. 4 for 1/4th the size of the image.
270
+ :param patch_size_full: Int or tuple of the patch size over the full image size.
271
+ Patch size for smaller inputs will be computed accordingly.
272
+ :param hooks: Index of intermediate layers
273
+ :param layer_dims: Dimension of intermediate layers
274
+ :param feature_dim: Feature dimension
275
+ :param last_dim: out_channels/in_channels for the last two Conv2d when head_type == regression
276
+ :param use_bn: If set to True, activates batch norm
277
+ :param dim_tokens_enc: Dimension of tokens coming from encoder
278
+ """
279
+
280
+ def __init__(self,
281
+ num_channels: int = 1,
282
+ stride_level: int = 1,
283
+ patch_size: Union[int, Tuple[int, int]] = 16,
284
+ main_tasks: Iterable[str] = ('rgb',),
285
+ hooks: List[int] = [2, 5, 8, 11],
286
+ layer_dims: List[int] = [96, 192, 384, 768],
287
+ feature_dim: int = 256,
288
+ last_dim: int = 32,
289
+ use_bn: bool = False,
290
+ dim_tokens_enc: Optional[int] = None,
291
+ head_type: str = 'regression',
292
+ output_width_ratio=1,
293
+ **kwargs):
294
+ super().__init__()
295
+ self.num_channels = num_channels
296
+ self.stride_level = stride_level
297
+ self.patch_size = pair(patch_size)
298
+ self.main_tasks = main_tasks
299
+ self.hooks = hooks
300
+ self.layer_dims = layer_dims
301
+ self.feature_dim = feature_dim
302
+ self.dim_tokens_enc = dim_tokens_enc * len(self.main_tasks) if dim_tokens_enc is not None else None
303
+ self.head_type = head_type
304
+
305
+ # Actual patch height and width, taking into account stride of input
306
+ self.P_H = max(1, self.patch_size[0] // stride_level)
307
+ self.P_W = max(1, self.patch_size[1] // stride_level)
308
+
309
+ self.scratch = make_scratch(layer_dims, feature_dim, groups=1, expand=False)
310
+
311
+ self.scratch.refinenet1 = make_fusion_block(feature_dim, use_bn, output_width_ratio)
312
+ self.scratch.refinenet2 = make_fusion_block(feature_dim, use_bn, output_width_ratio)
313
+ self.scratch.refinenet3 = make_fusion_block(feature_dim, use_bn, output_width_ratio)
314
+ self.scratch.refinenet4 = make_fusion_block(feature_dim, use_bn, output_width_ratio)
315
+
316
+ if self.head_type == 'regression':
317
+ # The "DPTDepthModel" head
318
+ self.head = nn.Sequential(
319
+ nn.Conv2d(feature_dim, feature_dim // 2, kernel_size=3, stride=1, padding=1),
320
+ Interpolate(scale_factor=2, mode="bilinear", align_corners=True),
321
+ nn.Conv2d(feature_dim // 2, last_dim, kernel_size=3, stride=1, padding=1),
322
+ nn.ReLU(True),
323
+ nn.Conv2d(last_dim, self.num_channels, kernel_size=1, stride=1, padding=0)
324
+ )
325
+ elif self.head_type == 'semseg':
326
+ # The "DPTSegmentationModel" head
327
+ self.head = nn.Sequential(
328
+ nn.Conv2d(feature_dim, feature_dim, kernel_size=3, padding=1, bias=False),
329
+ nn.BatchNorm2d(feature_dim) if use_bn else nn.Identity(),
330
+ nn.ReLU(True),
331
+ nn.Dropout(0.1, False),
332
+ nn.Conv2d(feature_dim, self.num_channels, kernel_size=1),
333
+ Interpolate(scale_factor=2, mode="bilinear", align_corners=True),
334
+ )
335
+ else:
336
+ raise ValueError('DPT head_type must be "regression" or "semseg".')
337
+
338
+ if self.dim_tokens_enc is not None:
339
+ self.init(dim_tokens_enc=dim_tokens_enc)
340
+
341
+ def init(self, dim_tokens_enc=768):
342
+ """
343
+ Initialize parts of decoder that are dependent on dimension of encoder tokens.
344
+ Should be called when setting up MultiMAE.
345
+
346
+ :param dim_tokens_enc: Dimension of tokens coming from encoder
347
+ """
348
+ #print(dim_tokens_enc)
349
+
350
+ # Set up activation postprocessing layers
351
+ if isinstance(dim_tokens_enc, int):
352
+ dim_tokens_enc = 4 * [dim_tokens_enc]
353
+
354
+ self.dim_tokens_enc = [dt * len(self.main_tasks) for dt in dim_tokens_enc]
355
+
356
+ self.act_1_postprocess = nn.Sequential(
357
+ nn.Conv2d(
358
+ in_channels=self.dim_tokens_enc[0],
359
+ out_channels=self.layer_dims[0],
360
+ kernel_size=1, stride=1, padding=0,
361
+ ),
362
+ nn.ConvTranspose2d(
363
+ in_channels=self.layer_dims[0],
364
+ out_channels=self.layer_dims[0],
365
+ kernel_size=4, stride=4, padding=0,
366
+ bias=True, dilation=1, groups=1,
367
+ )
368
+ )
369
+
370
+ self.act_2_postprocess = nn.Sequential(
371
+ nn.Conv2d(
372
+ in_channels=self.dim_tokens_enc[1],
373
+ out_channels=self.layer_dims[1],
374
+ kernel_size=1, stride=1, padding=0,
375
+ ),
376
+ nn.ConvTranspose2d(
377
+ in_channels=self.layer_dims[1],
378
+ out_channels=self.layer_dims[1],
379
+ kernel_size=2, stride=2, padding=0,
380
+ bias=True, dilation=1, groups=1,
381
+ )
382
+ )
383
+
384
+ self.act_3_postprocess = nn.Sequential(
385
+ nn.Conv2d(
386
+ in_channels=self.dim_tokens_enc[2],
387
+ out_channels=self.layer_dims[2],
388
+ kernel_size=1, stride=1, padding=0,
389
+ )
390
+ )
391
+
392
+ self.act_4_postprocess = nn.Sequential(
393
+ nn.Conv2d(
394
+ in_channels=self.dim_tokens_enc[3],
395
+ out_channels=self.layer_dims[3],
396
+ kernel_size=1, stride=1, padding=0,
397
+ ),
398
+ nn.Conv2d(
399
+ in_channels=self.layer_dims[3],
400
+ out_channels=self.layer_dims[3],
401
+ kernel_size=3, stride=2, padding=1,
402
+ )
403
+ )
404
+
405
+ self.act_postprocess = nn.ModuleList([
406
+ self.act_1_postprocess,
407
+ self.act_2_postprocess,
408
+ self.act_3_postprocess,
409
+ self.act_4_postprocess
410
+ ])
411
+
412
+ def adapt_tokens(self, encoder_tokens):
413
+ # Adapt tokens
414
+ x = []
415
+ x.append(encoder_tokens[:, :])
416
+ x = torch.cat(x, dim=-1)
417
+ return x
418
+
419
+ def forward(self, encoder_tokens: List[torch.Tensor], image_size):
420
+ #input_info: Dict):
421
+ assert self.dim_tokens_enc is not None, 'Need to call init(dim_tokens_enc) function first'
422
+ H, W = image_size
423
+
424
+ # Number of patches in height and width
425
+ N_H = H // (self.stride_level * self.P_H)
426
+ N_W = W // (self.stride_level * self.P_W)
427
+
428
+ # Hook decoder onto 4 layers from specified ViT layers
429
+ layers = [encoder_tokens[hook] for hook in self.hooks]
430
+
431
+ # Extract only task-relevant tokens and ignore global tokens.
432
+ layers = [self.adapt_tokens(l) for l in layers]
433
+
434
+ # Reshape tokens to spatial representation
435
+ layers = [rearrange(l, 'b (nh nw) c -> b c nh nw', nh=N_H, nw=N_W) for l in layers]
436
+
437
+ layers = [self.act_postprocess[idx](l) for idx, l in enumerate(layers)]
438
+ # Project layers to chosen feature dim
439
+ layers = [self.scratch.layer_rn[idx](l) for idx, l in enumerate(layers)]
440
+
441
+ # Fuse layers using refinement stages
442
+ path_4 = self.scratch.refinenet4(layers[3])
443
+ path_3 = self.scratch.refinenet3(path_4, layers[2])
444
+ path_2 = self.scratch.refinenet2(path_3, layers[1])
445
+ path_1 = self.scratch.refinenet1(path_2, layers[0])
446
+
447
+ # Output head
448
+ out = self.head(path_1)
449
+
450
+ return out
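Editorial note: a standalone sketch (not part of this commit) of the token-to-feature-map reshaping that DPTOutputAdapter.forward performs before the refinement stages; it assumes torch and einops are available and uses illustrative sizes.

import torch
from einops import rearrange

H, W, patch_size, dim = 224, 224, 16, 768
N_H, N_W = H // patch_size, W // patch_size      # 14 x 14 patch grid
tokens = torch.randn(1, N_H * N_W, dim)          # (B, N, C) tokens from one hooked layer
fmap = rearrange(tokens, 'b (nh nw) c -> b c nh nw', nh=N_H, nw=N_W)
print(fmap.shape)                                # torch.Size([1, 768, 14, 14])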
mini_dust3r/croco/masking.py ADDED
@@ -0,0 +1,25 @@
1
+ # Copyright (C) 2022-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+
4
+
5
+ # --------------------------------------------------------
6
+ # Masking utils
7
+ # --------------------------------------------------------
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+
12
+ class RandomMask(nn.Module):
13
+ """
14
+ random masking
15
+ """
16
+
17
+ def __init__(self, num_patches, mask_ratio):
18
+ super().__init__()
19
+ self.num_patches = num_patches
20
+ self.num_mask = int(mask_ratio * self.num_patches)
21
+
22
+ def __call__(self, x):
23
+ noise = torch.rand(x.size(0), self.num_patches, device=x.device)
24
+ argsort = torch.argsort(noise, dim=1)
25
+ return argsort < self.num_mask
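Editorial note: a standalone sketch (not part of this commit) of the argsort trick used by RandomMask above; each sample gets exactly int(mask_ratio * num_patches) masked positions.

import torch

num_patches, mask_ratio = 196, 0.9
num_mask = int(mask_ratio * num_patches)      # 176
noise = torch.rand(4, num_patches)            # one noise vector per sample
masks = torch.argsort(noise, dim=1) < num_mask
print(masks.shape, masks.sum(dim=1))          # torch.Size([4, 196]) tensor([176, 176, 176, 176])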
mini_dust3r/croco/pos_embed.py ADDED
@@ -0,0 +1,159 @@
1
+ # Copyright (C) 2022-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+
4
+
5
+ # --------------------------------------------------------
6
+ # Position embedding utils
7
+ # --------------------------------------------------------
8
+
9
+
10
+
11
+ import numpy as np
12
+
13
+ import torch
14
+
15
+ # --------------------------------------------------------
16
+ # 2D sine-cosine position embedding
17
+ # References:
18
+ # MAE: https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py
19
+ # Transformer: https://github.com/tensorflow/models/blob/master/official/nlp/transformer/model_utils.py
20
+ # MoCo v3: https://github.com/facebookresearch/moco-v3
21
+ # --------------------------------------------------------
22
+ def get_2d_sincos_pos_embed(embed_dim, grid_size, n_cls_token=0):
23
+ """
24
+ grid_size: int of the grid height and width
25
+ return:
26
+ pos_embed: [grid_size*grid_size, embed_dim] or [n_cls_token+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
27
+ """
28
+ grid_h = np.arange(grid_size, dtype=np.float32)
29
+ grid_w = np.arange(grid_size, dtype=np.float32)
30
+ grid = np.meshgrid(grid_w, grid_h) # here w goes first
31
+ grid = np.stack(grid, axis=0)
32
+
33
+ grid = grid.reshape([2, 1, grid_size, grid_size])
34
+ pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
35
+ if n_cls_token>0:
36
+ pos_embed = np.concatenate([np.zeros([n_cls_token, embed_dim]), pos_embed], axis=0)
37
+ return pos_embed
38
+
39
+
40
+ def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
41
+ assert embed_dim % 2 == 0
42
+
43
+ # use half of dimensions to encode grid_h
44
+ emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
45
+ emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)
46
+
47
+ emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
48
+ return emb
49
+
50
+
51
+ def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
52
+ """
53
+ embed_dim: output dimension for each position
54
+ pos: a list of positions to be encoded: size (M,)
55
+ out: (M, D)
56
+ """
57
+ assert embed_dim % 2 == 0
58
+ omega = np.arange(embed_dim // 2, dtype=float)
59
+ omega /= embed_dim / 2.
60
+ omega = 1. / 10000**omega # (D/2,)
61
+
62
+ pos = pos.reshape(-1) # (M,)
63
+ out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product
64
+
65
+ emb_sin = np.sin(out) # (M, D/2)
66
+ emb_cos = np.cos(out) # (M, D/2)
67
+
68
+ emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
69
+ return emb
70
+
71
+
72
+ # --------------------------------------------------------
73
+ # Interpolate position embeddings for high-resolution
74
+ # References:
75
+ # MAE: https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py
76
+ # DeiT: https://github.com/facebookresearch/deit
77
+ # --------------------------------------------------------
78
+ def interpolate_pos_embed(model, checkpoint_model):
79
+ if 'pos_embed' in checkpoint_model:
80
+ pos_embed_checkpoint = checkpoint_model['pos_embed']
81
+ embedding_size = pos_embed_checkpoint.shape[-1]
82
+ num_patches = model.patch_embed.num_patches
83
+ num_extra_tokens = model.pos_embed.shape[-2] - num_patches
84
+ # height (== width) for the checkpoint position embedding
85
+ orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5)
86
+ # height (== width) for the new position embedding
87
+ new_size = int(num_patches ** 0.5)
88
+ # class_token and dist_token are kept unchanged
89
+ if orig_size != new_size:
90
+ print("Position interpolate from %dx%d to %dx%d" % (orig_size, orig_size, new_size, new_size))
91
+ extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
92
+ # only the position tokens are interpolated
93
+ pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
94
+ pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2)
95
+ pos_tokens = torch.nn.functional.interpolate(
96
+ pos_tokens, size=(new_size, new_size), mode='bicubic', align_corners=False)
97
+ pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
98
+ new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
99
+ checkpoint_model['pos_embed'] = new_pos_embed
100
+
101
+
102
+ #----------------------------------------------------------
103
+ # RoPE2D: RoPE implementation in 2D
104
+ #----------------------------------------------------------
105
+
106
+ try:
107
+ from mini_dust3r.croco.curope import cuRoPE2D
108
+ RoPE2D = cuRoPE2D
109
+ except ImportError:
110
+ print('Warning, cannot find cuda-compiled version of RoPE2D, using a slow pytorch version instead')
111
+
112
+ class RoPE2D(torch.nn.Module):
113
+
114
+ def __init__(self, freq=100.0, F0=1.0):
115
+ super().__init__()
116
+ self.base = freq
117
+ self.F0 = F0
118
+ self.cache = {}
119
+
120
+ def get_cos_sin(self, D, seq_len, device, dtype):
121
+ if (D,seq_len,device,dtype) not in self.cache:
122
+ inv_freq = 1.0 / (self.base ** (torch.arange(0, D, 2).float().to(device) / D))
123
+ t = torch.arange(seq_len, device=device, dtype=inv_freq.dtype)
124
+ freqs = torch.einsum("i,j->ij", t, inv_freq).to(dtype)
125
+ freqs = torch.cat((freqs, freqs), dim=-1)
126
+ cos = freqs.cos() # (Seq, Dim)
127
+ sin = freqs.sin()
128
+ self.cache[D,seq_len,device,dtype] = (cos,sin)
129
+ return self.cache[D,seq_len,device,dtype]
130
+
131
+ @staticmethod
132
+ def rotate_half(x):
133
+ x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
134
+ return torch.cat((-x2, x1), dim=-1)
135
+
136
+ def apply_rope1d(self, tokens, pos1d, cos, sin):
137
+ assert pos1d.ndim==2
138
+ cos = torch.nn.functional.embedding(pos1d, cos)[:, None, :, :]
139
+ sin = torch.nn.functional.embedding(pos1d, sin)[:, None, :, :]
140
+ return (tokens * cos) + (self.rotate_half(tokens) * sin)
141
+
142
+ def forward(self, tokens, positions):
143
+ """
144
+ input:
145
+ * tokens: batch_size x nheads x ntokens x dim
146
+ * positions: batch_size x ntokens x 2 (y and x position of each token)
147
+ output:
148
+ * tokens after applying RoPE2D (batch_size x nheads x ntokens x dim)
149
+ """
150
+ assert tokens.size(3)%2==0, "number of dimensions should be a multiple of two"
151
+ D = tokens.size(3) // 2
152
+ assert positions.ndim==3 and positions.shape[-1] == 2 # Batch, Seq, 2
153
+ cos, sin = self.get_cos_sin(D, int(positions.max())+1, tokens.device, tokens.dtype)
154
+ # split features into two along the feature dimension, and apply rope1d on each half
155
+ y, x = tokens.chunk(2, dim=-1)
156
+ y = self.apply_rope1d(y, positions[:,:,0], cos, sin)
157
+ x = self.apply_rope1d(x, positions[:,:,1], cos, sin)
158
+ tokens = torch.cat((y, x), dim=-1)
159
+ return tokens
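Editorial note: a small usage sketch (not part of this commit) for the sine-cosine embedding defined above, assuming the mini_dust3r package is importable in the current environment.

import numpy as np
from mini_dust3r.croco.pos_embed import get_2d_sincos_pos_embed

pos_embed = get_2d_sincos_pos_embed(embed_dim=768, grid_size=14)
print(pos_embed.shape)                   # (196, 768): one embedding per patch of a 14x14 grid
print(np.abs(pos_embed).max() <= 1.0)    # True: every entry is a sine or cosine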
mini_dust3r/heads/__init__.py ADDED
@@ -0,0 +1,19 @@
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+ #
4
+ # --------------------------------------------------------
5
+ # head factory
6
+ # --------------------------------------------------------
7
+ from .linear_head import LinearPts3d
8
+ from .dpt_head import create_dpt_head
9
+
10
+
11
+ def head_factory(head_type, output_mode, net, has_conf=False):
12
+ """" build a prediction head for the decoder
13
+ """
14
+ if head_type == 'linear' and output_mode == 'pts3d':
15
+ return LinearPts3d(net, has_conf)
16
+ elif head_type == 'dpt' and output_mode == 'pts3d':
17
+ return create_dpt_head(net, has_conf=has_conf)
18
+ else:
19
+ raise NotImplementedError(f"unexpected {head_type=} and {output_mode=}")
mini_dust3r/heads/dpt_head.py ADDED
@@ -0,0 +1,114 @@
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+ #
4
+ # --------------------------------------------------------
5
+ # dpt head implementation for DUST3R
6
+ # Downstream heads assume inputs of size B x N x C (where N is the number of tokens) ;
7
+ # or if it takes as input the output at every layer, the attribute return_all_layers should be set to True
8
+ # the forward function also takes as input a dictionary img_info with keys "height" and "width"
9
+ # for PixelwiseTask, the output will be of dimension B x num_channels x H x W
10
+ # --------------------------------------------------------
11
+ from einops import rearrange
12
+ from typing import List
13
+ import torch
14
+ import torch.nn as nn
15
+ from mini_dust3r.heads.postprocess import postprocess
16
+ from mini_dust3r.croco.dpt_block import DPTOutputAdapter
17
+
18
+
19
+ class DPTOutputAdapter_fix(DPTOutputAdapter):
20
+ """
21
+ Adapt croco's DPTOutputAdapter implementation for dust3r:
22
+ remove duplicated weights, and fix forward for dust3r
23
+ """
24
+
25
+ def init(self, dim_tokens_enc=768):
26
+ super().init(dim_tokens_enc)
27
+ # these are duplicated weights
28
+ del self.act_1_postprocess
29
+ del self.act_2_postprocess
30
+ del self.act_3_postprocess
31
+ del self.act_4_postprocess
32
+
33
+ def forward(self, encoder_tokens: List[torch.Tensor], image_size=None):
34
+ assert self.dim_tokens_enc is not None, 'Need to call init(dim_tokens_enc) function first'
35
+ # H, W = input_info['image_size']
36
+ image_size = self.image_size if image_size is None else image_size
37
+ H, W = image_size
38
+ # Number of patches in height and width
39
+ N_H = H // (self.stride_level * self.P_H)
40
+ N_W = W // (self.stride_level * self.P_W)
41
+
42
+ # Hook decoder onto 4 layers from specified ViT layers
43
+ layers = [encoder_tokens[hook] for hook in self.hooks]
44
+
45
+ # Extract only task-relevant tokens and ignore global tokens.
46
+ layers = [self.adapt_tokens(l) for l in layers]
47
+
48
+ # Reshape tokens to spatial representation
49
+ layers = [rearrange(l, 'b (nh nw) c -> b c nh nw', nh=N_H, nw=N_W) for l in layers]
50
+
51
+ layers = [self.act_postprocess[idx](l) for idx, l in enumerate(layers)]
52
+ # Project layers to chosen feature dim
53
+ layers = [self.scratch.layer_rn[idx](l) for idx, l in enumerate(layers)]
54
+
55
+ # Fuse layers using refinement stages
56
+ path_4 = self.scratch.refinenet4(layers[3])[:, :, :layers[2].shape[2], :layers[2].shape[3]]
57
+ path_3 = self.scratch.refinenet3(path_4, layers[2])
58
+ path_2 = self.scratch.refinenet2(path_3, layers[1])
59
+ path_1 = self.scratch.refinenet1(path_2, layers[0])
60
+
61
+ # Output head
62
+ out = self.head(path_1)
63
+
64
+ return out
65
+
66
+
67
+ class PixelwiseTaskWithDPT(nn.Module):
68
+ """ DPT module for dust3r, can return 3D points + confidence for all pixels"""
69
+
70
+ def __init__(self, *, n_cls_token=0, hooks_idx=None, dim_tokens=None,
71
+ output_width_ratio=1, num_channels=1, postprocess=None, depth_mode=None, conf_mode=None, **kwargs):
72
+ super(PixelwiseTaskWithDPT, self).__init__()
73
+ self.return_all_layers = True # backbone needs to return all layers
74
+ self.postprocess = postprocess
75
+ self.depth_mode = depth_mode
76
+ self.conf_mode = conf_mode
77
+
78
+ assert n_cls_token == 0, "Not implemented"
79
+ dpt_args = dict(output_width_ratio=output_width_ratio,
80
+ num_channels=num_channels,
81
+ **kwargs)
82
+ if hooks_idx is not None:
83
+ dpt_args.update(hooks=hooks_idx)
84
+ self.dpt = DPTOutputAdapter_fix(**dpt_args)
85
+ dpt_init_args = {} if dim_tokens is None else {'dim_tokens_enc': dim_tokens}
86
+ self.dpt.init(**dpt_init_args)
87
+
88
+ def forward(self, x, img_info):
89
+ out = self.dpt(x, image_size=(img_info[0], img_info[1]))
90
+ if self.postprocess:
91
+ out = self.postprocess(out, self.depth_mode, self.conf_mode)
92
+ return out
93
+
94
+
95
+ def create_dpt_head(net, has_conf=False):
96
+ """
97
+ return PixelwiseTaskWithDPT for given net params
98
+ """
99
+ assert net.dec_depth > 9
100
+ l2 = net.dec_depth
101
+ feature_dim = 256
102
+ last_dim = feature_dim//2
103
+ out_nchan = 3
104
+ ed = net.enc_embed_dim
105
+ dd = net.dec_embed_dim
106
+ return PixelwiseTaskWithDPT(num_channels=out_nchan + has_conf,
107
+ feature_dim=feature_dim,
108
+ last_dim=last_dim,
109
+ hooks_idx=[0, l2*2//4, l2*3//4, l2],
110
+ dim_tokens=[ed, dd, dd, dd],
111
+ postprocess=postprocess,
112
+ depth_mode=net.depth_mode,
113
+ conf_mode=net.conf_mode,
114
+ head_type='regression')
mini_dust3r/heads/linear_head.py ADDED
@@ -0,0 +1,41 @@
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+ #
4
+ # --------------------------------------------------------
5
+ # linear head implementation for DUST3R
6
+ # --------------------------------------------------------
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+ from mini_dust3r.heads.postprocess import postprocess
10
+
11
+
12
+ class LinearPts3d (nn.Module):
13
+ """
14
+ Linear head for dust3r
15
+ Each token outputs 16x16 3D points (+ confidence)
16
+ """
17
+
18
+ def __init__(self, net, has_conf=False):
19
+ super().__init__()
20
+ self.patch_size = net.patch_embed.patch_size[0]
21
+ self.depth_mode = net.depth_mode
22
+ self.conf_mode = net.conf_mode
23
+ self.has_conf = has_conf
24
+
25
+ self.proj = nn.Linear(net.dec_embed_dim, (3 + has_conf)*self.patch_size**2)
26
+
27
+ def setup(self, croconet):
28
+ pass
29
+
30
+ def forward(self, decout, img_shape):
31
+ H, W = img_shape
32
+ tokens = decout[-1]
33
+ B, S, D = tokens.shape
34
+
35
+ # extract 3D points
36
+ feat = self.proj(tokens) # B,S,D
37
+ feat = feat.transpose(-1, -2).view(B, -1, H//self.patch_size, W//self.patch_size)
38
+ feat = F.pixel_shuffle(feat, self.patch_size) # B,3,H,W
39
+
40
+ # permute + norm depth
41
+ return postprocess(feat, self.depth_mode, self.conf_mode)
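Editorial note: a standalone sketch (not part of this commit) of the pixel_shuffle step used by LinearPts3d above, with illustrative sizes; it shows how per-token features become a dense per-pixel map.

import torch
import torch.nn.functional as F

B, H, W, patch_size = 1, 224, 224, 16
channels = 3 + 1                                       # xyz + confidence
tokens = torch.randn(B, (H // patch_size) * (W // patch_size),
                     channels * patch_size**2)         # (B, S, D) output of self.proj
feat = tokens.transpose(-1, -2).view(B, -1, H // patch_size, W // patch_size)
feat = F.pixel_shuffle(feat, patch_size)
print(feat.shape)                                      # torch.Size([1, 4, 224, 224])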
mini_dust3r/heads/postprocess.py ADDED
@@ -0,0 +1,58 @@
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+ #
4
+ # --------------------------------------------------------
5
+ # post process function for all heads: extract 3D points/confidence from output
6
+ # --------------------------------------------------------
7
+ import torch
8
+
9
+
10
+ def postprocess(out, depth_mode, conf_mode):
11
+ """
12
+ extract 3D points/confidence from prediction head output
13
+ """
14
+ fmap = out.permute(0, 2, 3, 1) # B,H,W,3
15
+ res = dict(pts3d=reg_dense_depth(fmap[:, :, :, 0:3], mode=depth_mode))
16
+
17
+ if conf_mode is not None:
18
+ res['conf'] = reg_dense_conf(fmap[:, :, :, 3], mode=conf_mode)
19
+ return res
20
+
21
+
22
+ def reg_dense_depth(xyz, mode):
23
+ """
24
+ extract 3D points from prediction head output
25
+ """
26
+ mode, vmin, vmax = mode
27
+
28
+ no_bounds = (vmin == -float('inf')) and (vmax == float('inf'))
29
+ assert no_bounds
30
+
31
+ if mode == 'linear':
32
+ if no_bounds:
33
+ return xyz # [-inf, +inf]
34
+ return xyz.clip(min=vmin, max=vmax)
35
+
36
+ # distance to origin
37
+ d = xyz.norm(dim=-1, keepdim=True)
38
+ xyz = xyz / d.clip(min=1e-8)
39
+
40
+ if mode == 'square':
41
+ return xyz * d.square()
42
+
43
+ if mode == 'exp':
44
+ return xyz * torch.expm1(d)
45
+
46
+ raise ValueError(f'bad {mode=}')
47
+
48
+
49
+ def reg_dense_conf(x, mode):
50
+ """
51
+ extract confidence from prediction head output
52
+ """
53
+ mode, vmin, vmax = mode
54
+ if mode == 'exp':
55
+ return vmin + x.exp().clip(max=vmax-vmin)
56
+ if mode == 'sigmoid':
57
+ return (vmax - vmin) * torch.sigmoid(x) + vmin
58
+ raise ValueError(f'bad {mode=}')
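Editorial note: a standalone sketch (not part of this commit) of the 'exp' depth and confidence modes above, using the defaults that model.py passes in (depth_mode=('exp', -inf, inf), conf_mode=('exp', 1, inf)).

import torch

xyz = torch.randn(1, 4, 4, 3)                       # raw head output for a tiny 4x4 map
d = xyz.norm(dim=-1, keepdim=True)
pts3d = xyz / d.clip(min=1e-8) * torch.expm1(d)     # 'exp' depth: unit direction * (e^d - 1)

conf_raw = torch.randn(1, 4, 4)
conf = 1 + conf_raw.exp()                           # 'exp' confidence with vmin=1, vmax=inf
print(pts3d.shape, bool((conf > 1).all()))          # torch.Size([1, 4, 4, 3]) True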
mini_dust3r/image_pairs.py ADDED
@@ -0,0 +1,85 @@
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+ #
4
+ # --------------------------------------------------------
5
+ # utilities needed to load image pairs
6
+ # --------------------------------------------------------
7
+ import numpy as np
8
+ import torch
9
+ from mini_dust3r.utils.image import ImageDict
10
+
11
+
12
+ def make_pairs(
13
+ imgs: list[ImageDict],
14
+ scene_graph: str = "complete",
15
+ prefilter=None,
16
+ symmetrize=True,
17
+ ) -> list[tuple[ImageDict, ImageDict]]:
18
+ pairs = []
19
+ if scene_graph == "complete": # complete graph
20
+ for i in range(len(imgs)):
21
+ for j in range(i):
22
+ pairs.append((imgs[i], imgs[j]))
23
+ elif scene_graph.startswith("swin"):
24
+ winsize = int(scene_graph.split("-")[1]) if "-" in scene_graph else 3
25
+ pairsid = set()
26
+ for i in range(len(imgs)):
27
+ for j in range(1, winsize + 1):
28
+ idx = (i + j) % len(imgs) # explicit loop closure
29
+ pairsid.add((i, idx) if i < idx else (idx, i))
30
+ for i, j in pairsid:
31
+ pairs.append((imgs[i], imgs[j]))
32
+ elif scene_graph.startswith("oneref"):
33
+ refid = int(scene_graph.split("-")[1]) if "-" in scene_graph else 0
34
+ for j in range(len(imgs)):
35
+ if j != refid:
36
+ pairs.append((imgs[refid], imgs[j]))
37
+ if symmetrize:
38
+ pairs += [(img2, img1) for img1, img2 in pairs]
39
+
40
+ # now, remove edges
41
+ if isinstance(prefilter, str) and prefilter.startswith("seq"):
42
+ pairs = filter_pairs_seq(pairs, int(prefilter[3:]))
43
+
44
+ if isinstance(prefilter, str) and prefilter.startswith("cyc"):
45
+ pairs = filter_pairs_seq(pairs, int(prefilter[3:]), cyclic=True)
46
+
47
+ return pairs
48
+
49
+
50
+ def sel(x, kept):
51
+ if isinstance(x, dict):
52
+ return {k: sel(v, kept) for k, v in x.items()}
53
+ if isinstance(x, (torch.Tensor, np.ndarray)):
54
+ return x[kept]
55
+ if isinstance(x, (tuple, list)):
56
+ return type(x)([x[k] for k in kept])
57
+
58
+
59
+ def _filter_edges_seq(edges, seq_dis_thr, cyclic=False):
60
+ # number of images
61
+ n = max(max(e) for e in edges) + 1
62
+
63
+ kept = []
64
+ for e, (i, j) in enumerate(edges):
65
+ dis = abs(i - j)
66
+ if cyclic:
67
+ dis = min(dis, abs(i + n - j), abs(i - n - j))
68
+ if dis <= seq_dis_thr:
69
+ kept.append(e)
70
+ return kept
71
+
72
+
73
+ def filter_pairs_seq(pairs, seq_dis_thr, cyclic=False):
74
+ edges = [(img1["idx"], img2["idx"]) for img1, img2 in pairs]
75
+ kept = _filter_edges_seq(edges, seq_dis_thr, cyclic=cyclic)
76
+ return [pairs[i] for i in kept]
77
+
78
+
79
+ def filter_edges_seq(view1, view2, pred1, pred2, seq_dis_thr, cyclic=False):
80
+ edges = [(int(i), int(j)) for i, j in zip(view1["idx"], view2["idx"])]
81
+ kept = _filter_edges_seq(edges, seq_dis_thr, cyclic=cyclic)
82
+ print(
83
+ f">> Filtering edges more than {seq_dis_thr} frames apart: kept {len(kept)}/{len(edges)} edges"
84
+ )
85
+ return sel(view1, kept), sel(view2, kept), sel(pred1, kept), sel(pred2, kept)
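Editorial note: a standalone sketch (not part of this commit) of the pair index patterns make_pairs produces for the 'complete' and 'swin-2' scene graphs, reduced to bare image indices.

n = 6

complete = [(i, j) for i in range(n) for j in range(i)]       # every unordered pair

winsize = 2                     # 'swin-2': pair each image with the next 2, wrapping around
swin = set()
for i in range(n):
    for j in range(1, winsize + 1):
        idx = (i + j) % n       # explicit loop closure, as in make_pairs
        swin.add((i, idx) if i < idx else (idx, i))

print(len(complete), len(swin))  # 15 12: the sliding window drops the long-range edges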
mini_dust3r/inference.py ADDED
@@ -0,0 +1,204 @@
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+ #
4
+ # --------------------------------------------------------
5
+ # utilities needed for the inference
6
+ # --------------------------------------------------------
7
+ import tqdm
8
+ import torch
9
+ from mini_dust3r.utils.device import to_cpu, collate_with_cat
10
+ from mini_dust3r.utils.misc import invalid_to_nans
11
+ from mini_dust3r.utils.geometry import depthmap_to_pts3d, geotrf
12
+ from mini_dust3r.utils.image import ImageDict
13
+ from mini_dust3r.model import AsymmetricCroCo3DStereo
14
+
15
+ from typing import Literal, TypedDict, Optional
16
+ from jaxtyping import Float32
17
+
18
+
19
+ class Dust3rPred1(TypedDict):
20
+ pts3d: Float32[torch.Tensor, "b h w c"]
21
+ conf: Float32[torch.Tensor, "b h w"]
22
+
23
+
24
+ class Dust3rPred2(TypedDict):
25
+ pts3d_in_other_view: Float32[torch.Tensor, "b h w c"]
26
+ conf: Float32[torch.Tensor, "b h w"]
27
+
28
+
29
+ class Dust3rResult(TypedDict):
30
+ view1: ImageDict
31
+ view2: ImageDict
32
+ pred1: Dust3rPred1
33
+ pred2: Dust3rPred2
34
+ loss: Optional[int]
35
+
36
+
37
+ def _interleave_imgs(img1, img2):
38
+ res = {}
39
+ for key, value1 in img1.items():
40
+ value2 = img2[key]
41
+ if isinstance(value1, torch.Tensor):
42
+ value = torch.stack((value1, value2), dim=1).flatten(0, 1)
43
+ else:
44
+ value = [x for pair in zip(value1, value2) for x in pair]
45
+ res[key] = value
46
+ return res
47
+
48
+
49
+ def make_batch_symmetric(batch):
50
+ view1, view2 = batch
51
+ view1, view2 = (_interleave_imgs(view1, view2), _interleave_imgs(view2, view1))
52
+ return view1, view2
53
+
54
+
55
+ def loss_of_one_batch(
56
+ batch, model, criterion, device, symmetrize_batch=False, use_amp=False, ret=None
57
+ ):
58
+ view1, view2 = batch
59
+ for view in batch:
60
+ for name in (
61
+ "img pts3d valid_mask camera_pose camera_intrinsics F_matrix corres".split()
62
+ ): # pseudo_focal
63
+ if name not in view:
64
+ continue
65
+ view[name] = view[name].to(device, non_blocking=True)
66
+
67
+ if symmetrize_batch:
68
+ view1, view2 = make_batch_symmetric(batch)
69
+
70
+ with torch.cuda.amp.autocast(enabled=bool(use_amp)):
71
+ pred1, pred2 = model(view1, view2)
72
+
73
+ # loss is supposed to be symmetric
74
+ with torch.cuda.amp.autocast(enabled=False):
75
+ loss = (
76
+ criterion(view1, view2, pred1, pred2) if criterion is not None else None
77
+ )
78
+
79
+ result = dict(view1=view1, view2=view2, pred1=pred1, pred2=pred2, loss=loss)
80
+ return result[ret] if ret else result
81
+
82
+
83
+ @torch.no_grad()
84
+ def inference(
85
+ pairs: list[tuple[ImageDict, ImageDict]],
86
+ model: AsymmetricCroCo3DStereo,
87
+ device: Literal["cpu", "cuda", "mps"],
88
+ batch_size: int = 8,
89
+ verbose: bool = True,
90
+ ) -> Dust3rResult:
91
+ if verbose:
92
+ print(f">> Inference with model on {len(pairs)} image pairs")
93
+ result = []
94
+
95
+ # first, check if all images have the same size
96
+ multiple_shapes = not (check_if_same_size(pairs))
97
+ if multiple_shapes: # force bs=1
98
+ batch_size = 1
99
+
100
+ for i in tqdm.trange(0, len(pairs), batch_size, disable=not verbose):
101
+ res: Dust3rResult = loss_of_one_batch(
102
+ collate_with_cat(pairs[i : i + batch_size]), model, None, device
103
+ )
104
+ result.append(to_cpu(res))
105
+
106
+ result = collate_with_cat(result, lists=multiple_shapes)
107
+
108
+ return result
109
+
110
+
111
+ def check_if_same_size(pairs):
112
+ shapes1 = [img1["img"].shape[-2:] for img1, img2 in pairs]
113
+ shapes2 = [img2["img"].shape[-2:] for img1, img2 in pairs]
114
+ return all(shapes1[0] == s for s in shapes1) and all(
115
+ shapes2[0] == s for s in shapes2
116
+ )
117
+
118
+
119
+ def get_pred_pts3d(gt, pred, use_pose=False):
120
+ if "depth" in pred and "pseudo_focal" in pred:
121
+ try:
122
+ pp = gt["camera_intrinsics"][..., :2, 2]
123
+ except KeyError:
124
+ pp = None
125
+ pts3d = depthmap_to_pts3d(**pred, pp=pp)
126
+
127
+ elif "pts3d" in pred:
128
+ # pts3d from my camera
129
+ pts3d = pred["pts3d"]
130
+
131
+ elif "pts3d_in_other_view" in pred:
132
+ # pts3d from the other camera, already transformed
133
+ assert use_pose is True
134
+ return pred["pts3d_in_other_view"] # return!
135
+
136
+ if use_pose:
137
+ camera_pose = pred.get("camera_pose")
138
+ assert camera_pose is not None
139
+ pts3d = geotrf(camera_pose, pts3d)
140
+
141
+ return pts3d
142
+
143
+
144
+ def find_opt_scaling(
145
+ gt_pts1,
146
+ gt_pts2,
147
+ pr_pts1,
148
+ pr_pts2=None,
149
+ fit_mode="weiszfeld_stop_grad",
150
+ valid1=None,
151
+ valid2=None,
152
+ ):
153
+ assert gt_pts1.ndim == pr_pts1.ndim == 4
154
+ assert gt_pts1.shape == pr_pts1.shape
155
+ if gt_pts2 is not None:
156
+ assert gt_pts2.ndim == pr_pts2.ndim == 4
157
+ assert gt_pts2.shape == pr_pts2.shape
158
+
159
+ # concat the pointcloud
160
+ nan_gt_pts1 = invalid_to_nans(gt_pts1, valid1).flatten(1, 2)
161
+ nan_gt_pts2 = (
162
+ invalid_to_nans(gt_pts2, valid2).flatten(1, 2) if gt_pts2 is not None else None
163
+ )
164
+
165
+ pr_pts1 = invalid_to_nans(pr_pts1, valid1).flatten(1, 2)
166
+ pr_pts2 = (
167
+ invalid_to_nans(pr_pts2, valid2).flatten(1, 2) if pr_pts2 is not None else None
168
+ )
169
+
170
+ all_gt = (
171
+ torch.cat((nan_gt_pts1, nan_gt_pts2), dim=1)
172
+ if gt_pts2 is not None
173
+ else nan_gt_pts1
174
+ )
175
+ all_pr = torch.cat((pr_pts1, pr_pts2), dim=1) if pr_pts2 is not None else pr_pts1
176
+
177
+ dot_gt_pr = (all_pr * all_gt).sum(dim=-1)
178
+ dot_gt_gt = all_gt.square().sum(dim=-1)
179
+
180
+ if fit_mode.startswith("avg"):
181
+ # scaling = (all_pr / all_gt).view(B, -1).mean(dim=1)
182
+ scaling = dot_gt_pr.nanmean(dim=1) / dot_gt_gt.nanmean(dim=1)
183
+ elif fit_mode.startswith("median"):
184
+ scaling = (dot_gt_pr / dot_gt_gt).nanmedian(dim=1).values
185
+ elif fit_mode.startswith("weiszfeld"):
186
+ # init scaling with l2 closed form
187
+ scaling = dot_gt_pr.nanmean(dim=1) / dot_gt_gt.nanmean(dim=1)
188
+ # iterative re-weighted least-squares
189
+ for iter in range(10):
190
+ # re-weighting by inverse of distance
191
+ dis = (all_pr - scaling.view(-1, 1, 1) * all_gt).norm(dim=-1)
192
+ # print(dis.nanmean(-1))
193
+ w = dis.clip_(min=1e-8).reciprocal()
194
+ # update the scaling with the new weights
195
+ scaling = (w * dot_gt_pr).nanmean(dim=1) / (w * dot_gt_gt).nanmean(dim=1)
196
+ else:
197
+ raise ValueError(f"bad {fit_mode=}")
198
+
199
+ if fit_mode.endswith("stop_grad"):
200
+ scaling = scaling.detach()
201
+
202
+ scaling = scaling.clip(min=1e-3)
203
+ # assert scaling.isfinite().all(), bb()
204
+ return scaling
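Editorial note: a standalone sketch (not part of this commit) of the Weiszfeld-style scale fit used by find_opt_scaling, reduced to a single point cloud with no invalid points.

import torch

gt = torch.randn(1, 1000, 3)
pr = 2.5 * gt + 0.01 * torch.randn(1, 1000, 3)             # predictions ~2.5x the ground truth

dot_gt_pr = (pr * gt).sum(dim=-1)
dot_gt_gt = gt.square().sum(dim=-1)
scaling = dot_gt_pr.mean(dim=1) / dot_gt_gt.mean(dim=1)    # closed-form least-squares init
for _ in range(10):                                        # re-weight by inverse residual
    dis = (pr - scaling.view(-1, 1, 1) * gt).norm(dim=-1)
    w = dis.clip(min=1e-8).reciprocal()
    scaling = (w * dot_gt_pr).mean(dim=1) / (w * dot_gt_gt).mean(dim=1)
print(scaling)                                             # close to tensor([2.5])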
mini_dust3r/model.py ADDED
@@ -0,0 +1,259 @@
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+ #
4
+ # --------------------------------------------------------
5
+ # DUSt3R model class
6
+ # --------------------------------------------------------
7
+ from copy import deepcopy
8
+ import torch
9
+ import os
10
+ from packaging import version
11
+ import huggingface_hub
12
+
13
+ from .utils.misc import (
14
+ fill_default_args,
15
+ freeze_all_params,
16
+ is_symmetrized,
17
+ interleave,
18
+ transpose_to_landscape,
19
+ )
20
+ from .heads import head_factory
21
+ from mini_dust3r.patch_embed import get_patch_embed
22
+
23
+ from mini_dust3r.croco.croco import CroCoNet
24
+
25
+ inf = float("inf")
26
+
27
+ hf_version_number = huggingface_hub.__version__
28
+ assert version.parse(hf_version_number) >= version.parse(
29
+ "0.22.0"
30
+ ), "Outdated huggingface_hub version, please reinstall requirements.txt"
31
+
32
+
33
+ def load_model(model_path, device, verbose=True):
34
+ if verbose:
35
+ print("... loading model from", model_path)
36
+ ckpt = torch.load(model_path, map_location="cpu")
37
+ args = ckpt["args"].model.replace("ManyAR_PatchEmbed", "PatchEmbedDust3R")
38
+ if "landscape_only" not in args:
39
+ args = args[:-1] + ", landscape_only=False)"
40
+ else:
41
+ args = args.replace(" ", "").replace(
42
+ "landscape_only=True", "landscape_only=False"
43
+ )
44
+ assert "landscape_only=False" in args
45
+ if verbose:
46
+ print(f"instantiating : {args}")
47
+ net = eval(args)
48
+ s = net.load_state_dict(ckpt["model"], strict=False)
49
+ if verbose:
50
+ print(s)
51
+ return net.to(device)
52
+
53
+
54
+ class AsymmetricCroCo3DStereo(
55
+ CroCoNet,
56
+ huggingface_hub.PyTorchModelHubMixin,
57
+ library_name="dust3r",
58
+ repo_url="https://github.com/naver/dust3r",
59
+ tags=["image-to-3d"],
60
+ ):
61
+ """Two siamese encoders, followed by two decoders.
62
+ The goal is to output 3D points directly for both images, expressed in view1's frame
63
+ (hence the asymmetry).
64
+ """
65
+
66
+ def __init__(
67
+ self,
68
+ output_mode="pts3d",
69
+ head_type="linear",
70
+ depth_mode=("exp", -inf, inf),
71
+ conf_mode=("exp", 1, inf),
72
+ freeze="none",
73
+ landscape_only=True,
74
+ patch_embed_cls="PatchEmbedDust3R", # PatchEmbedDust3R or ManyAR_PatchEmbed
75
+ **croco_kwargs,
76
+ ):
77
+ self.patch_embed_cls = patch_embed_cls
78
+ self.croco_args = fill_default_args(croco_kwargs, super().__init__)
79
+ super().__init__(**croco_kwargs)
80
+
81
+ # dust3r specific initialization
82
+ self.dec_blocks2 = deepcopy(self.dec_blocks)
83
+ self.set_downstream_head(
84
+ output_mode,
85
+ head_type,
86
+ landscape_only,
87
+ depth_mode,
88
+ conf_mode,
89
+ **croco_kwargs,
90
+ )
91
+ self.set_freeze(freeze)
92
+
93
+ @classmethod
94
+ def from_pretrained(cls, pretrained_model_name_or_path, **kw):
95
+ if os.path.isfile(pretrained_model_name_or_path):
96
+ return load_model(pretrained_model_name_or_path, device="cpu")
97
+ else:
98
+ return super(AsymmetricCroCo3DStereo, cls).from_pretrained(
99
+ pretrained_model_name_or_path, **kw
100
+ )
101
+
102
+ def _set_patch_embed(self, img_size=224, patch_size=16, enc_embed_dim=768):
103
+ self.patch_embed = get_patch_embed(
104
+ self.patch_embed_cls, img_size, patch_size, enc_embed_dim
105
+ )
106
+
107
+ def load_state_dict(self, ckpt, **kw):
108
+ # duplicate all weights for the second decoder if not present
109
+ new_ckpt = dict(ckpt)
110
+ if not any(k.startswith("dec_blocks2") for k in ckpt):
111
+ for key, value in ckpt.items():
112
+ if key.startswith("dec_blocks"):
113
+ new_ckpt[key.replace("dec_blocks", "dec_blocks2")] = value
114
+ return super().load_state_dict(new_ckpt, **kw)
115
+
116
+ def set_freeze(self, freeze): # this is for use by downstream models
117
+ self.freeze = freeze
118
+ to_be_frozen = {
119
+ "none": [],
120
+ "mask": [self.mask_token],
121
+ "encoder": [self.mask_token, self.patch_embed, self.enc_blocks],
122
+ }
123
+ freeze_all_params(to_be_frozen[freeze])
124
+
125
+ def _set_prediction_head(self, *args, **kwargs):
126
+ """No prediction head"""
127
+ return
128
+
129
+ def set_downstream_head(
130
+ self,
131
+ output_mode,
132
+ head_type,
133
+ landscape_only,
134
+ depth_mode,
135
+ conf_mode,
136
+ patch_size,
137
+ img_size,
138
+ **kw,
139
+ ):
140
+ assert (
141
+ img_size[0] % patch_size == 0 and img_size[1] % patch_size == 0
142
+ ), f"{img_size=} must be multiple of {patch_size=}"
143
+ self.output_mode = output_mode
144
+ self.head_type = head_type
145
+ self.depth_mode = depth_mode
146
+ self.conf_mode = conf_mode
147
+ # allocate heads
148
+ self.downstream_head1 = head_factory(
149
+ head_type, output_mode, self, has_conf=bool(conf_mode)
150
+ )
151
+ self.downstream_head2 = head_factory(
152
+ head_type, output_mode, self, has_conf=bool(conf_mode)
153
+ )
154
+ # magic wrapper
155
+ self.head1 = transpose_to_landscape(
156
+ self.downstream_head1, activate=landscape_only
157
+ )
158
+ self.head2 = transpose_to_landscape(
159
+ self.downstream_head2, activate=landscape_only
160
+ )
161
+
162
+ def _encode_image(self, image, true_shape):
163
+ # embed the image into patches (x has size B x Npatches x C)
164
+ x, pos = self.patch_embed(image, true_shape=true_shape)
165
+
166
+ # add positional embedding without cls token
167
+ assert self.enc_pos_embed is None
168
+
169
+ # now apply the transformer encoder and normalization
170
+ for blk in self.enc_blocks:
171
+ x = blk(x, pos)
172
+
173
+ x = self.enc_norm(x)
174
+ return x, pos, None
175
+
176
+ def _encode_image_pairs(self, img1, img2, true_shape1, true_shape2):
177
+ if img1.shape[-2:] == img2.shape[-2:]:
178
+ out, pos, _ = self._encode_image(
179
+ torch.cat((img1, img2), dim=0),
180
+ torch.cat((true_shape1, true_shape2), dim=0),
181
+ )
182
+ out, out2 = out.chunk(2, dim=0)
183
+ pos, pos2 = pos.chunk(2, dim=0)
184
+ else:
185
+ out, pos, _ = self._encode_image(img1, true_shape1)
186
+ out2, pos2, _ = self._encode_image(img2, true_shape2)
187
+ return out, out2, pos, pos2
188
+
189
+ def _encode_symmetrized(self, view1, view2):
190
+ img1 = view1["img"]
191
+ img2 = view2["img"]
192
+ B = img1.shape[0]
193
+ # Recover true_shape when available, otherwise assume that the img shape is the true one
194
+ shape1 = view1.get(
195
+ "true_shape", torch.tensor(img1.shape[-2:])[None].repeat(B, 1)
196
+ )
197
+ shape2 = view2.get(
198
+ "true_shape", torch.tensor(img2.shape[-2:])[None].repeat(B, 1)
199
+ )
200
+ # warning! maybe the images have different portrait/landscape orientations
201
+
202
+ if is_symmetrized(view1, view2):
203
+ # computing half of forward pass!
204
+ feat1, feat2, pos1, pos2 = self._encode_image_pairs(
205
+ img1[::2], img2[::2], shape1[::2], shape2[::2]
206
+ )
207
+ feat1, feat2 = interleave(feat1, feat2)
208
+ pos1, pos2 = interleave(pos1, pos2)
209
+ else:
210
+ feat1, feat2, pos1, pos2 = self._encode_image_pairs(
211
+ img1, img2, shape1, shape2
212
+ )
213
+
214
+ return (shape1, shape2), (feat1, feat2), (pos1, pos2)
215
+
216
+ def _decoder(self, f1, pos1, f2, pos2):
217
+ final_output = [(f1, f2)] # before projection
218
+
219
+ # project to decoder dim
220
+ f1 = self.decoder_embed(f1)
221
+ f2 = self.decoder_embed(f2)
222
+
223
+ final_output.append((f1, f2))
224
+ for blk1, blk2 in zip(self.dec_blocks, self.dec_blocks2):
225
+ # img1 side
226
+ f1, _ = blk1(*final_output[-1][::+1], pos1, pos2)
227
+ # img2 side
228
+ f2, _ = blk2(*final_output[-1][::-1], pos2, pos1)
229
+ # store the result
230
+ final_output.append((f1, f2))
231
+
232
+ # normalize last output
233
+ del final_output[1] # duplicate with final_output[0]
234
+ final_output[-1] = tuple(map(self.dec_norm, final_output[-1]))
235
+ return zip(*final_output)
236
+
237
+ def _downstream_head(self, head_num, decout, img_shape):
238
+ B, S, D = decout[-1].shape
239
+ # img_shape = tuple(map(int, img_shape))
240
+ head = getattr(self, f"head{head_num}")
241
+ return head(decout, img_shape)
242
+
243
+ def forward(self, view1, view2):
244
+ # encode the two images --> B,S,D
245
+ (shape1, shape2), (feat1, feat2), (pos1, pos2) = self._encode_symmetrized(
246
+ view1, view2
247
+ )
248
+
249
+ # combine all ref images into object-centric representation
250
+ dec1, dec2 = self._decoder(feat1, pos1, feat2, pos2)
251
+
252
+ with torch.cuda.amp.autocast(enabled=False):
253
+ res1 = self._downstream_head(1, [tok.float() for tok in dec1], shape1)
254
+ res2 = self._downstream_head(2, [tok.float() for tok in dec2], shape2)
255
+
256
+ res2["pts3d_in_other_view"] = res2.pop(
257
+ "pts3d"
258
+ ) # predict view2's pts3d in view1's frame
259
+ return res1, res2
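Editorial note: a standalone sketch (not part of this commit) of the checkpoint-key duplication performed by load_state_dict above, using a toy dict in place of a real state dict.

ckpt = {"dec_blocks.0.attn.weight": 1, "enc_blocks.0.attn.weight": 2}
new_ckpt = dict(ckpt)
if not any(k.startswith("dec_blocks2") for k in ckpt):
    for key, value in ckpt.items():
        if key.startswith("dec_blocks"):
            new_ckpt[key.replace("dec_blocks", "dec_blocks2")] = value
print(sorted(new_ckpt))
# ['dec_blocks.0.attn.weight', 'dec_blocks2.0.attn.weight', 'enc_blocks.0.attn.weight']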
mini_dust3r/optim_factory.py ADDED
@@ -0,0 +1,14 @@
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+ #
4
+ # --------------------------------------------------------
5
+ # optimization functions
6
+ # --------------------------------------------------------
7
+
8
+
9
+ def adjust_learning_rate_by_lr(optimizer, lr):
10
+ for param_group in optimizer.param_groups:
11
+ if "lr_scale" in param_group:
12
+ param_group["lr"] = lr * param_group["lr_scale"]
13
+ else:
14
+ param_group["lr"] = lr
mini_dust3r/patch_embed.py ADDED
@@ -0,0 +1,69 @@
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+ #
4
+ # --------------------------------------------------------
5
+ # PatchEmbed implementation for DUST3R,
6
+ # in particular ManyAR_PatchEmbed, which handles images with a non-square aspect ratio
7
+ # --------------------------------------------------------
8
+ import torch
9
+ from mini_dust3r.croco.blocks import PatchEmbed
10
+
11
+
12
+ def get_patch_embed(patch_embed_cls, img_size, patch_size, enc_embed_dim):
13
+ assert patch_embed_cls in ['PatchEmbedDust3R', 'ManyAR_PatchEmbed']
14
+ patch_embed = eval(patch_embed_cls)(img_size, patch_size, 3, enc_embed_dim)
15
+ return patch_embed
16
+
17
+
18
+ class PatchEmbedDust3R(PatchEmbed):
19
+ def forward(self, x, **kw):
20
+ B, C, H, W = x.shape
21
+ assert H % self.patch_size[0] == 0, f"Input image height ({H}) is not a multiple of patch size ({self.patch_size[0]})."
22
+ assert W % self.patch_size[1] == 0, f"Input image width ({W}) is not a multiple of patch size ({self.patch_size[1]})."
23
+ x = self.proj(x)
24
+ pos = self.position_getter(B, x.size(2), x.size(3), x.device)
25
+ if self.flatten:
26
+ x = x.flatten(2).transpose(1, 2) # BCHW -> BNC
27
+ x = self.norm(x)
28
+ return x, pos
29
+
30
+
31
+ class ManyAR_PatchEmbed (PatchEmbed):
32
+ """ Handle images with non-square aspect ratio.
33
+ All images in the same batch have the same aspect ratio.
34
+ true_shape = [(height, width) ...] indicates the actual shape of each image.
35
+ """
36
+
37
+ def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, norm_layer=None, flatten=True):
38
+ self.embed_dim = embed_dim
39
+ super().__init__(img_size, patch_size, in_chans, embed_dim, norm_layer, flatten)
40
+
41
+ def forward(self, img, true_shape):
42
+ B, C, H, W = img.shape
43
+ assert W >= H, f'img should be in landscape mode, but got {W=} {H=}'
44
+ assert H % self.patch_size[0] == 0, f"Input image height ({H}) is not a multiple of patch size ({self.patch_size[0]})."
45
+ assert W % self.patch_size[1] == 0, f"Input image width ({W}) is not a multiple of patch size ({self.patch_size[1]})."
46
+ assert true_shape.shape == (B, 2), f"true_shape has the wrong shape={true_shape.shape}"
47
+
48
+ # size expressed in tokens
49
+ W //= self.patch_size[0]
50
+ H //= self.patch_size[1]
51
+ n_tokens = H * W
52
+
53
+ height, width = true_shape.T
54
+ is_landscape = (width >= height)
55
+ is_portrait = ~is_landscape
56
+
57
+ # allocate result
58
+ x = img.new_zeros((B, n_tokens, self.embed_dim))
59
+ pos = img.new_zeros((B, n_tokens, 2), dtype=torch.int64)
60
+
61
+ # linear projection, transposed if necessary
62
+ x[is_landscape] = self.proj(img[is_landscape]).permute(0, 2, 3, 1).flatten(1, 2).float()
63
+ x[is_portrait] = self.proj(img[is_portrait].swapaxes(-1, -2)).permute(0, 2, 3, 1).flatten(1, 2).float()
64
+
65
+ pos[is_landscape] = self.position_getter(1, H, W, pos.device)
66
+ pos[is_portrait] = self.position_getter(1, W, H, pos.device)
67
+
68
+ x = self.norm(x)
69
+ return x, pos
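A hedged sketch of the ManyAR_PatchEmbed calling convention, assuming the vendored mini_dust3r package (and its CroCo blocks) is importable: the batch is stored in landscape layout, and true_shape tells the embedder which samples are really portraits.

import torch
from mini_dust3r.patch_embed import get_patch_embed

embed = get_patch_embed("ManyAR_PatchEmbed", img_size=512, patch_size=16, enc_embed_dim=768)

img = torch.randn(2, 3, 384, 512)           # batch stored in landscape layout (W >= H)
true_shape = torch.tensor([[384, 512],       # first sample really is landscape
                           [512, 384]])      # second one is a portrait stored transposed
tokens, pos = embed(img, true_shape)
print(tokens.shape, pos.shape)               # (2, 768, 768): 24*32 tokens of dim 768; pos is (2, 768, 2)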
mini_dust3r/post_process.py ADDED
@@ -0,0 +1,60 @@
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+ #
4
+ # --------------------------------------------------------
5
+ # utilities for interpreting the DUST3R output
6
+ # --------------------------------------------------------
7
+ import numpy as np
8
+ import torch
9
+ from mini_dust3r.utils.geometry import xy_grid
10
+
11
+
12
+ def estimate_focal_knowing_depth(pts3d, pp, focal_mode='median', min_focal=0., max_focal=np.inf):
13
+ """ Reprojection method, for when the absolute depth is known:
14
+ 1) estimate the camera focal using a robust estimator
15
+ 2) reproject points onto true rays, minimizing a certain error
16
+ """
17
+ B, H, W, THREE = pts3d.shape
18
+ assert THREE == 3
19
+
20
+ # centered pixel grid
21
+ pixels = xy_grid(W, H, device=pts3d.device).view(1, -1, 2) - pp.view(-1, 1, 2) # B,HW,2
22
+ pts3d = pts3d.flatten(1, 2) # (B, HW, 3)
23
+
24
+ if focal_mode == 'median':
25
+ with torch.no_grad():
26
+ # direct estimation of focal
27
+ u, v = pixels.unbind(dim=-1)
28
+ x, y, z = pts3d.unbind(dim=-1)
29
+ fx_votes = (u * z) / x
30
+ fy_votes = (v * z) / y
31
+
32
+ # assume square pixels, hence same focal for X and Y
33
+ f_votes = torch.cat((fx_votes.view(B, -1), fy_votes.view(B, -1)), dim=-1)
34
+ focal = torch.nanmedian(f_votes, dim=-1).values
35
+
36
+ elif focal_mode == 'weiszfeld':
37
+ # init focal with l2 closed form
38
+ # we try to find focal = argmin Sum | pixel - focal * (x,y)/z|
39
+ xy_over_z = (pts3d[..., :2] / pts3d[..., 2:3]).nan_to_num(posinf=0, neginf=0) # homogeneous (x,y,1)
40
+
41
+ dot_xy_px = (xy_over_z * pixels).sum(dim=-1)
42
+ dot_xy_xy = xy_over_z.square().sum(dim=-1)
43
+
44
+ focal = dot_xy_px.mean(dim=1) / dot_xy_xy.mean(dim=1)
45
+
46
+ # iterative re-weighted least-squares
47
+ for iter in range(10):
48
+ # re-weighting by inverse of distance
49
+ dis = (pixels - focal.view(-1, 1, 1) * xy_over_z).norm(dim=-1)
50
+ # print(dis.nanmean(-1))
51
+ w = dis.clip(min=1e-8).reciprocal()
52
+ # update the scaling with the new weights
53
+ focal = (w * dot_xy_px).mean(dim=1) / (w * dot_xy_xy).mean(dim=1)
54
+ else:
55
+ raise ValueError(f'bad {focal_mode=}')
56
+
57
+ focal_base = max(H, W) / (2 * np.tan(np.deg2rad(60) / 2)) # size / 1.1547005383792515
58
+ focal = focal.clip(min=min_focal*focal_base, max=max_focal*focal_base)
59
+ # print(focal)
60
+ return focal
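To sanity-check the focal recovery above, here is a hedged sketch that builds a synthetic fronto-parallel point map from a known focal and asks estimate_focal_knowing_depth to recover it; all values are illustrative.

import torch
from mini_dust3r.post_process import estimate_focal_knowing_depth
from mini_dust3r.utils.geometry import xy_grid

H, W, true_focal = 384, 512, 400.0
pp = torch.tensor([[W / 2, H / 2]])                 # principal point at the image centre
grid = xy_grid(W, H, device="cpu").float()          # (H, W, 2) pixel coordinates
z = torch.full((H, W, 1), 2.0)                      # constant 2 m depth
xy = (grid - pp.view(1, 1, 2)) * z / true_focal     # back-project through the known focal
pts3d = torch.cat([xy, z], dim=-1)[None]            # (1, H, W, 3) point map

focal = estimate_focal_knowing_depth(pts3d, pp, focal_mode="weiszfeld")
print(focal)                                        # ~tensor([400.])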
mini_dust3r/utils/device.py ADDED
@@ -0,0 +1,76 @@
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+ #
4
+ # --------------------------------------------------------
5
+ # utility functions for DUSt3R
6
+ # --------------------------------------------------------
7
+ import numpy as np
8
+ import torch
9
+
10
+
11
+ def todevice(batch, device, callback=None, non_blocking=False):
12
+ ''' Transfer some variables to another device (i.e. GPU, CPU:torch, CPU:numpy).
13
+
14
+ batch: list, tuple, dict of tensors or other things
15
+ device: pytorch device or 'numpy'
16
+ callback: function that would be called on every sub-elements.
17
+ '''
18
+ if callback:
19
+ batch = callback(batch)
20
+
21
+ if isinstance(batch, dict):
22
+ return {k: todevice(v, device) for k, v in batch.items()}
23
+
24
+ if isinstance(batch, (tuple, list)):
25
+ return type(batch)(todevice(x, device) for x in batch)
26
+
27
+ x = batch
28
+ if device == 'numpy':
29
+ if isinstance(x, torch.Tensor):
30
+ x = x.detach().cpu().numpy()
31
+ elif x is not None:
32
+ if isinstance(x, np.ndarray):
33
+ x = torch.from_numpy(x)
34
+ if torch.is_tensor(x):
35
+ x = x.to(device, non_blocking=non_blocking)
36
+ return x
37
+
38
+
39
+ to_device = todevice # alias
40
+
41
+
42
+ def to_numpy(x): return todevice(x, 'numpy')
43
+ def to_cpu(x): return todevice(x, 'cpu')
44
+ def to_cuda(x): return todevice(x, 'cuda')
45
+
46
+
47
+ def collate_with_cat(whatever, lists=False):
48
+ if isinstance(whatever, dict):
49
+ return {k: collate_with_cat(vals, lists=lists) for k, vals in whatever.items()}
50
+
51
+ elif isinstance(whatever, (tuple, list)):
52
+ if len(whatever) == 0:
53
+ return whatever
54
+ elem = whatever[0]
55
+ T = type(whatever)
56
+
57
+ if elem is None:
58
+ return None
59
+ if isinstance(elem, (bool, float, int, str)):
60
+ return whatever
61
+ if isinstance(elem, tuple):
62
+ return T(collate_with_cat(x, lists=lists) for x in zip(*whatever))
63
+ if isinstance(elem, dict):
64
+ return {k: collate_with_cat([e[k] for e in whatever], lists=lists) for k in elem}
65
+
66
+ if isinstance(elem, torch.Tensor):
67
+ return listify(whatever) if lists else torch.cat(whatever)
68
+ if isinstance(elem, np.ndarray):
69
+ return listify(whatever) if lists else torch.cat([torch.from_numpy(x) for x in whatever])
70
+
71
+ # otherwise, we just chain lists
72
+ return sum(whatever, T())
73
+
74
+
75
+ def listify(elems):
76
+ return [x for e in elems for x in e]
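A minimal sketch of these helpers: todevice/to_numpy walk nested containers, and collate_with_cat merges a list of dicts by concatenating tensors along dim 0.

import numpy as np
import torch
from mini_dust3r.utils.device import to_numpy, collate_with_cat

view = {"img": torch.rand(1, 3, 8, 8), "idx": 0, "pose": np.eye(4, dtype=np.float32)}

as_numpy = to_numpy(view)                 # tensors become np.ndarray, ints/arrays pass through
print(type(as_numpy["img"]), as_numpy["idx"])

pair = collate_with_cat([view, view])     # dicts merged key-wise, tensors cat'd on dim 0
print(pair["img"].shape)                  # torch.Size([2, 3, 8, 8])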
mini_dust3r/utils/geometry.py ADDED
@@ -0,0 +1,361 @@
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+ #
4
+ # --------------------------------------------------------
5
+ # geometry utility functions
6
+ # --------------------------------------------------------
7
+ import torch
8
+ import numpy as np
9
+ from scipy.spatial import cKDTree as KDTree
10
+
11
+ from mini_dust3r.utils.misc import invalid_to_zeros, invalid_to_nans
12
+ from mini_dust3r.utils.device import to_numpy
13
+
14
+
15
+ def xy_grid(W, H, device=None, origin=(0, 0), unsqueeze=None, cat_dim=-1, homogeneous=False, **arange_kw):
16
+ """ Output a (H,W,2) array of int32
17
+ with output[j,i,0] = i + origin[0]
18
+ output[j,i,1] = j + origin[1]
19
+ """
20
+ if device is None:
21
+ # numpy
22
+ arange, meshgrid, stack, ones = np.arange, np.meshgrid, np.stack, np.ones
23
+ else:
24
+ # torch
25
+ arange = lambda *a, **kw: torch.arange(*a, device=device, **kw)
26
+ meshgrid, stack = torch.meshgrid, torch.stack
27
+ ones = lambda *a: torch.ones(*a, device=device)
28
+
29
+ tw, th = [arange(o, o+s, **arange_kw) for s, o in zip((W, H), origin)]
30
+ grid = meshgrid(tw, th, indexing='xy')
31
+ if homogeneous:
32
+ grid = grid + (ones((H, W)),)
33
+ if unsqueeze is not None:
34
+ grid = (grid[0].unsqueeze(unsqueeze), grid[1].unsqueeze(unsqueeze))
35
+ if cat_dim is not None:
36
+ grid = stack(grid, cat_dim)
37
+ return grid
38
+
39
+
40
+ def geotrf(Trf, pts, ncol=None, norm=False):
41
+ """ Apply a geometric transformation to a list of 3-D points.
42
+
43
+ Trf: 3x3 or 4x4 projection matrix (typically a Homography)
44
+ pts: numpy/torch/tuple of coordinates. Shape must be (...,2) or (...,3)
45
+
46
+ ncol: int. number of columns of the result (2 or 3)
47
+ norm: float. if != 0, the result is projected on the z=norm plane.
48
+
49
+ Returns an array of projected 2d points.
50
+ """
51
+ assert Trf.ndim >= 2
52
+ if isinstance(Trf, np.ndarray):
53
+ pts = np.asarray(pts)
54
+ elif isinstance(Trf, torch.Tensor):
55
+ pts = torch.as_tensor(pts, dtype=Trf.dtype)
56
+
57
+ # adapt shape if necessary
58
+ output_reshape = pts.shape[:-1]
59
+ ncol = ncol or pts.shape[-1]
60
+
61
+ # optimized code
62
+ if (isinstance(Trf, torch.Tensor) and isinstance(pts, torch.Tensor) and
63
+ Trf.ndim == 3 and pts.ndim == 4):
64
+ d = pts.shape[3]
65
+ if Trf.shape[-1] == d:
66
+ pts = torch.einsum("bij, bhwj -> bhwi", Trf, pts)
67
+ elif Trf.shape[-1] == d+1:
68
+ pts = torch.einsum("bij, bhwj -> bhwi", Trf[:, :d, :d], pts) + Trf[:, None, None, :d, d]
69
+ else:
70
+ raise ValueError(f'bad shape, not ending with 3 or 4, for {pts.shape=}')
71
+ else:
72
+ if Trf.ndim >= 3:
73
+ n = Trf.ndim-2
74
+ assert Trf.shape[:n] == pts.shape[:n], 'batch size does not match'
75
+ Trf = Trf.reshape(-1, Trf.shape[-2], Trf.shape[-1])
76
+
77
+ if pts.ndim > Trf.ndim:
78
+ # Trf == (B,d,d) & pts == (B,H,W,d) --> (B, H*W, d)
79
+ pts = pts.reshape(Trf.shape[0], -1, pts.shape[-1])
80
+ elif pts.ndim == 2:
81
+ # Trf == (B,d,d) & pts == (B,d) --> (B, 1, d)
82
+ pts = pts[:, None, :]
83
+
84
+ if pts.shape[-1]+1 == Trf.shape[-1]:
85
+ Trf = Trf.swapaxes(-1, -2) # transpose Trf
86
+ pts = pts @ Trf[..., :-1, :] + Trf[..., -1:, :]
87
+ elif pts.shape[-1] == Trf.shape[-1]:
88
+ Trf = Trf.swapaxes(-1, -2) # transpose Trf
89
+ pts = pts @ Trf
90
+ else:
91
+ pts = Trf @ pts.T
92
+ if pts.ndim >= 2:
93
+ pts = pts.swapaxes(-1, -2)
94
+
95
+ if norm:
96
+ pts = pts / pts[..., -1:] # DONT DO /= BECAUSE OF WEIRD PYTORCH BUG
97
+ if norm != 1:
98
+ pts *= norm
99
+
100
+ res = pts[..., :ncol].reshape(*output_reshape, ncol)
101
+ return res
102
+
103
+
104
+ def inv(mat):
105
+ """ Invert a torch or numpy matrix
106
+ """
107
+ if isinstance(mat, torch.Tensor):
108
+ return torch.linalg.inv(mat)
109
+ if isinstance(mat, np.ndarray):
110
+ return np.linalg.inv(mat)
111
+ raise ValueError(f'bad matrix type = {type(mat)}')
112
+
113
+
114
+ def depthmap_to_pts3d(depth, pseudo_focal, pp=None, **_):
115
+ """
116
+ Args:
117
+ - depthmap (BxHxW array):
118
+ - pseudo_focal: [B,H,W] ; [B,2,H,W] or [B,1,H,W]
119
+ Returns:
120
+ pointmap of absolute coordinates (BxHxWx3 array)
121
+ """
122
+
123
+ if len(depth.shape) == 4:
124
+ B, H, W, n = depth.shape
125
+ else:
126
+ B, H, W = depth.shape
127
+ n = None
128
+
129
+ if len(pseudo_focal.shape) == 3: # [B,H,W]
130
+ pseudo_focalx = pseudo_focaly = pseudo_focal
131
+ elif len(pseudo_focal.shape) == 4: # [B,2,H,W] or [B,1,H,W]
132
+ pseudo_focalx = pseudo_focal[:, 0]
133
+ if pseudo_focal.shape[1] == 2:
134
+ pseudo_focaly = pseudo_focal[:, 1]
135
+ else:
136
+ pseudo_focaly = pseudo_focalx
137
+ else:
138
+ raise NotImplementedError("Error, unknown input focal shape format.")
139
+
140
+ assert pseudo_focalx.shape == depth.shape[:3]
141
+ assert pseudo_focaly.shape == depth.shape[:3]
142
+ grid_x, grid_y = xy_grid(W, H, cat_dim=0, device=depth.device)[:, None]
143
+
144
+ # set principal point
145
+ if pp is None:
146
+ grid_x = grid_x - (W-1)/2
147
+ grid_y = grid_y - (H-1)/2
148
+ else:
149
+ grid_x = grid_x.expand(B, -1, -1) - pp[:, 0, None, None]
150
+ grid_y = grid_y.expand(B, -1, -1) - pp[:, 1, None, None]
151
+
152
+ if n is None:
153
+ pts3d = torch.empty((B, H, W, 3), device=depth.device)
154
+ pts3d[..., 0] = depth * grid_x / pseudo_focalx
155
+ pts3d[..., 1] = depth * grid_y / pseudo_focaly
156
+ pts3d[..., 2] = depth
157
+ else:
158
+ pts3d = torch.empty((B, H, W, 3, n), device=depth.device)
159
+ pts3d[..., 0, :] = depth * (grid_x / pseudo_focalx)[..., None]
160
+ pts3d[..., 1, :] = depth * (grid_y / pseudo_focaly)[..., None]
161
+ pts3d[..., 2, :] = depth
162
+ return pts3d
163
+
164
+
165
+ def depthmap_to_camera_coordinates(depthmap, camera_intrinsics, pseudo_focal=None):
166
+ """
167
+ Args:
168
+ - depthmap (HxW array):
169
+ - camera_intrinsics: a 3x3 matrix
170
+ Returns:
171
+ pointmap of absolute coordinates (HxWx3 array), and a mask specifying valid pixels.
172
+ """
173
+ camera_intrinsics = np.float32(camera_intrinsics)
174
+ H, W = depthmap.shape
175
+
176
+ # Compute 3D ray associated with each pixel
177
+ # Strong assumption: there are no skew terms
178
+ assert camera_intrinsics[0, 1] == 0.0
179
+ assert camera_intrinsics[1, 0] == 0.0
180
+ if pseudo_focal is None:
181
+ fu = camera_intrinsics[0, 0]
182
+ fv = camera_intrinsics[1, 1]
183
+ else:
184
+ assert pseudo_focal.shape == (H, W)
185
+ fu = fv = pseudo_focal
186
+ cu = camera_intrinsics[0, 2]
187
+ cv = camera_intrinsics[1, 2]
188
+
189
+ u, v = np.meshgrid(np.arange(W), np.arange(H))
190
+ z_cam = depthmap
191
+ x_cam = (u - cu) * z_cam / fu
192
+ y_cam = (v - cv) * z_cam / fv
193
+ X_cam = np.stack((x_cam, y_cam, z_cam), axis=-1).astype(np.float32)
194
+
195
+ # Mask for valid coordinates
196
+ valid_mask = (depthmap > 0.0)
197
+ return X_cam, valid_mask
198
+
199
+
200
+ def depthmap_to_absolute_camera_coordinates(depthmap, camera_intrinsics, camera_pose, **kw):
201
+ """
202
+ Args:
203
+ - depthmap (HxW array):
204
+ - camera_intrinsics: a 3x3 matrix
205
+ - camera_pose: a 4x3 or 4x4 cam2world matrix
206
+ Returns:
207
+ pointmap of absolute coordinates (HxWx3 array), and a mask specifying valid pixels."""
208
+ X_cam, valid_mask = depthmap_to_camera_coordinates(depthmap, camera_intrinsics)
209
+
210
+ # R_cam2world = np.float32(camera_params["R_cam2world"])
211
+ # t_cam2world = np.float32(camera_params["t_cam2world"]).squeeze()
212
+ R_cam2world = camera_pose[:3, :3]
213
+ t_cam2world = camera_pose[:3, 3]
214
+
215
+ # Express in absolute coordinates (invalid depth values)
216
+ X_world = np.einsum("ik, vuk -> vui", R_cam2world, X_cam) + t_cam2world[None, None, :]
217
+ return X_world, valid_mask
218
+
219
+
220
+ def colmap_to_opencv_intrinsics(K):
221
+ """
222
+ Modify camera intrinsics to follow a different convention.
223
+ Coordinates of the center of the top-left pixels are by default:
224
+ - (0.5, 0.5) in Colmap
225
+ - (0,0) in OpenCV
226
+ """
227
+ K = K.copy()
228
+ K[0, 2] -= 0.5
229
+ K[1, 2] -= 0.5
230
+ return K
231
+
232
+
233
+ def opencv_to_colmap_intrinsics(K):
234
+ """
235
+ Modify camera intrinsics to follow a different convention.
236
+ Coordinates of the center of the top-left pixels are by default:
237
+ - (0.5, 0.5) in Colmap
238
+ - (0,0) in OpenCV
239
+ """
240
+ K = K.copy()
241
+ K[0, 2] += 0.5
242
+ K[1, 2] += 0.5
243
+ return K
244
+
245
+
246
+ def normalize_pointcloud(pts1, pts2, norm_mode='avg_dis', valid1=None, valid2=None):
247
+ """ renorm pointmaps pts1, pts2 with norm_mode
248
+ """
249
+ assert pts1.ndim >= 3 and pts1.shape[-1] == 3
250
+ assert pts2 is None or (pts2.ndim >= 3 and pts2.shape[-1] == 3)
251
+ norm_mode, dis_mode = norm_mode.split('_')
252
+
253
+ if norm_mode == 'avg':
254
+ # gather all points together (joint normalization)
255
+ nan_pts1, nnz1 = invalid_to_zeros(pts1, valid1, ndim=3)
256
+ nan_pts2, nnz2 = invalid_to_zeros(pts2, valid2, ndim=3) if pts2 is not None else (None, 0)
257
+ all_pts = torch.cat((nan_pts1, nan_pts2), dim=1) if pts2 is not None else nan_pts1
258
+
259
+ # compute distance to origin
260
+ all_dis = all_pts.norm(dim=-1)
261
+ if dis_mode == 'dis':
262
+ pass # do nothing
263
+ elif dis_mode == 'log1p':
264
+ all_dis = torch.log1p(all_dis)
265
+ elif dis_mode == 'warp-log1p':
266
+ # actually warp input points before normalizing them
267
+ log_dis = torch.log1p(all_dis)
268
+ warp_factor = log_dis / all_dis.clip(min=1e-8)
269
+ H1, W1 = pts1.shape[1:-1]
270
+ pts1 = pts1 * warp_factor[:, :W1*H1].view(-1, H1, W1, 1)
271
+ if pts2 is not None:
272
+ H2, W2 = pts2.shape[1:-1]
273
+ pts2 = pts2 * warp_factor[:, W1*H1:].view(-1, H2, W2, 1)
274
+ all_dis = log_dis # this is their true distance afterwards
275
+ else:
276
+ raise ValueError(f'bad {dis_mode=}')
277
+
278
+ norm_factor = all_dis.sum(dim=1) / (nnz1 + nnz2 + 1e-8)
279
+ else:
280
+ # gather all points together (joint normalization)
281
+ nan_pts1 = invalid_to_nans(pts1, valid1, ndim=3)
282
+ nan_pts2 = invalid_to_nans(pts2, valid2, ndim=3) if pts2 is not None else None
283
+ all_pts = torch.cat((nan_pts1, nan_pts2), dim=1) if pts2 is not None else nan_pts1
284
+
285
+ # compute distance to origin
286
+ all_dis = all_pts.norm(dim=-1)
287
+
288
+ if norm_mode == 'avg':
289
+ norm_factor = all_dis.nanmean(dim=1)
290
+ elif norm_mode == 'median':
291
+ norm_factor = all_dis.nanmedian(dim=1).values.detach()
292
+ elif norm_mode == 'sqrt':
293
+ norm_factor = all_dis.sqrt().nanmean(dim=1)**2
294
+ else:
295
+ raise ValueError(f'bad {norm_mode=}')
296
+
297
+ norm_factor = norm_factor.clip(min=1e-8)
298
+ while norm_factor.ndim < pts1.ndim:
299
+ norm_factor.unsqueeze_(-1)
300
+
301
+ res = pts1 / norm_factor
302
+ if pts2 is not None:
303
+ res = (res, pts2 / norm_factor)
304
+ return res
305
+
306
+
307
+ @torch.no_grad()
308
+ def get_joint_pointcloud_depth(z1, z2, valid_mask1, valid_mask2=None, quantile=0.5):
309
+ # set invalid points to NaN
310
+ _z1 = invalid_to_nans(z1, valid_mask1).reshape(len(z1), -1)
311
+ _z2 = invalid_to_nans(z2, valid_mask2).reshape(len(z2), -1) if z2 is not None else None
312
+ _z = torch.cat((_z1, _z2), dim=-1) if z2 is not None else _z1
313
+
314
+ # compute median depth overall (ignoring nans)
315
+ if quantile == 0.5:
316
+ shift_z = torch.nanmedian(_z, dim=-1).values
317
+ else:
318
+ shift_z = torch.nanquantile(_z, quantile, dim=-1)
319
+ return shift_z # (B,)
320
+
321
+
322
+ @torch.no_grad()
323
+ def get_joint_pointcloud_center_scale(pts1, pts2, valid_mask1=None, valid_mask2=None, z_only=False, center=True):
324
+ # set invalid points to NaN
325
+ _pts1 = invalid_to_nans(pts1, valid_mask1).reshape(len(pts1), -1, 3)
326
+ _pts2 = invalid_to_nans(pts2, valid_mask2).reshape(len(pts2), -1, 3) if pts2 is not None else None
327
+ _pts = torch.cat((_pts1, _pts2), dim=1) if pts2 is not None else _pts1
328
+
329
+ # compute median center
330
+ _center = torch.nanmedian(_pts, dim=1, keepdim=True).values # (B,1,3)
331
+ if z_only:
332
+ _center[..., :2] = 0 # do not center X and Y
333
+
334
+ # compute median norm
335
+ _norm = ((_pts - _center) if center else _pts).norm(dim=-1)
336
+ scale = torch.nanmedian(_norm, dim=1).values
337
+ return _center[:, None, :, :], scale[:, None, None, None]
338
+
339
+
340
+ def find_reciprocal_matches(P1, P2):
341
+ """
342
+ returns 3 values:
343
+ 1 - reciprocal_in_P2: a boolean array of size P2.shape[0], a "True" value indicates a match
344
+ 2 - nn2_in_P1: a int array of size P2.shape[0], it contains the indexes of the closest points in P1
345
+ 3 - reciprocal_in_P2.sum(): the number of matches
346
+ """
347
+ tree1 = KDTree(P1)
348
+ tree2 = KDTree(P2)
349
+
350
+ _, nn1_in_P2 = tree2.query(P1, workers=8)
351
+ _, nn2_in_P1 = tree1.query(P2, workers=8)
352
+
353
+ reciprocal_in_P1 = (nn2_in_P1[nn1_in_P2] == np.arange(len(nn1_in_P2)))
354
+ reciprocal_in_P2 = (nn1_in_P2[nn2_in_P1] == np.arange(len(nn2_in_P1)))
355
+ assert reciprocal_in_P1.sum() == reciprocal_in_P2.sum()
356
+ return reciprocal_in_P2, nn2_in_P1, reciprocal_in_P2.sum()
357
+
358
+
359
+ def get_med_dist_between_poses(poses):
360
+ from scipy.spatial.distance import pdist
361
+ return np.median(pdist([to_numpy(p[:3, 3]) for p in poses]))
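A hedged sketch tying a few of these helpers together: lift a depth map to camera coordinates, then move it to world coordinates with geotrf, mirroring what depthmap_to_absolute_camera_coordinates does; intrinsics and pose values are illustrative.

import numpy as np
from mini_dust3r.utils.geometry import depthmap_to_camera_coordinates, geotrf, inv

H, W = 48, 64
K = np.array([[100.0, 0.0, W / 2],
              [0.0, 100.0, H / 2],
              [0.0, 0.0, 1.0]])
depth = np.full((H, W), 3.0, dtype=np.float32)
depth[:4] = 0.0                                  # a few invalid rows

X_cam, valid = depthmap_to_camera_coordinates(depth, K)
print(X_cam.shape, valid.sum())                  # (48, 64, 3), 2816 valid pixels

cam2world = np.eye(4)
cam2world[:3, 3] = (0.0, 0.0, 1.0)               # camera one metre along +Z
X_world = geotrf(cam2world, X_cam)               # same point map expressed in world coordinates
assert np.allclose(geotrf(inv(cam2world), X_world), X_cam, atol=1e-5)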
mini_dust3r/utils/image.py ADDED
@@ -0,0 +1,141 @@
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+ #
4
+ # --------------------------------------------------------
5
+ # utility functions about images (loading/converting...)
6
+ # --------------------------------------------------------
7
+ import os
8
+ import torch
9
+ import numpy as np
10
+ import PIL.Image
11
+ from PIL.ImageOps import exif_transpose
12
+ import torchvision.transforms as tvf
13
+
14
+ os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1"
15
+ import cv2 # noqa
16
+ from typing import Literal, TypedDict
17
+ from jaxtyping import Float32, Int32
18
+
19
+ try:
20
+ from pillow_heif import register_heif_opener # noqa
21
+
22
+ register_heif_opener()
23
+ heif_support_enabled = True
24
+ except ImportError:
25
+ heif_support_enabled = False
26
+
27
+ ImgNorm = tvf.Compose([tvf.ToTensor(), tvf.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
28
+
29
+
30
+ class ImageDict(TypedDict):
31
+ img: Float32[torch.Tensor, "b c h w"]
32
+ true_shape: tuple[int, int] | Int32[torch.Tensor, "b 2"]
33
+ idx: int | list[int]
34
+ instance: str | list[str]
35
+
36
+
37
+ def imread_cv2(path, options=cv2.IMREAD_COLOR):
38
+ """Open an image or a depthmap with opencv-python."""
39
+ if path.endswith((".exr", "EXR")):
40
+ options = cv2.IMREAD_ANYDEPTH
41
+ img = cv2.imread(path, options)
42
+ if img is None:
43
+ raise IOError(f"Could not load image={path} with {options=}")
44
+ if img.ndim == 3:
45
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
46
+ return img
47
+
48
+
49
+ def rgb(ftensor, true_shape=None):
50
+ if isinstance(ftensor, list):
51
+ return [rgb(x, true_shape=true_shape) for x in ftensor]
52
+ if isinstance(ftensor, torch.Tensor):
53
+ ftensor = ftensor.detach().cpu().numpy() # H,W,3
54
+ if ftensor.ndim == 3 and ftensor.shape[0] == 3:
55
+ ftensor = ftensor.transpose(1, 2, 0)
56
+ elif ftensor.ndim == 4 and ftensor.shape[1] == 3:
57
+ ftensor = ftensor.transpose(0, 2, 3, 1)
58
+ if true_shape is not None:
59
+ H, W = true_shape
60
+ ftensor = ftensor[:H, :W]
61
+ if ftensor.dtype == np.uint8:
62
+ img = np.float32(ftensor) / 255
63
+ else:
64
+ img = (ftensor * 0.5) + 0.5
65
+ return img.clip(min=0, max=1)
66
+
67
+
68
+ def _resize_pil_image(img, long_edge_size):
69
+ S = max(img.size)
70
+ if S > long_edge_size:
71
+ interp = PIL.Image.LANCZOS
72
+ elif S <= long_edge_size:
73
+ interp = PIL.Image.BICUBIC
74
+ new_size = tuple(int(round(x * long_edge_size / S)) for x in img.size)
75
+ return img.resize(new_size, interp)
76
+
77
+
78
+ def load_images(
79
+ folder_or_list: str | list,
80
+ size: Literal[224, 512],
81
+ square_ok: bool = False,
82
+ verbose: bool = True,
83
+ ) -> list[ImageDict]:
84
+ """open and convert all images in a list or folder to proper input format for DUSt3R"""
85
+ if isinstance(folder_or_list, str):
86
+ if verbose:
87
+ print(f">> Loading images from {folder_or_list}")
88
+ root, folder_content = folder_or_list, sorted(os.listdir(folder_or_list))
89
+
90
+ elif isinstance(folder_or_list, list):
91
+ if verbose:
92
+ print(f">> Loading a list of {len(folder_or_list)} images")
93
+ root, folder_content = "", folder_or_list
94
+
95
+ else:
96
+ raise ValueError(f"bad {folder_or_list=} ({type(folder_or_list)})")
97
+
98
+ supported_images_extensions = [".jpg", ".jpeg", ".png"]
99
+ if heif_support_enabled:
100
+ supported_images_extensions += [".heic", ".heif"]
101
+ supported_images_extensions = tuple(supported_images_extensions)
102
+
103
+ imgs = []
104
+ for path in folder_content:
105
+ if not path.lower().endswith(supported_images_extensions):
106
+ continue
107
+ img = exif_transpose(PIL.Image.open(os.path.join(root, path))).convert("RGB")
108
+ W1, H1 = img.size
109
+ if size == 224:
110
+ # resize short side to 224 (then crop)
111
+ img = _resize_pil_image(img, round(size * max(W1 / H1, H1 / W1)))
112
+ else:
113
+ # resize long side to 512
114
+ img = _resize_pil_image(img, size)
115
+ W, H = img.size
116
+ cx, cy = W // 2, H // 2
117
+ if size == 224:
118
+ half = min(cx, cy)
119
+ img = img.crop((cx - half, cy - half, cx + half, cy + half))
120
+ else:
121
+ halfw, halfh = ((2 * cx) // 16) * 8, ((2 * cy) // 16) * 8
122
+ if not (square_ok) and W == H:
123
+ halfh = 3 * halfw / 4
124
+ img = img.crop((cx - halfw, cy - halfh, cx + halfw, cy + halfh))
125
+
126
+ W2, H2 = img.size
127
+ if verbose:
128
+ print(f" - adding {path} with resolution {W1}x{H1} --> {W2}x{H2}")
129
+ imgs.append(
130
+ dict(
131
+ img=ImgNorm(img)[None],
132
+ true_shape=np.int32([img.size[::-1]]),
133
+ idx=len(imgs),
134
+ instance=str(len(imgs)),
135
+ )
136
+ )
137
+
138
+ assert imgs, "no images found at " + root
139
+ if verbose:
140
+ print(f" (Found {len(imgs)} images)")
141
+ return imgs
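A minimal sketch of the loader's behaviour at the 512 setting (long side resized to 512, then cropped so both sides are multiples of 16), assuming the repo's dependencies (torchvision, jaxtyping, PIL) are installed; the two dummy images exist only for the example.

import os
import tempfile
import numpy as np
import PIL.Image
from mini_dust3r.utils.image import load_images

with tempfile.TemporaryDirectory() as tmp:
    for name in ("left.png", "right.png"):
        array = np.random.randint(0, 255, (480, 640, 3), dtype=np.uint8)
        PIL.Image.fromarray(array).save(os.path.join(tmp, name))
    views = load_images(tmp, size=512)

print(views[0]["img"].shape)        # torch.Size([1, 3, 384, 512])
print(views[0]["true_shape"])       # [[384 512]]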
mini_dust3r/utils/misc.py ADDED
@@ -0,0 +1,121 @@
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+ #
4
+ # --------------------------------------------------------
5
+ # utility functions for DUSt3R
6
+ # --------------------------------------------------------
7
+ import torch
8
+
9
+
10
+ def fill_default_args(kwargs, func):
11
+ import inspect # a bit hacky but it works reliably
12
+ signature = inspect.signature(func)
13
+
14
+ for k, v in signature.parameters.items():
15
+ if v.default is inspect.Parameter.empty:
16
+ continue
17
+ kwargs.setdefault(k, v.default)
18
+
19
+ return kwargs
20
+
21
+
22
+ def freeze_all_params(modules):
23
+ for module in modules:
24
+ try:
25
+ for n, param in module.named_parameters():
26
+ param.requires_grad = False
27
+ except AttributeError:
28
+ # module is directly a parameter
29
+ module.requires_grad = False
30
+
31
+
32
+ def is_symmetrized(gt1, gt2):
33
+ x = gt1['instance']
34
+ y = gt2['instance']
35
+ if len(x) == len(y) and len(x) == 1:
36
+ return False # special case of batchsize 1
37
+ ok = True
38
+ for i in range(0, len(x), 2):
39
+ ok = ok and (x[i] == y[i+1]) and (x[i+1] == y[i])
40
+ return ok
41
+
42
+
43
+ def flip(tensor):
44
+ """ flip so that tensor[0::2] <=> tensor[1::2] """
45
+ return torch.stack((tensor[1::2], tensor[0::2]), dim=1).flatten(0, 1)
46
+
47
+
48
+ def interleave(tensor1, tensor2):
49
+ res1 = torch.stack((tensor1, tensor2), dim=1).flatten(0, 1)
50
+ res2 = torch.stack((tensor2, tensor1), dim=1).flatten(0, 1)
51
+ return res1, res2
52
+
53
+
54
+ def transpose_to_landscape(head, activate=True):
55
+ """ Predict in the correct aspect-ratio,
56
+ then transpose the result in landscape
57
+ and stack everything back together.
58
+ """
59
+ def wrapper_no(decout, true_shape):
60
+ B = len(true_shape)
61
+ assert true_shape[0:1].allclose(true_shape), 'true_shape must be all identical'
62
+ H, W = true_shape[0].cpu().tolist()
63
+ res = head(decout, (H, W))
64
+ return res
65
+
66
+ def wrapper_yes(decout, true_shape):
67
+ B = len(true_shape)
68
+ # by definition, the batch is in landscape mode so W >= H
69
+ H, W = int(true_shape.min()), int(true_shape.max())
70
+
71
+ height, width = true_shape.T
72
+ is_landscape = (width >= height)
73
+ is_portrait = ~is_landscape
74
+
75
+ # true_shape = true_shape.cpu()
76
+ if is_landscape.all():
77
+ return head(decout, (H, W))
78
+ if is_portrait.all():
79
+ return transposed(head(decout, (W, H)))
80
+
81
+ # batch is a mix of both portrait & landscape
82
+ def selout(ar): return [d[ar] for d in decout]
83
+ l_result = head(selout(is_landscape), (H, W))
84
+ p_result = transposed(head(selout(is_portrait), (W, H)))
85
+
86
+ # allocate full result
87
+ result = {}
88
+ for k in l_result | p_result:
89
+ x = l_result[k].new(B, *l_result[k].shape[1:])
90
+ x[is_landscape] = l_result[k]
91
+ x[is_portrait] = p_result[k]
92
+ result[k] = x
93
+
94
+ return result
95
+
96
+ return wrapper_yes if activate else wrapper_no
97
+
98
+
99
+ def transposed(dic):
100
+ return {k: v.swapaxes(1, 2) for k, v in dic.items()}
101
+
102
+
103
+ def invalid_to_nans(arr, valid_mask, ndim=999):
104
+ if valid_mask is not None:
105
+ arr = arr.clone()
106
+ arr[~valid_mask] = float('nan')
107
+ if arr.ndim > ndim:
108
+ arr = arr.flatten(-2 - (arr.ndim - ndim), -2)
109
+ return arr
110
+
111
+
112
+ def invalid_to_zeros(arr, valid_mask, ndim=999):
113
+ if valid_mask is not None:
114
+ arr = arr.clone()
115
+ arr[~valid_mask] = 0
116
+ nnz = valid_mask.view(len(valid_mask), -1).sum(1)
117
+ else:
118
+ nnz = arr.numel() // len(arr) if len(arr) else 0 # number of point per image
119
+ if arr.ndim > ndim:
120
+ arr = arr.flatten(-2 - (arr.ndim - ndim), -2)
121
+ return arr, nnz
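A small self-contained sketch of the pair bookkeeping above: interleave builds the symmetric (i, j) / (j, i) ordering that is_symmetrized checks for, and flip swaps the two members of every pair.

import torch
from mini_dust3r.utils.misc import interleave, flip, is_symmetrized

a = torch.tensor([0, 2])            # view1 instances: images 0 and 2
b = torch.tensor([1, 3])            # view2 instances: images 1 and 3
x, y = interleave(a, b)             # x = [0, 1, 2, 3], y = [1, 0, 3, 2]
print(torch.equal(flip(x), y))      # True: flip swaps the two members of every pair

gt1 = {"instance": [str(i) for i in x.tolist()]}
gt2 = {"instance": [str(i) for i in y.tolist()]}
print(is_symmetrized(gt1, gt2))     # True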
mini_dust3r/viz.py ADDED
@@ -0,0 +1,320 @@
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+ #
4
+ # --------------------------------------------------------
5
+ # Visualization utilities using trimesh
6
+ # --------------------------------------------------------
7
+ import PIL.Image
8
+ import numpy as np
9
+ from scipy.spatial.transform import Rotation
10
+ import torch
11
+
12
+ from mini_dust3r.utils.geometry import geotrf, get_med_dist_between_poses
13
+ from mini_dust3r.utils.device import to_numpy
14
+ from mini_dust3r.utils.image import rgb
15
+
16
+ try:
17
+ import trimesh
18
+ except ImportError:
19
+ print('/!\\ module trimesh is not installed, cannot visualize results /!\\')
20
+
21
+
22
+ def cat_3d(vecs):
23
+ if isinstance(vecs, (np.ndarray, torch.Tensor)):
24
+ vecs = [vecs]
25
+ return np.concatenate([p.reshape(-1, 3) for p in to_numpy(vecs)])
26
+
27
+
28
+ def show_raw_pointcloud(pts3d, colors, point_size=2):
29
+ scene = trimesh.Scene()
30
+
31
+ pct = trimesh.PointCloud(cat_3d(pts3d), colors=cat_3d(colors))
32
+ scene.add_geometry(pct)
33
+
34
+ scene.show(line_settings={'point_size': point_size})
35
+
36
+
37
+ def pts3d_to_trimesh(img, pts3d, valid=None):
38
+ H, W, THREE = img.shape
39
+ assert THREE == 3
40
+ assert img.shape == pts3d.shape
41
+
42
+ vertices = pts3d.reshape(-1, 3)
43
+
44
+ # make squares: each pixel == 2 triangles
45
+ idx = np.arange(len(vertices)).reshape(H, W)
46
+ idx1 = idx[:-1, :-1].ravel() # top-left corner
47
+ idx2 = idx[:-1, +1:].ravel() # right-left corner
48
+ idx3 = idx[+1:, :-1].ravel() # bottom-left corner
49
+ idx4 = idx[+1:, +1:].ravel() # bottom-right corner
50
+ faces = np.concatenate((
51
+ np.c_[idx1, idx2, idx3],
52
+ np.c_[idx3, idx2, idx1], # same triangle, but backward (cheap solution to cancel face culling)
53
+ np.c_[idx2, idx3, idx4],
54
+ np.c_[idx4, idx3, idx2], # same triangle, but backward (cheap solution to cancel face culling)
55
+ ), axis=0)
56
+
57
+ # prepare triangle colors
58
+ face_colors = np.concatenate((
59
+ img[:-1, :-1].reshape(-1, 3),
60
+ img[:-1, :-1].reshape(-1, 3),
61
+ img[+1:, +1:].reshape(-1, 3),
62
+ img[+1:, +1:].reshape(-1, 3)
63
+ ), axis=0)
64
+
65
+ # remove invalid faces
66
+ if valid is not None:
67
+ assert valid.shape == (H, W)
68
+ valid_idxs = valid.ravel()
69
+ valid_faces = valid_idxs[faces].all(axis=-1)
70
+ faces = faces[valid_faces]
71
+ face_colors = face_colors[valid_faces]
72
+
73
+ assert len(faces) == len(face_colors)
74
+ return dict(vertices=vertices, face_colors=face_colors, faces=faces)
75
+
76
+
77
+ def cat_meshes(meshes):
78
+ vertices, faces, colors = zip(*[(m['vertices'], m['faces'], m['face_colors']) for m in meshes])
79
+ n_vertices = np.cumsum([0]+[len(v) for v in vertices])
80
+ for i in range(len(faces)):
81
+ faces[i][:] += n_vertices[i]
82
+
83
+ vertices = np.concatenate(vertices)
84
+ colors = np.concatenate(colors)
85
+ faces = np.concatenate(faces)
86
+ return dict(vertices=vertices, face_colors=colors, faces=faces)
87
+
88
+
89
+ def show_duster_pairs(view1, view2, pred1, pred2):
90
+ import matplotlib.pyplot as pl
91
+ pl.ion()
92
+
93
+ for e in range(len(view1['instance'])):
94
+ i = view1['idx'][e]
95
+ j = view2['idx'][e]
96
+ img1 = rgb(view1['img'][e])
97
+ img2 = rgb(view2['img'][e])
98
+ conf1 = pred1['conf'][e].squeeze()
99
+ conf2 = pred2['conf'][e].squeeze()
100
+ score = conf1.mean()*conf2.mean()
101
+ print(f">> Showing pair #{e} {i}-{j} {score=:g}")
102
+ pl.clf()
103
+ pl.subplot(221).imshow(img1)
104
+ pl.subplot(223).imshow(img2)
105
+ pl.subplot(222).imshow(conf1, vmin=1, vmax=30)
106
+ pl.subplot(224).imshow(conf2, vmin=1, vmax=30)
107
+ pts1 = pred1['pts3d'][e]
108
+ pts2 = pred2['pts3d_in_other_view'][e]
109
+ pl.subplots_adjust(0, 0, 1, 1, 0, 0)
110
+ if input('show pointcloud? (y/n) ') == 'y':
111
+ show_raw_pointcloud(cat(pts1, pts2), cat(img1, img2), point_size=5)
112
+
113
+
114
+ def auto_cam_size(im_poses):
115
+ return 0.1 * get_med_dist_between_poses(im_poses)
116
+
117
+
118
+ class SceneViz:
119
+ def __init__(self):
120
+ self.scene = trimesh.Scene()
121
+
122
+ def add_pointcloud(self, pts3d, color, mask=None):
123
+ pts3d = to_numpy(pts3d)
124
+ mask = to_numpy(mask)
125
+ if mask is None:
126
+ mask = [slice(None)] * len(pts3d)
127
+ pts = np.concatenate([p[m] for p, m in zip(pts3d, mask)])
128
+ pct = trimesh.PointCloud(pts.reshape(-1, 3))
129
+
130
+ if isinstance(color, (list, np.ndarray, torch.Tensor)):
131
+ color = to_numpy(color)
132
+ col = np.concatenate([p[m] for p, m in zip(color, mask)])
133
+ assert col.shape == pts.shape
134
+ pct.visual.vertex_colors = uint8(col.reshape(-1, 3))
135
+ else:
136
+ assert len(color) == 3
137
+ pct.visual.vertex_colors = np.broadcast_to(uint8(color), pts.shape)
138
+
139
+ self.scene.add_geometry(pct)
140
+ return self
141
+
142
+ def add_camera(self, pose_c2w, focal=None, color=(0, 0, 0), image=None, imsize=None, cam_size=0.03):
143
+ pose_c2w, focal, color, image = to_numpy((pose_c2w, focal, color, image))
144
+ add_scene_cam(self.scene, pose_c2w, color, image, focal, screen_width=cam_size)
145
+ return self
146
+
147
+ def add_cameras(self, poses, focals=None, images=None, imsizes=None, colors=None, **kw):
148
+ def get(arr, idx): return None if arr is None else arr[idx]
149
+ for i, pose_c2w in enumerate(poses):
150
+ self.add_camera(pose_c2w, get(focals, i), image=get(images, i),
151
+ color=get(colors, i), imsize=get(imsizes, i), **kw)
152
+ return self
153
+
154
+ def show(self, point_size=2):
155
+ self.scene.show(line_settings={'point_size': point_size})
156
+
157
+
158
+ def show_raw_pointcloud_with_cams(imgs, pts3d, mask, focals, cams2world,
159
+ point_size=2, cam_size=0.05, cam_color=None):
160
+ """ Visualization of a pointcloud with cameras
161
+ imgs = (N, H, W, 3) or N-size list of [(H,W,3), ...]
162
+ pts3d = (N, H, W, 3) or N-size list of [(H,W,3), ...]
163
+ focals = (N,) or N-size list of [focal, ...]
164
+ cams2world = (N,4,4) or N-size list of [(4,4), ...]
165
+ """
166
+ assert len(pts3d) == len(mask) <= len(imgs) <= len(cams2world) == len(focals)
167
+ pts3d = to_numpy(pts3d)
168
+ imgs = to_numpy(imgs)
169
+ focals = to_numpy(focals)
170
+ cams2world = to_numpy(cams2world)
171
+
172
+ scene = trimesh.Scene()
173
+
174
+ # full pointcloud
175
+ pts = np.concatenate([p[m] for p, m in zip(pts3d, mask)])
176
+ col = np.concatenate([p[m] for p, m in zip(imgs, mask)])
177
+ pct = trimesh.PointCloud(pts.reshape(-1, 3), colors=col.reshape(-1, 3))
178
+ scene.add_geometry(pct)
179
+
180
+ # add each camera
181
+ for i, pose_c2w in enumerate(cams2world):
182
+ if isinstance(cam_color, list):
183
+ camera_edge_color = cam_color[i]
184
+ else:
185
+ camera_edge_color = cam_color or CAM_COLORS[i % len(CAM_COLORS)]
186
+ add_scene_cam(scene, pose_c2w, camera_edge_color,
187
+ imgs[i] if i < len(imgs) else None, focals[i], screen_width=cam_size)
188
+
189
+ scene.show(line_settings={'point_size': point_size})
190
+
191
+
192
+ def add_scene_cam(scene, pose_c2w, edge_color, image=None, focal=None, imsize=None, screen_width=0.03):
193
+
194
+ if image is not None:
195
+ H, W, THREE = image.shape
196
+ assert THREE == 3
197
+ if image.dtype != np.uint8:
198
+ image = np.uint8(255*image)
199
+ elif imsize is not None:
200
+ W, H = imsize
201
+ elif focal is not None:
202
+ H = W = focal / 1.1
203
+ else:
204
+ H = W = 1
205
+
206
+ if focal is None:
207
+ focal = min(H, W) * 1.1 # default value
208
+ elif isinstance(focal, np.ndarray):
209
+ focal = focal[0]
210
+
211
+ # create fake camera
212
+ height = focal * screen_width / H
213
+ width = screen_width * 0.5**0.5
214
+ rot45 = np.eye(4)
215
+ rot45[:3, :3] = Rotation.from_euler('z', np.deg2rad(45)).as_matrix()
216
+ rot45[2, 3] = -height # set the tip of the cone = optical center
217
+ aspect_ratio = np.eye(4)
218
+ aspect_ratio[0, 0] = W/H
219
+ transform = pose_c2w @ OPENGL @ aspect_ratio @ rot45
220
+ cam = trimesh.creation.cone(width, height, sections=4) # , transform=transform)
221
+
222
+ # this is the image
223
+ if image is not None:
224
+ vertices = geotrf(transform, cam.vertices[[4, 5, 1, 3]])
225
+ faces = np.array([[0, 1, 2], [0, 2, 3], [2, 1, 0], [3, 2, 0]])
226
+ img = trimesh.Trimesh(vertices=vertices, faces=faces)
227
+ uv_coords = np.float32([[0, 0], [1, 0], [1, 1], [0, 1]])
228
+ img.visual = trimesh.visual.TextureVisuals(uv_coords, image=PIL.Image.fromarray(image))
229
+ scene.add_geometry(img)
230
+
231
+ # this is the camera mesh
232
+ rot2 = np.eye(4)
233
+ rot2[:3, :3] = Rotation.from_euler('z', np.deg2rad(2)).as_matrix()
234
+ vertices = np.r_[cam.vertices, 0.95*cam.vertices, geotrf(rot2, cam.vertices)]
235
+ vertices = geotrf(transform, vertices)
236
+ faces = []
237
+ for face in cam.faces:
238
+ if 0 in face:
239
+ continue
240
+ a, b, c = face
241
+ a2, b2, c2 = face + len(cam.vertices)
242
+ a3, b3, c3 = face + 2*len(cam.vertices)
243
+
244
+ # add 3 pseudo-edges
245
+ faces.append((a, b, b2))
246
+ faces.append((a, a2, c))
247
+ faces.append((c2, b, c))
248
+
249
+ faces.append((a, b, b3))
250
+ faces.append((a, a3, c))
251
+ faces.append((c3, b, c))
252
+
253
+ # no culling
254
+ faces += [(c, b, a) for a, b, c in faces]
255
+
256
+ cam = trimesh.Trimesh(vertices=vertices, faces=faces)
257
+ cam.visual.face_colors[:, :3] = edge_color
258
+ scene.add_geometry(cam)
259
+
260
+
261
+ def cat(a, b):
262
+ return np.concatenate((a.reshape(-1, 3), b.reshape(-1, 3)))
263
+
264
+
265
+ OPENGL = np.array([[1, 0, 0, 0],
266
+ [0, -1, 0, 0],
267
+ [0, 0, -1, 0],
268
+ [0, 0, 0, 1]])
269
+
270
+
271
+ CAM_COLORS = [(255, 0, 0), (0, 0, 255), (0, 255, 0), (255, 0, 255), (255, 204, 0), (0, 204, 204),
272
+ (128, 255, 255), (255, 128, 255), (255, 255, 128), (0, 0, 0), (128, 128, 128)]
273
+
274
+
275
+ def uint8(colors):
276
+ if not isinstance(colors, np.ndarray):
277
+ colors = np.array(colors)
278
+ if np.issubdtype(colors.dtype, np.floating):
279
+ colors *= 255
280
+ assert 0 <= colors.min() and colors.max() < 256
281
+ return np.uint8(colors)
282
+
283
+
284
+ def segment_sky(image):
285
+ import cv2
286
+ from scipy import ndimage
287
+
288
+ # Convert to HSV
289
+ image = to_numpy(image)
290
+ if np.issubdtype(image.dtype, np.floating):
291
+ image = np.uint8(255*image.clip(min=0, max=1))
292
+ hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
293
+
294
+ # Define range for blue color and create mask
295
+ lower_blue = np.array([0, 0, 100])
296
+ upper_blue = np.array([30, 255, 255])
297
+ mask = cv2.inRange(hsv, lower_blue, upper_blue).view(bool)
298
+
299
+ # add luminous gray
300
+ mask |= (hsv[:, :, 1] < 10) & (hsv[:, :, 2] > 150)
301
+ mask |= (hsv[:, :, 1] < 30) & (hsv[:, :, 2] > 180)
302
+ mask |= (hsv[:, :, 1] < 50) & (hsv[:, :, 2] > 220)
303
+
304
+ # Morphological operations
305
+ kernel = np.ones((5, 5), np.uint8)
306
+ mask2 = ndimage.binary_opening(mask, structure=kernel)
307
+
308
+ # keep only largest CC
309
+ _, labels, stats, _ = cv2.connectedComponentsWithStats(mask2.view(np.uint8), connectivity=8)
310
+ cc_sizes = stats[1:, cv2.CC_STAT_AREA]
311
+ order = cc_sizes.argsort()[::-1] # bigger first
312
+ i = 0
313
+ selection = []
314
+ while i < len(order) and cc_sizes[order[i]] > cc_sizes[order[0]] / 2:
315
+ selection.append(1 + order[i])
316
+ i += 1
317
+ mask3 = np.in1d(labels, selection).reshape(labels.shape)
318
+
319
+ # Apply mask
320
+ return torch.from_numpy(mask3)
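A hedged sketch of the offline (non-interactive) path through this module: turn one point map into a coloured mesh with pts3d_to_trimesh and export it as GLB, much as the Space's scene-export code does; the flat synthetic plane below is illustrative only.

import numpy as np
import trimesh
from mini_dust3r.viz import pts3d_to_trimesh, cat_meshes, OPENGL

H, W = 32, 48
img = np.random.rand(H, W, 3).astype(np.float32)                   # per-pixel colours in [0, 1]
xy = np.meshgrid(np.arange(W), np.arange(H))
pts3d = np.dstack(xy + [np.full((H, W), 5.0)]).astype(np.float32)  # a flat plane at z = 5
valid = np.ones((H, W), dtype=bool)

mesh = trimesh.Trimesh(**cat_meshes([pts3d_to_trimesh(img, pts3d, valid)]))
scene = trimesh.Scene()
scene.add_geometry(mesh)
scene.apply_transform(OPENGL)                                       # switch to the OpenGL convention
scene.export("scene.glb")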