# Hugging Face Space (status when this file was captured: Running)
import os
import tempfile
from functools import partial
from zipfile import ZipFile

import cv2
import gradio as gr
import numpy as np
import spaces
import torch
import torch.nn.functional as F
import trimesh
from PIL import Image
from torchvision import transforms
from torchvision.transforms import Compose
from transformers import pipeline

from depth_anything.dpt import DepthAnything
from depth_anything.util.transform import Resize, NormalizeImage, PrepareForNet
from geometry import create_triangles

# Kept last, as in the original import order, so that whatever names the
# wildcard import binds still take precedence over the imports above.
from moviepy.editor import *
def zip_files(files_in, files_out):
    """Bundle the original and depth-map frame files into one zip archive.

    Every file is stored flat under its base name, with the entries of
    ``files_in`` written before those of ``files_out``. The archive is always
    written to ``depth_result.zip`` in the current working directory,
    overwriting an existing archive of that name.

    Args:
        files_in: Iterable of paths to the original frame images.
        files_out: Iterable of paths to the depth-map frame images.

    Returns:
        The archive path, ``"depth_result.zip"``.
    """
    archive_path = "depth_result.zip"
    with ZipFile(archive_path, "w") as archive:
        for group in (files_in, files_out):
            for path in group:
                # os.path.basename honours the platform's path separators,
                # which splitting on "/" by hand does not.
                archive.write(path, os.path.basename(path))
    return archive_path
def create_video(frames, fps, type):
    """Encode a sequence of frame images into ``<type>_result.mp4``.

    Args:
        frames: Frame sequence handed unchanged to ``ImageSequenceClip``.
        fps: Frame rate used both for the clip and for the written file.
        type: Prefix of the output file name (e.g. ``"depth"``).

    Returns:
        The name of the written video file, ``type + "_result.mp4"``,
        relative to the current working directory.
    """
    print("building video result")
    sequence = ImageSequenceClip(frames, fps=fps)
    output_name = type + "_result.mp4"
    sequence.write_videofile(output_name, fps=fps)
    return output_name
def predict_depth(model, image):
    """Run ``model`` on ``image`` and return the ``"depth"`` entry of its output.

    Args:
        model: Callable that accepts ``image`` and returns a mapping with a
            ``"depth"`` key.
        image: Input passed unchanged to ``model``.

    Returns:
        Whatever ``model`` stored under ``"depth"``.
    """
    prediction = model(image)
    return prediction["depth"]
def _list_input_videos(video_path):
    """Resolve ``video_path`` to the list of video files to process."""
    if os.path.isfile(video_path):
        if video_path.endswith('txt'):
            with open(video_path, 'r') as f:
                # One video path per line; blank lines are skipped.
                return [line for line in f.read().splitlines() if line.strip()]
        return [video_path]
    filenames = [
        os.path.join(video_path, name)
        for name in os.listdir(video_path)
        if not name.startswith('.')
    ]
    filenames.sort()
    return filenames


def _depth_image_for_frame(depth_model, to_tensor, raw_frame, frame_height, frame_width):
    """Return the 3-channel grayscale depth image for one BGR video frame."""
    # Convert straight from the uint8 frame; no float round trip is needed
    # to build the PIL image the pipeline consumes.
    frame_pil = Image.fromarray(cv2.cvtColor(raw_frame, cv2.COLOR_BGR2RGB))
    depth = to_tensor(predict_depth(depth_model, frame_pil))
    depth = F.interpolate(depth[None], (frame_height, frame_width), mode='bilinear', align_corners=False)[0, 0]

    # Stretch to 0..255. A constant depth map has zero range, so the
    # denominator is clamped to avoid 0/0 -> NaN before the uint8 cast.
    depth_min = depth.min()
    depth_range = (depth.max() - depth_min).clamp(min=1e-8)
    depth = (depth - depth_min) / depth_range * 255.0
    depth = depth.cpu().numpy().astype(np.uint8)

    depth_color = cv2.applyColorMap(depth, cv2.COLORMAP_BONE)
    depth_gray = cv2.cvtColor(depth_color, cv2.COLOR_RGBA2GRAY)
    depth_color = cv2.cvtColor(depth_gray, cv2.COLOR_GRAY2RGB)

    # Remove white border around map: every pixel whose three channels are
    # all in 250..255 is painted black.
    white_lo = np.array([250, 250, 250])
    white_hi = np.array([255, 255, 255])
    mask = cv2.inRange(depth_color, white_lo, white_hi)
    depth_color[mask > 0] = (0, 0, 0)
    return depth_color


def make_video(video_path, outdir='./vis_video_depth', encoder='vits'):
    """Estimate a depth map for every frame of a video and package the results.

    Frames are written to the current working directory as ``f<N>.jpg``
    (original) and ``f<N>_dmap.jpg`` (depth), the depth frames are encoded
    into a video with ``create_video`` and all frames are archived with
    ``zip_files``. When several videos are given they are processed in turn,
    reusing the same output file names, and the results of the last one are
    returned.

    Args:
        video_path: A video file, a ``.txt`` file listing one video path per
            line, or a directory whose non-hidden entries are videos.
        outdir: Unused; kept so existing callers keep working.
        encoder: ``"vits"``, ``"vitb"`` or ``"vitl"``; any other value falls
            back to ``"vits"``.

    Returns:
        Tuple ``(depth_video_path, zip_path, first_original_frame_path,
        first_depth_frame_path)``.

    Raises:
        gr.Error: If no input video is found, a file is larger than 128 MB,
            or no frame can be read from a video.
    """
    if encoder not in ["vitl", "vitb", "vits"]:
        encoder = "vits"
    mapper = {"vits": "small", "vitb": "base", "vitl": "large"}

    # Resolve the inputs before loading the model so bad paths fail fast.
    filenames = _list_input_videos(video_path)
    if not filenames:
        raise gr.Error(f'No video found at {video_path}')

    to_tensor_transform = transforms.ToTensor()
    depth_anything = pipeline(task="depth-estimation", model=f"nielsr/depth-anything-{mapper[encoder]}")

    result = None
    for k, filename in enumerate(filenames):
        file_size = os.path.getsize(filename) / 1024 / 1024
        if file_size > 128.0:
            message = f'File size of {filename} larger than 128Mb, sorry!'
            print(message)
            # Callers wire four outputs to this function, so a bare string
            # return cannot be displayed; surface the problem as an error.
            raise gr.Error(message)

        print('Progress {:}/{:},'.format(k+1, len(filenames)), 'Processing', filename)

        raw_video = cv2.VideoCapture(filename)
        try:
            frame_width = int(raw_video.get(cv2.CAP_PROP_FRAME_WIDTH))
            frame_height = int(raw_video.get(cv2.CAP_PROP_FRAME_HEIGHT))
            # Guard against containers that report a frame rate below 1.
            frame_rate = max(int(raw_video.get(cv2.CAP_PROP_FPS)), 1)
            cframes = int(raw_video.get(cv2.CAP_PROP_FRAME_COUNT))
            print(f'frames: {cframes}, fps: {frame_rate}')

            count = 0
            depth_frames = []
            orig_frames = []
            while raw_video.isOpened():
                ret, raw_frame = raw_video.read()
                if not ret:
                    break

                depth_color = _depth_image_for_frame(
                    depth_anything, to_tensor_transform, raw_frame, frame_height, frame_width
                )

                cv2.imwrite(f"f{count}.jpg", raw_frame)
                orig_frames.append(f"f{count}.jpg")
                cv2.imwrite(f"f{count}_dmap.jpg", depth_color)
                depth_frames.append(f"f{count}_dmap.jpg")
                count += 1
        finally:
            # Release the capture even when a frame fails to process.
            raw_video.release()
        cv2.destroyAllWindows()

        if not depth_frames:
            raise gr.Error(f'No frames could be read from {filename}')

        final_vid = create_video(depth_frames, frame_rate, "depth")
        final_zip = zip_files(orig_frames, depth_frames)
        result = (final_vid, final_zip, orig_frames[0], depth_frames[0])

    return result
def depth_edges_mask(depth):
    """Flag the pixels where the depth map changes sharply.

    Args:
        depth: 2D numpy array of shape (H, W) with dtype float32.

    Returns:
        2D numpy array of shape (H, W) with dtype bool; True where the
        gradient magnitude is greater than 0.05.
    """
    edge_threshold = 0.05
    # np.gradient yields one derivative array per axis: rows, then columns.
    grad_rows, grad_cols = np.gradient(depth)
    magnitude = np.sqrt(np.square(grad_rows) + np.square(grad_cols))
    return magnitude > edge_threshold
def pano_depth_to_world_points(depth, scale):
    """
    Turn an equirectangular (360 degree) depth map into 3D points.

    The depth value is converted to a radius as ``(255 - depth) * scale``, so
    a depth of 255 maps to radius 0. Columns span longitude 0 to 2*pi and
    rows span latitude 0 to pi; every pixel additionally receives a random
    offset of at most pi/W in longitude and pi/(2*H) in latitude, so the
    output is not deterministic unless numpy's global RNG is seeded.

    Returns a two-element list: an (H*W, 3) array of x, y, z coordinates and
    an (H*W, 2) array of the jittered (longitude, latitude) pairs.
    """
    n_rows = depth.shape[0]
    n_cols = depth.shape[1]
    n_pixels = n_rows * n_cols

    # Depth acts as the (inverted) distance from the sphere's centre.
    radius = (255 - depth.flatten()) * scale

    # Per-pixel angular jitter; longitude is drawn first, then latitude.
    lon_jitter = (np.random.rand(n_pixels) - 0.5) * np.pi*2 / n_cols
    lat_jitter = (np.random.rand(n_pixels) - 0.5) * np.pi / n_rows

    # Regular angular grid matching the pixel layout.
    lon_axis = np.linspace(0, np.pi*2, n_cols)
    lat_axis = np.linspace(0, np.pi, n_rows)
    lon_grid, lat_grid = np.meshgrid(lon_axis, lat_axis)
    lon = lon_grid.flatten() + lon_jitter
    lat = lat_grid.flatten() + lat_jitter

    # Spherical to cartesian, with y along the polar axis.
    sin_lat = np.sin(lat)
    x = radius * np.cos(lon) * sin_lat
    y = radius * np.cos(lat)
    z = radius * np.sin(lon) * sin_lat

    pts3d = np.stack([x, y, z], axis=1)
    uv = np.stack([lon, lat], axis=1)
    return [pts3d, uv]
def rgb2gray(rgb):
    """Collapse the first three channels of ``rgb`` into one gray channel.

    The last axis is treated as the channel axis; its first three entries
    are each weighted by 0.333 and summed, and any further channel (such as
    alpha) is ignored.
    """
    channel_weights = [0.333, 0.333, 0.333]
    color_channels = rgb[..., :3]
    return np.dot(color_channels, channel_weights)
def get_mesh(image, depth, scale):
    """Build a coloured 3D point cloud from a frame and its depth map.

    Despite the name, the result is a ``trimesh.PointCloud`` (one point per
    pixel, no faces) wrapped in a scene and exported as a ``.glb`` file.

    Args:
        image: RGB frame as a numpy array; supplies the point colours.
        depth: Depth image as a numpy array with at least three channels;
            it is reduced to gray with ``rgb2gray`` and treated as a 360
            degree depth map by ``pano_depth_to_world_points``.
        scale: Factor applied when converting depth values to radii.

    Returns:
        Path of the exported ``.glb`` file. The file is created with
        ``delete=False`` and is not removed by this function.

    Raises:
        gr.Error: If ``image`` or ``depth`` is missing.
    """
    if image is None or depth is None:
        # Without this guard a missing input fails with an opaque TypeError.
        raise gr.Error("A frame and its depth map are required to render the 3D view.")

    gdepth = rgb2gray(depth)
    print('depth to gray - ok')

    # Only the 3D points are needed here; the angular coordinates are unused.
    pts3d, _ = pano_depth_to_world_points(gdepth, scale)
    print('radius from depth - ok')

    # One vertex per pixel, coloured with the RGB values of the image.
    verts = pts3d.reshape(-1, 3)
    rgba = cv2.cvtColor(image, cv2.COLOR_RGB2RGBA)
    colors = rgba.reshape(-1, 4)
    mesh = trimesh.PointCloud(verts, colors=colors)
    scene = trimesh.Scene([mesh])
    print('mesh - ok')

    # Save as glb. The temporary file only reserves a unique name; its handle
    # is closed before exporting so it neither leaks nor blocks the write.
    with tempfile.NamedTemporaryFile(suffix='.glb', delete=False) as glb_file:
        glb_path = glb_file.name
    scene.export(glb_path)
    print('file - ok')
    return glb_path
def loadurl(url):
    """Return ``url`` unchanged.

    Used as a Gradio callback so that text typed into the URL box becomes
    the value of the video input.
    """
    video_source = url
    return video_source
# Stylesheet handed to gr.Blocks: caps the height of the image display areas.
css = """
#img-display-container {
max-height: 100vh;
}
#img-display-input {
max-height: 80vh;
}
#img-display-output {
max-height: 80vh;
}
"""

# Markdown shown at the top of the page.
title = "# Depth Anything Video Demo"
description = """Depth Anything on full video files.
Please refer to our [paper](https://arxiv.org/abs/2401.10891), [project page](https://depth-anything.github.io), or [github](https://github.com/LiheYoung/Depth-Anything) for more details.
Mesh rendering from [ZoeDepth](https://huggingface.co/spaces/shariqfarooq/ZoeDepth) ([github](https://github.com/isl-org/ZoeDepth/tree/main/ui))."""

# Module-level preprocessing chain: resize (518 lower bound, aspect ratio
# kept, sides a multiple of 14), normalise with the given mean/std, then
# PrepareForNet.
transform = Compose([
    Resize(
        width=518,
        height=518,
        resize_target=False,
        keep_aspect_ratio=True,
        ensure_multiple_of=14,
        resize_method='lower_bound',
        image_interpolation_method=cv2.INTER_CUBIC,
    ),
    NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    PrepareForNet(),
])
# @torch.no_grad() | |
# def predict_depth(model, image): | |
# return model(image) | |
# Page layout: inputs and rendering controls in the left column, model
# choice and results in the right column.
with gr.Blocks(css=css) as demo:
    gr.Markdown(title)
    gr.Markdown(description)
    gr.Markdown("### Video Depth Prediction demo")

    with gr.Row():
        with gr.Column():
            input_url = gr.Textbox(value="./examples/streetview.mp4", label="URL")
            input_video = gr.Video(label="Input Video", format="mp4")
            # Typing a path/URL loads it into the video input.
            input_url.change(fn=loadurl, inputs=[input_url], outputs=[input_video])
            submit = gr.Button("Submit")
            output_frame = gr.Image(label="Frame", type='numpy')
            output_depth = gr.Image(label="Depth", type='numpy')
            scale = gr.Slider(label="Scale", minimum=0.03125, maximum=2, step=0.03125, value=0.5)
            render = gr.Button("Render")
        with gr.Column():
            model_type = gr.Dropdown([("small", "vits"), ("base", "vitb"), ("large", "vitl")], type="value", value="vits", label='Model Type')
            processed_video = gr.Video(label="Output Video", format="mp4")
            processed_zip = gr.File(label="Output Archive")
            result = gr.Model3D(label="3D Mesh", clear_color=[0.0, 0.0, 0.0, 0.0], camera_position=[0, 90, 0])

    def on_submit(uploaded_video, model_type="vits"):
        """Run depth estimation on the uploaded video.

        ``model_type`` defaults to ``"vits"`` (the dropdown's initial value)
        because the examples below supply only the video input; without a
        default, running an example would fail with a missing-argument
        TypeError.

        Returns whatever ``make_video`` returns: the depth video, the zip
        archive, the first original frame and the first depth frame.
        """
        # Process the video and get the paths of the generated outputs
        output_video_path = make_video(uploaded_video, encoder=model_type)
        return output_video_path

    submit.click(on_submit, inputs=[input_video, model_type], outputs=[processed_video, processed_zip, output_frame, output_depth])
    render.click(partial(get_mesh), inputs=[output_frame, output_depth, scale], outputs=[result])

    # Every file in ./examples becomes a clickable example.
    example_files = os.listdir('examples')
    example_files.sort()
    example_files = [os.path.join('examples', filename) for filename in example_files]
    examples = gr.Examples(examples=example_files, inputs=[input_video], outputs=[processed_video, processed_zip, output_frame, output_depth], fn=on_submit, cache_examples=True)


if __name__ == '__main__':
    demo.queue().launch()