# Source: Hugging Face Space (status when captured: Running)
import gradio as gr | |
import cv2 | |
from PIL import Image | |
import numpy as np | |
from transformers import pipeline | |
import os | |
import torch | |
import torch.nn.functional as F | |
from torchvision import transforms | |
from torchvision.transforms import Compose | |
import trimesh | |
from geometry import create_triangles | |
import tempfile | |
from functools import partial | |
import spaces | |
from zipfile import ZipFile | |
from depth_anything.dpt import DepthAnything | |
from depth_anything.util.transform import Resize, NormalizeImage, PrepareForNet | |
from moviepy.editor import * | |
# Index of the gallery frame most recently selected in the UI. It is written by
# select_frame() and read by get_mesh() to decide which frame to render in 3D.
frame_selected: int = 0
def zip_files(files_in, files_out):
    """Bundle the frame images and their depth maps into one zip archive.

    Every file is stored flat under its base name, so inputs that share a
    base name end up as duplicate archive entries.

    Args:
        files_in: Iterable of paths to the processed frame images.
        files_out: Iterable of paths to the matching depth-map images.

    Returns:
        The archive path, always ``"depth_result.zip"`` relative to the
        current working directory. An existing archive is overwritten.
    """
    archive_name = "depth_result.zip"
    with ZipFile(archive_name, "w") as archive:
        for path in (*files_in, *files_out):
            # os.path.basename honours the platform's path separator, unlike
            # splitting on a hard-coded "/".
            archive.write(path, os.path.basename(path))
    return archive_name
def create_video(frames, fps, type):
    """Encode a sequence of frames into an MP4 file in the working directory.

    Args:
        frames: Frame sequence passed straight to ``ImageSequenceClip``
            (make_video passes a list of image file names).
        fps: Frame rate used both for the clip and for the written file.
        type: Prefix of the output file name, e.g. ``"depth"``. The name
            shadows the builtin but is kept so keyword callers still work.

    Returns:
        The output file name, ``type + "_result.mp4"``.
    """
    print("building video result")
    sequence = ImageSequenceClip(frames, fps=fps)
    output_name = type + "_result.mp4"
    sequence.write_videofile(output_name, fps=fps)
    return output_name
def predict_depth(model, image):
    """Run ``model`` on ``image`` and return the ``"depth"`` entry of its output.

    Args:
        model: Callable that takes the image and returns a mapping with a
            ``"depth"`` key.
        image: Input handed to ``model`` unchanged.

    Returns:
        Whatever ``model`` stored under ``"depth"``.
    """
    prediction = model(image)
    return prediction["depth"]
def _list_video_files(video_path):
    """Expand ``video_path`` into the list of video files to process.

    A file ending in ``txt`` is read as one video path per line, any other
    file is taken as the single video, and anything that is not a file is
    listed as a directory (non-hidden entries, sorted).
    """
    if os.path.isfile(video_path):
        if video_path.endswith('txt'):
            with open(video_path, 'r') as f:
                # The lines used to be read but never assigned to the file
                # list, so a .txt input crashed with a NameError.
                return [line for line in f.read().splitlines() if line.strip()]
        return [video_path]

    filenames = [
        os.path.join(video_path, filename)
        for filename in os.listdir(video_path)
        if not filename.startswith('.')
    ]
    filenames.sort()
    return filenames


def _blur_by_depth(raw_frame, depth_color):
    """Return a copy of ``raw_frame`` blurred more strongly in darker depth bands.

    The gray depth map is cut into 16 bands of width 16, from 240 down to 0.
    Each band is filled from a Gaussian blur of the frame whose kernel grows
    from 1 (no blur) to 31 as the band value decreases.
    """
    blur_frame = raw_frame.copy()
    band_lo = 240
    # GaussianBlur only accepts positive odd kernel sizes, so step by 2. The
    # old loop reassigned the loop variable, which a ``for`` simply overwrites,
    # and therefore handed OpenCV even sizes.
    for kernel in range(1, 33, 2):
        blur_lo = np.array([band_lo, band_lo, band_lo])
        blur_hi = np.array([band_lo + 16, band_lo + 16, band_lo + 16])
        blur_mask = cv2.inRange(depth_color, blur_lo, blur_hi)
        print(f'kernel size {kernel}')
        blur = cv2.GaussianBlur(raw_frame, (kernel, kernel), 0)
        blur_frame[blur_mask > 0] = blur[blur_mask > 0]
        band_lo -= 16
    return blur_frame


def make_video(video_path, outdir='./vis_video_depth', encoder='vits'):
    """Estimate depth for every frame of a video and build the result files.

    For each frame the function writes ``f{n}.jpg`` (the frame blurred
    according to depth) and ``f{n}_dmap.jpg`` (the gray depth map) into the
    current working directory, then assembles the depth maps into
    ``depth_result.mp4`` and zips all frame files into ``depth_result.zip``.

    Args:
        video_path: A video file, a ``txt`` file listing one video path per
            line, or a directory of videos.
        outdir: Unused; kept for backward compatibility with existing callers.
        encoder: ``"vits"``, ``"vitb"`` or ``"vitl"``; any other value falls
            back to ``"vits"``.

    Returns:
        ``(final_vid, final_zip, orig_frames, depth_frames)``: the depth video
        path, the zip path, the list of blurred-frame image paths and the list
        of depth-map image paths. With several inputs the values belong to the
        last one processed. If an input is larger than 128 MB, that file's
        path is returned on its own instead.
    """
    if encoder not in ["vitl", "vitb", "vits"]:
        encoder = "vits"
    mapper = {"vits": "small", "vitb": "base", "vitl": "large"}

    to_tensor_transform = transforms.ToTensor()
    depth_anything = pipeline(task = "depth-estimation", model=f"nielsr/depth-anything-{mapper[encoder]}")

    filenames = _list_video_files(video_path)

    # Defined up front so an empty input list returns empty results instead of
    # failing on unbound names at the final return.
    final_vid = None
    final_zip = None
    orig_frames = []
    depth_frames = []

    for k, filename in enumerate(filenames):
        file_size = os.path.getsize(filename)/1024/1024
        if file_size > 128.0:
            print(f'File size of {filename} larger than 128Mb, sorry!')
            # NOTE(review): this returns a single string while the normal path
            # returns four values; the Gradio handler in this file expects
            # four outputs. Left as-is to keep the existing contract.
            return filename

        print('Progress {:}/{:},'.format(k+1, len(filenames)), 'Processing', filename)

        raw_video = cv2.VideoCapture(filename)
        try:
            frame_width = int(raw_video.get(cv2.CAP_PROP_FRAME_WIDTH))
            frame_height = int(raw_video.get(cv2.CAP_PROP_FRAME_HEIGHT))
            # Some containers report 0 fps; clamp so the video writer gets a valid rate.
            frame_rate = max(int(raw_video.get(cv2.CAP_PROP_FPS)), 1)
            cframes = int(raw_video.get(cv2.CAP_PROP_FRAME_COUNT))
            print(f'frames: {cframes}, fps: {frame_rate}')

            count = 0
            depth_frames = []
            orig_frames = []

            while raw_video.isOpened():
                ret, raw_frame = raw_video.read()
                if not ret:
                    break

                # Only the PIL frame is passed to the pipeline. Converting the
                # uint8 frame directly avoids the old /255 ... *255 round trip,
                # whose truncating cast could shift pixel values by one.
                frame_pil = Image.fromarray(cv2.cvtColor(raw_frame, cv2.COLOR_BGR2RGB))

                depth = to_tensor_transform(predict_depth(depth_anything, frame_pil))
                depth = F.interpolate(depth[None], (frame_height, frame_width), mode='bilinear', align_corners=False)[0, 0]

                # Stretch to 0..255; a perfectly flat map would divide by zero.
                depth_min = depth.min()
                depth_span = depth.max() - depth_min
                if depth_span > 0:
                    depth = (depth - depth_min) / depth_span * 255.0
                else:
                    depth = torch.zeros_like(depth)

                depth = depth.cpu().numpy().astype(np.uint8)
                depth_color = cv2.applyColorMap(depth, cv2.COLORMAP_BONE)
                depth_gray = cv2.cvtColor(depth_color, cv2.COLOR_RGBA2GRAY)
                depth_color = cv2.cvtColor(depth_gray, cv2.COLOR_GRAY2RGB)

                # Remove white border around map: select near-white pixels
                # and paint them black.
                white_lo = np.array([250,250,250])
                white_hi = np.array([255,255,255])
                mask = cv2.inRange(depth_color, white_lo, white_hi)
                depth_color[mask>0] = (0,0,0)

                blur_frame = _blur_by_depth(raw_frame, depth_color)

                cv2.imwrite(f"f{count}.jpg", blur_frame)
                orig_frames.append(f"f{count}.jpg")

                cv2.imwrite(f"f{count}_dmap.jpg", depth_color)
                depth_frames.append(f"f{count}_dmap.jpg")

                count += 1
        finally:
            # Release the capture even if a frame fails to process.
            raw_video.release()

        final_vid = create_video(depth_frames, frame_rate, "depth")
        final_zip = zip_files(orig_frames, depth_frames)
        cv2.destroyAllWindows()

    return final_vid, final_zip, orig_frames, depth_frames #output_path
def depth_edges_mask(depth, threshold=0.05):
    """Returns a mask of edges in the depth map.

    Args:
        depth: 2D numpy array of shape (H, W).
        threshold: Gradient magnitude above which a pixel counts as an edge.
            Defaults to 0.05, the value that used to be hard-coded.

    Returns:
        mask: 2D numpy array of shape (H, W) with dtype bool.
    """
    # np.gradient returns one array per axis in axis order: rows (y) first,
    # then columns (x).
    depth_dy, depth_dx = np.gradient(depth)
    # Compute the gradient magnitude.
    depth_grad = np.sqrt(depth_dx ** 2 + depth_dy ** 2)
    # Compute the edge mask.
    return depth_grad > threshold
def pano_depth_to_world_points(depth, scale):
    """
    360 depth to world points
    given 2D depth is an equirectangular projection of a spherical image
    Treat (255 - depth) * scale as radius

    longitude : 0 to 2*pi across the columns
    latitude : 0 to pi down the rows

    Returns a list [pts3d, uv]. pts3d has one (x, y, z) row per pixel,
    preceded by a placeholder row at the origin; uv has the matching
    (longitude, latitude) rows, preceded by a (0, 0) row.
    """
    height, width = depth.shape[0], depth.shape[1]

    # Convert depth to radius
    radius = (255 - depth.flatten()) * scale

    lon_grid, lat_grid = np.meshgrid(
        np.linspace(0, np.pi*2, width),
        np.linspace(0, np.pi, height),
    )
    lon = lon_grid.flatten()
    lat = lat_grid.flatten()

    # Convert to cartesian coordinates
    x = radius * np.cos(lon) * np.sin(lat)
    y = radius * np.cos(lat)
    z = radius * np.sin(lon) * np.sin(lat)

    pixel_points = np.stack([x, y, z], axis=1)
    pixel_uv = np.stack([lon, lat], axis=1)

    # The leading origin / (0, 0) rows are part of the returned layout.
    pts3d = np.concatenate(([[0,0,0]], pixel_points), axis=0)
    uv = np.concatenate(([[0,0]], pixel_uv), axis=0)

    return [pts3d, uv]
def rgb2gray(rgb):
    """Collapse the first three channels into one using equal 0.333 weights.

    Any channels after the third (e.g. alpha) are ignored.
    """
    channel_weights = [0.333, 0.333, 0.333]
    color_channels = rgb[..., :3]
    return np.dot(color_channels, channel_weights)
def get_mesh(image, depth):
    """Build a coloured point cloud for the selected frame and export it as GLB.

    The frame index comes from the module-level ``frame_selected``.

    Args:
        image: Indexable of frame entries; ``image[i][0]`` is used as the RGB
            pixel array of frame ``i`` (presumably the (image, caption) pairs
            of a numpy-typed gr.Gallery; confirm against the caller).
        depth: Same layout as ``image``, holding the depth-map frames.

    Returns:
        Path of a newly created temporary ``.glb`` file. The file is not
        deleted automatically.
    """
    # The selection may be stale if a shorter video was processed since the
    # last click, so clamp it to the frames actually available.
    fnum = min(frame_selected, len(image) - 1, len(depth) - 1)

    gdepth = rgb2gray(depth[fnum][0])
    print('depth to gray - ok')
    pts3d, _uv = pano_depth_to_world_points(gdepth, 1)
    print('radius from depth - ok')

    # One vertex per pixel, coloured with the RGB value of the image.
    verts = pts3d.reshape(-1, 3)
    rgba = cv2.cvtColor(image[fnum][0], cv2.COLOR_RGB2RGBA)
    colors = rgba.reshape(-1, 4)
    # pano_depth_to_world_points puts an extra vertex at the origin first;
    # give it a transparent gray so vertices and colours stay aligned.
    clrs = np.concatenate(([[128, 128, 128, 0]], colors), axis=0)

    mesh = trimesh.PointCloud(verts, colors=clrs)
    scene = trimesh.Scene([mesh])
    print('mesh - ok')

    # Save as glb. Only the unique name is needed, so close the handle right
    # away instead of leaking it; delete=False keeps the file on disk.
    with tempfile.NamedTemporaryFile(suffix='.glb', delete=False) as glb_file:
        glb_path = glb_file.name
    scene.export(glb_path)
    print('file - ok')
    return glb_path
def loadurl(url):
    """Return ``url`` unchanged.

    Used as the change handler that copies the URL textbox value into the
    video input component.
    """
    video_source = url
    return video_source
def select_frame(evt: gr.SelectData):
    """Remember the clicked frame and mirror the selection in the other gallery.

    Args:
        evt: Gradio selection event; ``evt.index`` is the clicked item.

    Returns:
        A gallery update that selects ``evt.index`` in preview mode. It is
        returned on every call: falling through without a return value when
        the index is unchanged would send ``None`` to the output gallery.
    """
    # NOTE: module-level state, so the selection is shared by every session.
    global frame_selected
    frame_selected = evt.index
    return gr.Gallery(selected_index=evt.index, preview=True)
# Extra CSS passed to gr.Blocks(css=...); caps the height of the elements with
# these ids.
css = """
#img-display-container {
max-height: 100vh;
}
#img-display-input {
max-height: 80vh;
}
#img-display-output {
max-height: 80vh;
}
"""
# Markdown shown at the top of the page.
title = "# Depth Anything Video Demo"
description = """Depth Anything on full video files.
Please refer to our [paper](https://arxiv.org/abs/2401.10891), [project page](https://depth-anything.github.io), or [github](https://github.com/LiheYoung/Depth-Anything) for more details.
Mesh rendering from [ZoeDepth](https://huggingface.co/spaces/shariqfarooq/ZoeDepth) ([github](https://github.com/isl-org/ZoeDepth/tree/main/ui))."""
# Preprocessing chain built from the depth_anything transforms: resize with a
# 518 px lower bound on sides that are multiples of 14, normalise with the
# given mean/std, then PrepareForNet.
# NOTE(review): no code visible in this file reads this module-level name;
# confirm before removing.
transform = Compose([
    Resize(
        width=518,
        height=518,
        resize_target=False,
        keep_aspect_ratio=True,
        ensure_multiple_of=14,
        resize_method='lower_bound',
        image_interpolation_method=cv2.INTER_CUBIC,
    ),
    NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    PrepareForNet(),
])
# @torch.no_grad() | |
# def predict_depth(model, image): | |
# return model(image) | |
# Build the Gradio UI. Left column: video input and the two frame galleries.
# Right column: model choice, result files, the 3D viewer and small HTML
# controls that drive the viewer's Babylon.js scene from the browser.
with gr.Blocks(css=css) as demo:
    gr.Markdown(title)
    gr.Markdown(description)
    gr.Markdown("### Video Depth Prediction demo")
    with gr.Row():
        with gr.Column():
            input_url = gr.Textbox(value="./examples/streetview.mp4", label="URL")
            input_video = gr.Video(label="Input Video", format="mp4")
            # Editing the textbox copies its value into the video component.
            input_url.change(fn=loadurl, inputs=[input_url], outputs=[input_video])
            output_frame = gr.Gallery(label="Frame", type='numpy', preview=True, columns=8192)
            output_depth = gr.Gallery(label="Depth", type='numpy', preview=True, columns=8192, interactive=False)
            # Selecting a frame in one gallery selects the same index in the other.
            output_frame.select(fn=select_frame, inputs=None, outputs=[output_depth], show_progress='hidden')
            output_depth.select(fn=select_frame, inputs=None, outputs=[output_frame], show_progress='hidden')
            submit = gr.Button("Submit")
        with gr.Column():
            model_type = gr.Dropdown([("small", "vits"), ("base", "vitb"), ("large", "vitl")], type="value", value="vits", label='Model Type')
            processed_video = gr.Video(label="Output Video", format="mp4")
            processed_zip = gr.File(label="Output Archive")
            result = gr.Model3D(label="3D Mesh", clear_color=[0.5, 0.5, 0.5, 0.0], camera_position=[0, 90, 0])
            # Zoom slider: sets the active camera's fov, derives the point size
            # from it, lazily creates a DefaultRenderingPipeline stored in the
            # camera metadata, and applies a CSS blur to the viewer canvas.
            html = gr.HTML(value="""<label for='zoom'>Zoom</label><input id='zoom' type='range' style='width:256px;height:1em;' min='0.157' max='1.57' step='0.001' oninput='
BABYLON.Engine.LastCreatedScene.getNodes()[1].material.pointSize = Math.ceil(Math.log2(Math.PI/this.value));
if (!BABYLON.Engine.LastCreatedScene.activeCamera.metadata) {
BABYLON.Engine.LastCreatedScene.activeCamera.metadata = {
screenshot: true,
pipeline: new BABYLON.DefaultRenderingPipeline(\"default\", true, BABYLON.Engine.LastCreatedScene, [BABYLON.Engine.LastCreatedScene.activeCamera])
}
}
BABYLON.Engine.LastCreatedScene.activeCamera.metadata.pipeline.samples = 4;
BABYLON.Engine.LastCreatedScene.activeCamera.fov = this.value;
this.parentNode.childNodes[2].innerText = BABYLON.Engine.LastCreatedScene.activeCamera.fov;
document.getElementsByClassName(\"model3D\")[0].getElementsByTagName(\"canvas\")[0].style.filter = \"blur(\" + BABYLON.Engine.LastCreatedScene.getNodes()[1].material.pointSize/Math.sqrt(2.0) + \"px)\";
'/><span>0.8</span>""")
            # Link that sets the active camera's radius to 0.
            camera = gr.HTML(value="<a href='#' onclick=\"BABYLON.Engine.LastCreatedScene.activeCamera.radius=0;\">reset camera</a>")
            # Contrast slider: writes pipeline.imageProcessing.contrast.
            contrast = gr.HTML(value="""<label for='contrast'>Contrast</label><input id='contrast' type='range' style='width:256px;height:1em;' min='0' max='2' step='0.001' oninput='
if (!BABYLON.Engine.LastCreatedScene.activeCamera.metadata) {
BABYLON.Engine.LastCreatedScene.activeCamera.metadata = {
screenshot: true,
pipeline: new BABYLON.DefaultRenderingPipeline(\"default\", true, BABYLON.Engine.LastCreatedScene, [BABYLON.Engine.LastCreatedScene.activeCamera])
}
}
BABYLON.Engine.LastCreatedScene.activeCamera.metadata.pipeline.samples = 4;
BABYLON.Engine.LastCreatedScene.activeCamera.metadata.pipeline.imageProcessing.contrast = this.value;
this.parentNode.childNodes[2].innerText = BABYLON.Engine.LastCreatedScene.activeCamera.metadata.pipeline.imageProcessing.contrast;
'/><span>1.0</span>""")
            # Exposure slider: writes pipeline.imageProcessing.exposure.
            exposure = gr.HTML(value="""<label for='exposure'>Exposure</label><input id='exposure' type='range' style='width:256px;height:1em;' min='0' max='2' step='0.001' oninput='
if (!BABYLON.Engine.LastCreatedScene.activeCamera.metadata) {
BABYLON.Engine.LastCreatedScene.activeCamera.metadata = {
screenshot: true,
pipeline: new BABYLON.DefaultRenderingPipeline(\"default\", true, BABYLON.Engine.LastCreatedScene, [BABYLON.Engine.LastCreatedScene.activeCamera])
}
}
BABYLON.Engine.LastCreatedScene.activeCamera.metadata.pipeline.samples = 4;
BABYLON.Engine.LastCreatedScene.activeCamera.metadata.pipeline.imageProcessing.exposure = this.value;
this.parentNode.childNodes[2].innerText = BABYLON.Engine.LastCreatedScene.activeCamera.metadata.pipeline.imageProcessing.exposure;
'/><span>1.0</span>""")
            # Screenshot link: on the next rendered frame it captures the scene
            # with CreateScreenshotUsingRenderTarget, wraps the image in an SVG
            # with a Gaussian blur filter, and draws the result into the
            # 'img_out' image and the 'cnv_out' canvas declared at the end.
            canvas = gr.HTML(value="""<a href='#' onclick='
if (!BABYLON.Engine.LastCreatedScene.activeCamera.metadata) {
BABYLON.Engine.LastCreatedScene.activeCamera.metadata = {
screenshot: true,
pipeline: new BABYLON.DefaultRenderingPipeline(\"default\", true, BABYLON.Engine.LastCreatedScene, [BABYLON.Engine.LastCreatedScene.activeCamera])
}
}
//var cntxt = document.getElementsByClassName(\"model3D\")[0].getElementsByTagName(\"canvas\")[0].getContext(\"webgl2\");
//this.innerText = cntxt;
BABYLON.Engine.LastCreatedScene.activeCamera.metadata.screenshot = true;
BABYLON.Engine.LastCreatedScene.getEngine().onEndFrameObservable.add(function() {
if (BABYLON.Engine.LastCreatedScene.activeCamera.metadata.screenshot === true) {
BABYLON.Engine.LastCreatedScene.activeCamera.metadata.screenshot = false;
try {
BABYLON.Tools.CreateScreenshotUsingRenderTarget(BABYLON.Engine.LastCreatedScene.getEngine(), BABYLON.Engine.LastCreatedScene.activeCamera,
{ precision: 1.0 }, (durl) => {
document.getElementById(\"img_out\").src = durl;
document.getElementById(\"img_out\").onload = function() {
var svgd = `<svg id=\"svg_out\" viewBox=\"0 0 ` + document.getElementById(\"img_out\").width + ` ` + document.getElementById(\"img_out\").height + `\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">
<defs>
<filter id=\"blur\" x=\"0\" y=\"0\" xmlns=\"http://www.w3.org/2000/svg\">
<feGaussianBlur in=\"SourceGraphic\" stdDeviation=\"` + BABYLON.Engine.LastCreatedScene.getNodes()[1].material.pointSize/Math.sqrt(2.0) + `\" />
</filter>
</defs>
<image filter=\"url(#blur)\" id=\"svg_img\" x=\"0\" y=\"0\" width=\"` + document.getElementById(\"img_out\").width + `\" height=\"` + document.getElementById(\"img_out\").height + `\" xlink:href=\"` + durl + `\"/>
</svg>`;
document.getElementById(\"img_out\").src = \"data:image/svg+xml;base64,\" + btoa(svgd);
document.getElementById(\"img_out\").onload = function() {
document.getElementById(\"cnv_out\").width = document.getElementById(\"img_out\").width;
document.getElementById(\"cnv_out\").height = document.getElementById(\"img_out\").height;
document.getElementById(\"cnv_out\").getContext(\"2d\").drawImage(img_out, 0, 0);
}
}
}
);
} catch(e) { alert(e); }
// https://forum.babylonjs.com/t/best-way-to-save-to-jpeg-snapshots-of-scene/17663/11
}
});
'/>webgl2</a><br/><img src='' id='img_out'/><br/>
<canvas id='cnv_out'/>""")
            render = gr.Button("Render")

    def on_submit(uploaded_video,model_type):
        """Run make_video on the uploaded file with the chosen encoder and return its result."""
        # Process the video and get the path of the output video
        output_video_path = make_video(uploaded_video,encoder=model_type)
        return output_video_path

    # Submit fills the video, archive and both galleries from make_video's
    # four return values.
    submit.click(on_submit, inputs=[input_video, model_type], outputs=[processed_video, processed_zip, output_frame, output_depth])
    # Render turns the currently selected frame/depth pair into a 3D model.
    render.click(partial(get_mesh), inputs=[output_frame, output_depth], outputs=[result])

    # Offer every file in ./examples as a clickable example; this raises at
    # import time if the directory does not exist.
    example_files = os.listdir('examples')
    example_files.sort()
    example_files = [os.path.join('examples', filename) for filename in example_files]
    examples = gr.Examples(examples=example_files, inputs=[input_video], outputs=[processed_video, processed_zip, output_frame, output_depth], fn=on_submit, cache_examples=True)
# Start the app only when the file is run as a script; queue() is enabled
# before launch().
if __name__ == '__main__':
    demo.queue().launch()