import argparse
import gc
import os

import numpy as np
import torch
from diffusers.training_utils import set_seed

from depthcrafter.depth_crafter_ppl import DepthCrafterPipeline
from depthcrafter.unet import DiffusersUNetSpatioTemporalConditionModelDepthCrafter
from depthcrafter.utils import vis_sequence_depth, save_video, read_video_frames
class DepthCrafterDemo:
    def __init__(
        self,
        unet_path: str,
        pre_train_path: str,
        cpu_offload: str = "model",
    ):
        unet = DiffusersUNetSpatioTemporalConditionModelDepthCrafter.from_pretrained(
            unet_path,
            subfolder="unet",
            low_cpu_mem_usage=True,
            torch_dtype=torch.float16,
        )
        # load the weights of the other components from the provided checkpoint
        self.pipe = DepthCrafterPipeline.from_pretrained(
            pre_train_path,
            unet=unet,
            torch_dtype=torch.float16,
            variant="fp16",
        )
        # to save memory, offload the model to the CPU, or even run it
        # sequentially to save even more
        if cpu_offload is not None:
            if cpu_offload == "sequential":
                # slower, but saves more memory
                self.pipe.enable_sequential_cpu_offload()
            elif cpu_offload == "model":
                self.pipe.enable_model_cpu_offload()
            else:
                raise ValueError(f"Unknown cpu offload option: {cpu_offload}")
        else:
            self.pipe.to("cuda")
        # enable xformers memory-efficient attention if available, plus attention slicing
        try:
            self.pipe.enable_xformers_memory_efficient_attention()
        except Exception as e:
            print(e)
            print("Xformers is not enabled")
        self.pipe.enable_attention_slicing()
    def infer(
        self,
        video: str,
        num_denoising_steps: int,
        guidance_scale: float,
        save_folder: str = "./demo_output",
        window_size: int = 110,
        process_length: int = 195,
        overlap: int = 25,
        max_res: int = 1024,
        target_fps: int = 15,
        seed: int = 42,
        track_time: bool = True,
        save_npz: bool = False,
    ):
        set_seed(seed)

        frames, target_fps = read_video_frames(
            video, process_length, target_fps, max_res
        )
        print(f"==> video name: {video}, frames shape: {frames.shape}")

        # infer the depth maps with the DepthCrafter pipeline
        with torch.inference_mode():
            res = self.pipe(
                frames,
                height=frames.shape[1],
                width=frames.shape[2],
                output_type="np",
                guidance_scale=guidance_scale,
                num_inference_steps=num_denoising_steps,
                window_size=window_size,
                overlap=overlap,
                track_time=track_time,
            ).frames[0]
        # convert the three-channel output to a single-channel depth map
        # by averaging the channels
        res = res.sum(-1) / res.shape[-1]
        # normalize the depth map to [0, 1] across the whole video
        res = (res - res.min()) / (res.max() - res.min())
        # visualize the depth map and save the results
        vis = vis_sequence_depth(res)
        # save the depth map and visualization at the target FPS
        save_path = os.path.join(
            save_folder, os.path.splitext(os.path.basename(video))[0]
        )
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        if save_npz:
            np.savez_compressed(save_path + ".npz", depth=res)
        save_video(res, save_path + "_depth.mp4", fps=target_fps)
        save_video(vis, save_path + "_vis.mp4", fps=target_fps)
        save_video(frames, save_path + "_input.mp4", fps=target_fps)
        return [
            save_path + "_input.mp4",
            save_path + "_vis.mp4",
            save_path + "_depth.mp4",
        ]
    def run(
        self,
        input_video,
        num_denoising_steps,
        guidance_scale,
        max_res=1024,
        process_length=195,
    ):
        res_path = self.infer(
            input_video,
            num_denoising_steps,
            guidance_scale,
            max_res=max_res,
            process_length=process_length,
        )
        # clear the cache for the next video
        gc.collect()
        torch.cuda.empty_cache()
        # return only the input and visualization videos
        return res_path[:2]
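
# A minimal usage sketch (the example video path is hypothetical; the checkpoint
# paths are this script's defaults): construct the demo once and reuse it across
# videos, since loading the UNet and pipeline weights dominates start-up time.
#
#   demo = DepthCrafterDemo(
#       unet_path="tencent/DepthCrafter",
#       pre_train_path="stabilityai/stable-video-diffusion-img2vid-xt",
#   )
#   input_mp4, vis_mp4 = demo.run(
#       "./examples/example_01.mp4", num_denoising_steps=25, guidance_scale=1.2
#   )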
if __name__ == "__main__":
    # running configs
    # the most important arguments for memory saving are `cpu_offload`, `max_res`,
    # and `window_size`; the most important arguments for the quality/speed
    # trade-off are `num_inference_steps`, `guidance_scale`, and `max_res`
    parser = argparse.ArgumentParser(description="DepthCrafter")
    parser.add_argument(
        "--video-path",
        type=str,
        required=True,
        help="Path(s) to the input video file(s), separated by commas",
    )
    parser.add_argument(
        "--save-folder",
        type=str,
        default="./demo_output",
        help="Folder to save the output",
    )
    parser.add_argument(
        "--unet-path",
        type=str,
        default="tencent/DepthCrafter",
        help="Path to the UNet model",
    )
    parser.add_argument(
        "--pre-train-path",
        type=str,
        default="stabilityai/stable-video-diffusion-img2vid-xt",
        help="Path to the pre-trained model",
    )
    parser.add_argument(
        "--process-length", type=int, default=195, help="Number of frames to process"
    )
    parser.add_argument(
        "--cpu-offload",
        type=str,
        default="model",
        choices=["model", "sequential", "none"],
        help="CPU offload option; 'none' keeps the whole pipeline on the GPU",
    )
    parser.add_argument(
        "--target-fps",
        type=int,
        default=15,
        help="Target FPS for the output video (-1 keeps the original FPS)",
    )
    parser.add_argument("--seed", type=int, default=42, help="Random seed")
    parser.add_argument(
        "--num-inference-steps", type=int, default=25, help="Number of inference steps"
    )
    parser.add_argument(
        "--guidance-scale", type=float, default=1.2, help="Guidance scale"
    )
    parser.add_argument("--window-size", type=int, default=110, help="Window size")
    parser.add_argument("--overlap", type=int, default=25, help="Overlap size")
    parser.add_argument("--max-res", type=int, default=1024, help="Maximum resolution")
    # NOTE: argparse's `type=bool` treats any non-empty string as True, so use
    # explicit boolean flags instead (BooleanOptionalAction requires Python 3.9+)
    parser.add_argument(
        "--save-npz",
        action=argparse.BooleanOptionalAction,
        default=True,
        help="Save the depth maps as a compressed .npz file",
    )
    parser.add_argument(
        "--track-time",
        action=argparse.BooleanOptionalAction,
        default=False,
        help="Track the inference time",
    )

    args = parser.parse_args()
    depthcrafter_demo = DepthCrafterDemo(
        unet_path=args.unet_path,
        pre_train_path=args.pre_train_path,
        cpu_offload=None if args.cpu_offload == "none" else args.cpu_offload,
    )
    # process the videos; multiple paths are separated by commas
    video_paths = args.video_path.split(",")
    for video in video_paths:
        depthcrafter_demo.infer(
            video,
            args.num_inference_steps,
            args.guidance_scale,
            save_folder=args.save_folder,
            window_size=args.window_size,
            process_length=args.process_length,
            overlap=args.overlap,
            max_res=args.max_res,
            target_fps=args.target_fps,
            seed=args.seed,
            track_time=args.track_time,
            save_npz=args.save_npz,
        )
        # clear the cache before the next video
        gc.collect()
        torch.cuda.empty_cache()
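
# Example invocation (hypothetical script name and video path), assuming the
# default checkpoints can be fetched from the Hugging Face Hub:
#   python run.py --video-path ./examples/example_01.mp4 \
#       --save-folder ./demo_output --num-inference-steps 25 \
#       --guidance-scale 1.2 --cpu-offload model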