import argparse
import gc
import os

import numpy as np
import torch
from diffusers.training_utils import set_seed

from depthcrafter.depth_crafter_ppl import DepthCrafterPipeline
from depthcrafter.unet import DiffusersUNetSpatioTemporalConditionModelDepthCrafter
from depthcrafter.utils import read_video_frames, save_video, vis_sequence_depth


class DepthCrafterDemo:
    """Thin wrapper around the DepthCrafter pipeline for video depth estimation.

    Loads the fine-tuned DepthCrafter UNet together with the remaining
    Stable-Video-Diffusion components, optionally offloads them to CPU to
    save GPU memory, and exposes an `infer` entry point that turns a video
    file into a normalized depth-map video (plus visualization).
    """

    def __init__(
        self,
        unet_path: str,
        pre_train_path: str,
        cpu_offload: str = "model",
    ):
        """
        Args:
            unet_path: HF repo id or local path holding the DepthCrafter UNet
                weights (loaded from its ``unet`` subfolder).
            pre_train_path: HF repo id or local path of the base SVD checkpoint
                supplying the remaining pipeline components.
            cpu_offload: ``"model"`` (per-module offload), ``"sequential"``
                (slowest, lowest memory), or ``None`` to keep everything on
                the GPU.

        Raises:
            ValueError: if ``cpu_offload`` is an unknown option.
        """
        unet = DiffusersUNetSpatioTemporalConditionModelDepthCrafter.from_pretrained(
            unet_path,
            subfolder="unet",
            low_cpu_mem_usage=True,
            torch_dtype=torch.float16,
        )
        # Load the weights of the other components from the base checkpoint.
        self.pipe = DepthCrafterPipeline.from_pretrained(
            pre_train_path,
            unet=unet,
            torch_dtype=torch.float16,
            variant="fp16",
        )

        # For saving memory, we can offload the model to CPU, or even run the
        # model sequentially to save even more memory.
        if cpu_offload is None:
            self.pipe.to("cuda")
        elif cpu_offload == "sequential":
            # This will be slow, but saves the most memory.
            self.pipe.enable_sequential_cpu_offload()
        elif cpu_offload == "model":
            self.pipe.enable_model_cpu_offload()
        else:
            raise ValueError(f"Unknown cpu offload option: {cpu_offload}")

        # xformers memory-efficient attention is optional; fall back
        # gracefully when it is not installed or not supported.
        try:
            self.pipe.enable_xformers_memory_efficient_attention()
        except Exception as e:
            print(e)
            print("Xformers is not enabled")
        self.pipe.enable_attention_slicing()

    def infer(
        self,
        video: str,
        num_denoising_steps: int,
        guidance_scale: float,
        save_folder: str = "./demo_output",
        window_size: int = 110,
        process_length: int = 195,
        overlap: int = 25,
        max_res: int = 1024,
        target_fps: int = 15,
        seed: int = 42,
        track_time: bool = True,
        save_npz: bool = False,
    ):
        """Run depth estimation on one video and save the results.

        Args:
            video: path to the input video file.
            num_denoising_steps: diffusion inference steps (quality vs speed).
            guidance_scale: classifier-free guidance scale.
            save_folder: output directory (created if missing).
            window_size: frames per sliding inference window.
            process_length: maximum number of frames to process.
            overlap: overlapping frames between consecutive windows.
            max_res: cap on the longer video side, for memory.
            target_fps: fps for the saved videos (-1 keeps the original fps).
            seed: random seed for reproducibility.
            track_time: whether the pipeline reports per-stage timings.
            save_npz: also save the raw depth array as a compressed ``.npz``.

        Returns:
            ``[input_path, vis_path, depth_path]`` of the saved mp4 files.
        """
        set_seed(seed)

        frames, target_fps = read_video_frames(
            video, process_length, target_fps, max_res
        )
        print(f"==> video name: {video}, frames shape: {frames.shape}")

        # Inference the depth map using the DepthCrafter pipeline.
        with torch.inference_mode():
            res = self.pipe(
                frames,
                height=frames.shape[1],
                width=frames.shape[2],
                output_type="np",
                guidance_scale=guidance_scale,
                num_inference_steps=num_denoising_steps,
                window_size=window_size,
                overlap=overlap,
                track_time=track_time,
            ).frames[0]

        # Convert the three-channel output to a single-channel depth map.
        res = res.sum(-1) / res.shape[-1]
        # Normalize the depth map to [0, 1] across the whole video; the tiny
        # epsilon guards against a zero division on a constant-depth output.
        res_min, res_max = res.min(), res.max()
        res = (res - res_min) / max(res_max - res_min, 1e-8)
        # Visualize the depth map and save the results.
        vis = vis_sequence_depth(res)

        # Save the depth map and visualization with the target FPS.
        os.makedirs(save_folder, exist_ok=True)
        save_path = os.path.join(
            save_folder, os.path.splitext(os.path.basename(video))[0]
        )
        if save_npz:
            np.savez_compressed(save_path + ".npz", depth=res)
        save_video(res, save_path + "_depth.mp4", fps=target_fps)
        save_video(vis, save_path + "_vis.mp4", fps=target_fps)
        save_video(frames, save_path + "_input.mp4", fps=target_fps)

        return [
            save_path + "_input.mp4",
            save_path + "_vis.mp4",
            save_path + "_depth.mp4",
        ]

    def run(
        self,
        input_video,
        num_denoising_steps,
        guidance_scale,
        max_res=1024,
        process_length=195,
    ):
        """Convenience entry point: infer, then free GPU memory.

        Returns:
            The input and visualization video paths (first two of ``infer``).
        """
        res_path = self.infer(
            input_video,
            num_denoising_steps,
            guidance_scale,
            max_res=max_res,
            process_length=process_length,
        )
        # Clear the cache for the next video.
        gc.collect()
        torch.cuda.empty_cache()
        return res_path[:2]


def _str2bool(value) -> bool:
    """Parse a boolean-like CLI string into a real bool.

    ``argparse`` with ``type=bool`` coerces every non-empty string (including
    "False" and "0") to ``True``, silently ignoring the user's choice — so an
    explicit parser is required.

    Raises:
        argparse.ArgumentTypeError: if the value is not a recognized boolean.
    """
    if isinstance(value, bool):
        return value
    lowered = str(value).strip().lower()
    if lowered in ("yes", "true", "t", "y", "1"):
        return True
    if lowered in ("no", "false", "f", "n", "0"):
        return False
    raise argparse.ArgumentTypeError(f"Boolean value expected, got {value!r}")


if __name__ == "__main__":
    # Running configs:
    # - the most important arguments for memory saving are `cpu_offload`,
    #   `max_res`, and `window_size`
    # - the most important arguments for the quality/speed trade-off are
    #   `num_inference_steps`, `guidance_scale`, and `max_res`
    parser = argparse.ArgumentParser(description="DepthCrafter")
    parser.add_argument(
        "--video-path", type=str, required=True, help="Path to the input video file(s)"
    )
    parser.add_argument(
        "--save-folder",
        type=str,
        default="./demo_output",
        help="Folder to save the output",
    )
    parser.add_argument(
        "--unet-path",
        type=str,
        default="tencent/DepthCrafter",
        help="Path to the UNet model",
    )
    parser.add_argument(
        "--pre-train-path",
        type=str,
        default="stabilityai/stable-video-diffusion-img2vid-xt",
        help="Path to the pre-trained model",
    )
    parser.add_argument(
        "--process-length", type=int, default=195, help="Number of frames to process"
    )
    # "none" keeps the whole pipeline on the GPU; a literal Python None in
    # `choices` would be unreachable from the command line.
    parser.add_argument(
        "--cpu-offload",
        type=str,
        default="model",
        choices=["model", "sequential", "none"],
        help="CPU offload option ('none' keeps the pipeline on the GPU)",
    )
    parser.add_argument(
        "--target-fps", type=int, default=15, help="Target FPS for the output video"
    )  # -1 for original fps
    parser.add_argument("--seed", type=int, default=42, help="Random seed")
    parser.add_argument(
        "--num-inference-steps", type=int, default=25, help="Number of inference steps"
    )
    parser.add_argument(
        "--guidance-scale", type=float, default=1.2, help="Guidance scale"
    )
    parser.add_argument("--window-size", type=int, default=110, help="Window size")
    parser.add_argument("--overlap", type=int, default=25, help="Overlap size")
    parser.add_argument("--max-res", type=int, default=1024, help="Maximum resolution")
    # _str2bool (not type=bool) so that `--save_npz False` actually disables it
    # while keeping the `--flag True/False` CLI shape backward compatible.
    parser.add_argument(
        "--save_npz", type=_str2bool, default=True, help="Save npz file"
    )
    parser.add_argument(
        "--track_time", type=_str2bool, default=False, help="Track time"
    )
    args = parser.parse_args()

    depthcrafter_demo = DepthCrafterDemo(
        unet_path=args.unet_path,
        pre_train_path=args.pre_train_path,
        cpu_offload=None if args.cpu_offload == "none" else args.cpu_offload,
    )
    # Process the videos; the video paths are separated by commas.
    video_paths = args.video_path.split(",")
    for video in video_paths:
        depthcrafter_demo.infer(
            video,
            args.num_inference_steps,
            args.guidance_scale,
            save_folder=args.save_folder,
            window_size=args.window_size,
            process_length=args.process_length,
            overlap=args.overlap,
            max_res=args.max_res,
            target_fps=args.target_fps,
            seed=args.seed,
            track_time=args.track_time,
            save_npz=args.save_npz,
        )
        # Clear the cache for the next video.
        gc.collect()
        torch.cuda.empty_cache()