CogVideoX-5B-Space

Running

App Files Files Community

CogVideoX-5B-Space / cogvideox_v2v_colab.py

svjack

Upload 10 files

ef41b32 verified about 2 months ago

raw

history blame contribute delete

4.47 kB

	# -*- coding: utf-8 -*-
	"""CogVideoX-V2V-Colab

	Automatically generated by Colab.

	Original file is located at
	https://colab.research.google.com/drive/1comfGAUJnChl5NwPuO8Ox5_6WCy4kbNN

	## CogVideoX Video-to-Video

	This notebook demonstrates how to run [CogVideoX-2b](https://huggingface.co/THUDM/CogVideoX-2b) and [CogVideoX-5b](https://huggingface.co/THUDM/CogVideoX-5b) with 🧨 Diffusers on a free-tier Colab GPU.

	Additional resources:
	- [Docs](https://huggingface.co/docs/diffusers/en/api/pipelines/cogvideox)
	- [Quantization with TorchAO](https://github.com/sayakpaul/diffusers-torchao/)
	- [Quantization with Quanto](https://gist.github.com/a-r-r-o-w/31be62828b00a9292821b85c1017effa)

	#### Install the necessary requirements
	"""

	!pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121

	!pip install diffusers transformers hf_transfer

	!pip install git+https://github.com/huggingface/accelerate

	"""#### Import required libraries

	The following block is optional, but if enabled, downloading models from the HF Hub will be much faster.
	"""

	!pip install imageio-ffmpeg

	import os

	# Opt in to the hf_transfer download backend for Hub downloads. This is set
	# ahead of the diffusers/transformers imports that follow.
	os.environ.update(HF_HUB_ENABLE_HF_TRANSFER="1")

	import torch
	from diffusers import AutoencoderKLCogVideoX, CogVideoXVideoToVideoPipeline, CogVideoXTransformer3DModel, CogVideoXDPMScheduler
	from diffusers.utils import export_to_video, load_video
	from transformers import T5EncoderModel

	"""#### Load models and create pipeline

	Note: `bfloat16`, which is the recommended dtype for running "CogVideoX-5b", will cause OOM errors due to lack of efficient support on Turing GPUs.

	Therefore, we must use `float16`, which might result in poorer generation quality. The recommended solution is to use Ampere or above GPUs, which also support memory efficient quantization kernels from [TorchAO](https://github.com/pytorch/ao) :(
	"""

	# Base repository to build the pipeline from: "THUDM/CogVideoX-2b" or "THUDM/CogVideoX-5b".
	model_id = "THUDM/CogVideoX-5b"

	# Credit to [@camenduru](https://github.com/camenduru) for the re-exported weights.
	# The transformer and text encoder are pulled from Camenduru's repository rather than the
	# original one because that export was written by `.save_pretrained` with a max_shard_size
	# of "5GB". The original conversion used "10GB" shards, which is more than the Colab CPU RAM
	# can take and ends in an OOM (on the CPU).

	transformer = CogVideoXTransformer3DModel.from_pretrained(
	    "camenduru/cogvideox-5b-float16",
	    subfolder="transformer",
	    torch_dtype=torch.float16,
	)
	text_encoder = T5EncoderModel.from_pretrained(
	    "camenduru/cogvideox-5b-float16",
	    subfolder="text_encoder",
	    torch_dtype=torch.float16,
	)
	# The VAE is loaded from the base repository selected by `model_id`.
	vae = AutoencoderKLCogVideoX.from_pretrained(
	    model_id,
	    subfolder="vae",
	    torch_dtype=torch.float16,
	)

	# Assemble the video-to-video pipeline around the components loaded above.
	pipe = CogVideoXVideoToVideoPipeline.from_pretrained(
	    model_id,
	    text_encoder=text_encoder,
	    transformer=transformer,
	    vae=vae,
	    torch_dtype=torch.float16,
	)

	# Replace the pipeline's scheduler with the DPM variant, built from the existing scheduler's config.
	pipe.scheduler = CogVideoXDPMScheduler.from_config(pipe.scheduler.config)

	"""#### Enable memory optimizations

	Note that sequential cpu offloading is necessary for being able to run the model on Turing or lower architectures. It aggressively maintains everything on the CPU and only moves the currently executing nn.Module to the GPU. This saves a lot of VRAM but adds a lot of overhead for inference, making generations extremely slow (30 minutes+). Unfortunately, this is the only solution for running the model on Colab until efficient kernels are supported.
	"""

	# Sequential CPU offload: everything stays on the CPU and only the module that is
	# currently executing is moved to the GPU. This saves a lot of VRAM but makes
	# inference much slower.
	pipe.enable_sequential_cpu_offload()
	# Also turn on tiling for the VAE as part of the memory optimizations.
	pipe.vae.enable_tiling()

	"""#### Generate!"""

	# Source clip that the video-to-video pipeline starts from.
	input_video = load_video(
	    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/hiker.mp4"
	)

	# Text prompt describing the target look; the pieces are joined without any separator.
	prompt = "".join(
	    (
	        "An astronaut stands triumphantly at the peak of a towering mountain. Panorama of rugged peaks and ",
	        "valleys. Very futuristic vibe and animated aesthetic. Highlights of purple and golden colors in ",
	        "the scene. The sky is looks like an animated/cartoonish dream of galaxies, nebulae, stars, planets, ",
	        "moons, but the remainder of the scene is mostly realistic.",
	    )
	)

	# Run the pipeline and keep the first entry of the returned `frames`.
	video = pipe(
	    video=input_video,
	    prompt=prompt,
	    strength=0.7,
	    guidance_scale=6,
	    use_dynamic_cfg=True,
	    num_inference_steps=50,
	).frames[0]

	# Write the generated clip and the source clip to disk at the same frame rate.
	export_to_video(video, "output.mp4", fps=8)
	export_to_video(input_video, "input.mp4", fps=8)

	# Show both clips inline in the notebook, source first.
	from IPython.display import Video, display
	display(Video("input.mp4", embed=True))
	display(Video("output.mp4", embed=True))