# -*- coding: utf-8 -*-
"""CogVideoX-T2V-Colab.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1pCe5s0bC_xuXbBlpvIH1z0kfdTLQPzCS
## CogVideoX Text-to-Video
This notebook demonstrates how to run [CogVideoX-2b](https://huggingface.co/THUDM/CogVideoX-2b) and [CogVideoX-5b](https://huggingface.co/THUDM/CogVideoX-5b) with 🧨 Diffusers on a free-tier Colab GPU.
Additional resources:
- [Docs](https://huggingface.co/docs/diffusers/en/api/pipelines/cogvideox)
- [Quantization with TorchAO](https://github.com/sayakpaul/diffusers-torchao/)
- [Quantization with Quanto](https://gist.github.com/a-r-r-o-w/31be62828b00a9292821b85c1017effa)
Note: if you randomly get an OOM error for whatever reason, try the notebook on a Kaggle T4 instance instead. I've found that the Colab free-tier T4 can be unreliable at times: sometimes the notebook runs smoothly, but other times it crashes with an error 🤷🏻‍♂️
#### Install the necessary requirements
"""
!pip install diffusers transformers hf_transfer
# !pip install git+https://github.com/huggingface/accelerate
!pip install accelerate==0.33.0
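# Optional sanity check (an addition, not in the original notebook): confirm the
# installed versions and that a GPU is visible before loading the large checkpoints.
import diffusers, transformers, accelerate, torch
print(diffusers.__version__, transformers.__version__, accelerate.__version__)
print("CUDA available:", torch.cuda.is_available())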
"""#### Import required libraries
The following block is optional but if enabled, downloading models from the HF Hub will be much faster
"""
import os
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
import torch
from diffusers import AutoencoderKLCogVideoX, CogVideoXPipeline, CogVideoXTransformer3DModel
from diffusers.utils import export_to_video
from transformers import T5EncoderModel
"""#### Load models and create pipeline
Note: `bfloat16`, which is the recommended dtype for running "CogVideoX-5b" will cause OOM errors due to lack of efficient support on Turing GPUs.
Therefore, we must use `float16`, which might result in poorer generation quality. The recommended solution is to use Ampere or above GPUs, which also support efficient quantization kernels from [TorchAO](https://github.com/pytorch/ao) :(
"""
# Models: "THUDM/CogVideoX-2b" or "THUDM/CogVideoX-5b"
model_id = "THUDM/CogVideoX-5b"
# Thank you [@camenduru](https://github.com/camenduru)!
# We use checkpoints hosted by Camenduru instead of the originals because they were exported
# with a max_shard_size of "5GB" when saving the model with `.save_pretrained`. The original
# converted model was saved with a "10GB" max shard size, which exhausts Colab's CPU RAM and
# leads to an OOM (on the CPU).
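# For reference, a sketch (an assumption, not from the original notebook) of how such a
# resharded checkpoint could be produced on a machine with enough RAM; the output path
# is illustrative:
#   model = CogVideoXTransformer3DModel.from_pretrained(model_id, subfolder="transformer", torch_dtype=torch.float16)
#   model.save_pretrained("cogvideox-5b-float16/transformer", max_shard_size="5GB")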
transformer = CogVideoXTransformer3DModel.from_pretrained("camenduru/cogvideox-5b-float16", subfolder="transformer", torch_dtype=torch.float16)
text_encoder = T5EncoderModel.from_pretrained("camenduru/cogvideox-5b-float16", subfolder="text_encoder", torch_dtype=torch.float16)
# The VAE is small enough to load from the original repo without hitting the shard-size issue
vae = AutoencoderKLCogVideoX.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float16)
# Create the pipeline (inference runs in the "Generate!" cell below)
pipe = CogVideoXPipeline.from_pretrained(
    model_id,
    text_encoder=text_encoder,
    transformer=transformer,
    vae=vae,
    torch_dtype=torch.float16,
)
"""#### Enable memory optimizations
Note that sequential cpu offloading is necessary for being able to run the model on Turing or lower architectures. It aggressively maintains everything on the CPU and only moves the currently executing nn.Module to the GPU. This saves a lot of VRAM but adds a lot of overhead for inference, making generations extremely slow (1 hour+). Unfortunately, this is the only solution for running the model on Colab until efficient kernels are supported.
"""
pipe.enable_sequential_cpu_offload()
# Optionally enable VAE tiling to further reduce VRAM usage during decoding:
# pipe.vae.enable_tiling()
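# Optional sketch (an addition, not in the original notebook): reset the CUDA
# peak-memory counter here; after generation, torch.cuda.max_memory_allocated()
# reports the actual peak VRAM, which should stay well within the free-tier
# T4's ~15 GB with sequential offloading enabled.
if torch.cuda.is_available():
    torch.cuda.reset_peak_memory_stats()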
"""#### Generate!"""
prompt = (
    "A panda, dressed in a small, red jacket and a tiny hat, sits on a wooden stool in a serene bamboo forest. "
    "The panda's fluffy paws strum a miniature acoustic guitar, producing soft, melodic tunes. Nearby, a few other "
    "pandas gather, watching curiously and some clapping in rhythm. Sunlight filters through the tall bamboo, "
    "casting a gentle glow on the scene. The panda's face is expressive, showing concentration and joy as it plays. "
    "The background includes a small, flowing stream and vibrant green foliage, enhancing the peaceful and magical "
    "atmosphere of this unique musical performance."
)
# 50 denoising steps with dynamic classifier-free guidance at scale 6;
# `.frames[0]` selects the first (and only) video in the batch
video = pipe(prompt=prompt, guidance_scale=6, use_dynamic_cfg=True, num_inference_steps=50).frames[0]
export_to_video(video, "output.mp4", fps=8)
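# Optional additions (sketches, not part of the original notebook): report the
# peak VRAM from the counter above and preview the clip inline in Colab.
if torch.cuda.is_available():
    print(f"Peak VRAM: {torch.cuda.max_memory_allocated() / 1024**3:.2f} GB")
from IPython.display import Video, display
display(Video("output.mp4", embed=True))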