# -*- coding: utf-8 -*-
"""CogVideoX-V2V-Colab

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1comfGAUJnChl5NwPuO8Ox5_6WCy4kbNN

## CogVideoX Video-to-Video

This notebook demonstrates how to run [CogVideoX-2b](https://huggingface.co/THUDM/CogVideoX-2b) and [CogVideoX-5b](https://huggingface.co/THUDM/CogVideoX-5b) with 🧨 Diffusers on a free-tier Colab GPU.

Additional resources:
- [Docs](https://huggingface.co/docs/diffusers/en/api/pipelines/cogvideox)
- [Quantization with TorchAO](https://github.com/sayakpaul/diffusers-torchao/)
- [Quantization with Quanto](https://gist.github.com/a-r-r-o-w/31be62828b00a9292821b85c1017effa)

#### Install the necessary requirements
"""

!pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121

!pip install diffusers transformers hf_transfer

!pip install git+https://github.com/huggingface/accelerate

"""#### Import required libraries

The following block is optional but if enabled, downloading models from the HF Hub will be much faster
"""

!pip install imageio-ffmpeg

import os
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

import torch
from diffusers import AutoencoderKLCogVideoX, CogVideoXVideoToVideoPipeline, CogVideoXTransformer3DModel, CogVideoXDPMScheduler
from diffusers.utils import export_to_video, load_video
from transformers import T5EncoderModel

"""#### Load models and create pipeline

Note: `bfloat16`, which is the recommended dtype for running "CogVideoX-5b", will cause OOM errors on Turing GPUs due to the lack of efficient support for it.

Therefore, we must use `float16`, which might result in poorer generation quality. The recommended solution is to use an Ampere or newer GPU, which also supports the memory-efficient quantization kernels from [TorchAO](https://github.com/pytorch/ao) :(
"""

# Models: "THUDM/CogVideoX-2b" or "THUDM/CogVideoX-5b"
model_id = "THUDM/CogVideoX-5b"

# Thank you [@camenduru](https://github.com/camenduru)!
# The reason for using checkpoints hosted by Camenduru instead of the original is because they exported
# with a max_shard_size of "5GB" when saving the model with `.save_pretrained`. The original converted
# model was saved with "10GB" as the max shard size, which causes the Colab CPU RAM to be insufficient
# leading to OOM (on the CPU)

transformer = CogVideoXTransformer3DModel.from_pretrained("camenduru/cogvideox-5b-float16", subfolder="transformer", torch_dtype=torch.float16)
text_encoder = T5EncoderModel.from_pretrained("camenduru/cogvideox-5b-float16", subfolder="text_encoder", torch_dtype=torch.float16)
vae = AutoencoderKLCogVideoX.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float16)

# Create pipeline and run inference
pipe = CogVideoXVideoToVideoPipeline.from_pretrained(
    model_id,
    text_encoder=text_encoder,
    transformer=transformer,
    vae=vae,
    torch_dtype=torch.float16,
)
pipe.scheduler = CogVideoXDPMScheduler.from_config(pipe.scheduler.config)
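
"""The note above mentions memory-efficient quantization kernels from TorchAO on Ampere or newer GPUs.
The cell below is an optional sketch of that route and is not part of the original notebook: it assumes
`torchao` is installed (`pip install torchao`) and an Ampere+ GPU. It quantizes the transformer weights
to int8 (weight-only), trading a little quality for a large reduction in VRAM.
"""

# Assumption: only enable this on an Ampere+ GPU with `torchao` installed; leave False on the free tier.
USE_TORCHAO_INT8 = False

if USE_TORCHAO_INT8:
    from torchao.quantization import quantize_, int8_weight_only
    # Replace the transformer's linear weights with int8 weight-only quantized versions, in place.
    quantize_(pipe.transformer, int8_weight_only())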

"""#### Enable memory optimizations

Note that sequential CPU offloading is necessary to run the model on Turing or older architectures. It keeps everything on the CPU and only moves the currently executing `nn.Module` to the GPU. This saves a lot of VRAM but adds significant inference overhead, making generations extremely slow (30+ minutes). Unfortunately, this is the only way to run the model on Colab until efficient kernels are supported.
"""

pipe.enable_sequential_cpu_offload()
pipe.vae.enable_tiling()
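
"""Sequential offload (above) is the right choice for the free-tier T4. On a GPU with more VRAM, a less
aggressive setup is usually much faster. The block below is a sketch of that alternative, not part of the
original notebook; it would replace the `enable_sequential_cpu_offload()` call above, so it is gated off
by default.
"""

# Assumption: set to True only on a GPU with enough VRAM, and skip the sequential offload call above.
USE_MODEL_OFFLOAD = False

if USE_MODEL_OFFLOAD:
    # Move whole sub-models (text encoder, transformer, VAE) on and off the GPU -- far less overhead
    # than per-module offloading, at the cost of higher peak VRAM.
    pipe.enable_model_cpu_offload()
    # Decode the latents in slices to further reduce peak VRAM during decoding.
    pipe.vae.enable_slicing()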

"""#### Generate!"""

input_video = load_video(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/hiker.mp4"
)
prompt = (
    "An astronaut stands triumphantly at the peak of a towering mountain. Panorama of rugged peaks and "
    "valleys. Very futuristic vibe and animated aesthetic. Highlights of purple and golden colors in "
    "the scene. The sky is looks like an animated/cartoonish dream of galaxies, nebulae, stars, planets, "
    "moons, but the remainder of the scene is mostly realistic."
)

video = pipe(
    video=input_video,
    prompt=prompt,
    strength=0.7,
    guidance_scale=6,
    use_dynamic_cfg=True,
    num_inference_steps=50,
).frames[0]
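
"""Optional check, not part of the original notebook: report the peak VRAM PyTorch allocated during the
generation above. With sequential CPU offload and VAE tiling this should fit comfortably within the
free-tier T4's 16 GB.
"""

if torch.cuda.is_available():
    # Peak memory allocated by PyTorch on the current device since the start of the process.
    peak_gb = torch.cuda.max_memory_allocated() / 1024**3
    print(f"Peak VRAM allocated: {peak_gb:.2f} GB")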

export_to_video(video, "output.mp4", fps=8)

export_to_video(input_video, "input.mp4", fps=8)

from IPython.display import display, Video
display(Video("input.mp4", embed=True))
display(Video("output.mp4", embed=True))