|
import torch |
|
import imageio |
|
import os |
|
import gradio as gr |
|
from diffusers.schedulers import EulerAncestralDiscreteScheduler |
|
from transformers import T5EncoderModel, T5Tokenizer |
|
from allegro.pipelines.pipeline_allegro import AllegroPipeline |
|
from allegro.models.vae.vae_allegro import AllegroAutoencoderKL3D |
|
from allegro.models.transformers.transformer_3d_allegro import AllegroTransformer3DModel |
|
|
|
from huggingface_hub import snapshot_download |
|
|
|
# Local directory that receives the Allegro checkpoint components.
weights_dir = './allegro_weights'
os.makedirs(weights_dir, exist_ok=True)

# Download only the component folders the pipeline is assembled from,
# as real files (no symlinks) directly under ``weights_dir``.
snapshot_download(
    repo_id='rhymes-ai/Allegro',
    local_dir=weights_dir,
    local_dir_use_symlinks=False,
    allow_patterns=[
        f'{component}/**'
        for component in ('scheduler', 'text_encoder', 'tokenizer', 'transformer', 'vae')
    ],
)
|
|
|
|
|
def single_inference(user_prompt, save_path, guidance_scale, num_sampling_steps, seed, enable_cpu_offload):
    """Generate one video from a text prompt with the Allegro pipeline and save it.

    The VAE, text encoder, tokenizer and transformer are loaded from the
    sub-directories of the module-level ``weights_dir``, a pipeline is built
    from them, and the generated frames are written to ``save_path``.

    Args:
        user_prompt: Free-form description of the desired video. It is
            lower-cased, stripped and inserted into a fixed prompt template.
        save_path: Destination file for the encoded video.
        guidance_scale: Forwarded to the pipeline as ``guidance_scale``.
        num_sampling_steps: Forwarded to the pipeline as
            ``num_inference_steps`` (coerced to ``int``).
        seed: Seed for the CUDA random generator (coerced to ``int``).
        enable_cpu_offload: When truthy, ``enable_sequential_cpu_offload()``
            is called on the pipeline before sampling.

    Returns:
        ``save_path``, unchanged.
    """
    dtype = torch.bfloat16

    # Resolve every component against the directory the weights were
    # downloaded to; a literal "weights_dir/..." string would not point there.
    vae_dir = os.path.join(weights_dir, "vae")
    text_encoder_dir = os.path.join(weights_dir, "text_encoder")
    tokenizer_dir = os.path.join(weights_dir, "tokenizer")
    transformer_dir = os.path.join(weights_dir, "transformer")

    # The VAE is loaded in float32 while the other models use bfloat16.
    vae = AllegroAutoencoderKL3D.from_pretrained(
        vae_dir,
        torch_dtype=torch.float32,
    ).cuda()
    vae.eval()

    text_encoder = T5EncoderModel.from_pretrained(text_encoder_dir, torch_dtype=dtype)
    text_encoder.eval()

    tokenizer = T5Tokenizer.from_pretrained(tokenizer_dir)

    scheduler = EulerAncestralDiscreteScheduler()

    transformer = AllegroTransformer3DModel.from_pretrained(
        transformer_dir,
        torch_dtype=dtype,
    ).cuda()
    transformer.eval()

    allegro_pipeline = AllegroPipeline(
        vae=vae,
        text_encoder=text_encoder,
        tokenizer=tokenizer,
        scheduler=scheduler,
        transformer=transformer,
    ).to("cuda:0")

    positive_prompt = """
(masterpiece), (best quality), (ultra-detailed), (unwatermarked),
{}
emotional, harmonious, vignette, 4k epic detailed, shot on kodak, 35mm photo,
sharp focus, high budget, cinemascope, moody, epic, gorgeous
"""

    negative_prompt = """
nsfw, lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality,
low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry.
"""

    user_prompt = positive_prompt.format(user_prompt.lower().strip())

    if enable_cpu_offload:
        allegro_pipeline.enable_sequential_cpu_offload()

    # UI widgets may deliver numeric values as floats (e.g. 42.0);
    # manual_seed() requires an integer, and a step count is integral too.
    generator = torch.Generator(device="cuda:0").manual_seed(int(seed))

    out_video = allegro_pipeline(
        user_prompt,
        negative_prompt=negative_prompt,
        num_frames=88,
        height=720,
        width=1280,
        num_inference_steps=int(num_sampling_steps),
        guidance_scale=guidance_scale,
        max_sequence_length=512,
        generator=generator,
    ).video[0]

    # A bare file name has an empty dirname, and os.makedirs("") raises,
    # so only create the directory when there is one to create.
    output_dir = os.path.dirname(save_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    imageio.mimwrite(save_path, out_video, fps=15, quality=8)

    return save_path
|
|
|
|
|
|
|
def run_inference(user_prompt, guidance_scale, num_sampling_steps, seed, enable_cpu_offload):
    """Run ``single_inference`` with a fixed output location.

    All arguments are passed through unchanged; the video is always written
    to ``./output_videos/generated_video.mp4``.

    Returns:
        The path returned by ``single_inference``.
    """
    return single_inference(
        user_prompt,
        "./output_videos/generated_video.mp4",
        guidance_scale,
        num_sampling_steps,
        seed,
        enable_cpu_offload,
    )
|
|
|
|
|
|
|
# Web UI: the input widgets are listed in the same order as the positional
# parameters of run_inference, and the returned path is shown as a video.
iface = gr.Interface(
    title="Allegro Video Generation",
    description="Generate a video based on a text prompt using the Allegro pipeline.",
    fn=run_inference,
    inputs=[
        gr.Textbox(label="User Prompt"),
        gr.Slider(label="Guidance Scale", value=7.5, minimum=0, maximum=20, step=0.1),
        gr.Slider(label="Number of Sampling Steps", value=100, minimum=10, maximum=200, step=1),
        gr.Slider(label="Random Seed", value=42, minimum=0, maximum=10000, step=1),
        gr.Checkbox(label="Enable CPU Offload", value=False),
    ],
    outputs=gr.Video(label="Generated Video"),
)

# Start the Gradio server as soon as the script is executed.
iface.launch()
|
|