Spaces:

fffiloni
/

allegro-text2video

Running

File size: 9,792 Bytes

import torch
import imageio
import os
import gradio as gr
from diffusers.schedulers import EulerAncestralDiscreteScheduler
from transformers import T5EncoderModel, T5Tokenizer
from allegro.pipelines.pipeline_allegro import AllegroPipeline
from allegro.models.vae.vae_allegro import AllegroAutoencoderKL3D
from allegro.models.transformers.transformer_3d_allegro import AllegroTransformer3DModel

from huggingface_hub import snapshot_download

weights_dir = './allegro_weights'
os.makedirs(weights_dir, exist_ok=True)

is_shared_ui = True if "fffiloni/allegro-t2v" in os.environ['SPACE_ID'] else False
is_gpu_associated = torch.cuda.is_available()

if not is_shared_ui:
    snapshot_download(
        repo_id='rhymes-ai/Allegro',
        allow_patterns=[
            'scheduler/**',
            'text_encoder/**',
            'tokenizer/**',
            'transformer/**',
            'vae/**',
        ],
        local_dir=weights_dir,
    )


def single_inference(user_prompt, save_path, guidance_scale, num_sampling_steps, seed, enable_cpu_offload):
    dtype = torch.bfloat16

    # Load models
    vae = AllegroAutoencoderKL3D.from_pretrained(
        "./allegro_weights/vae/", 
        torch_dtype=torch.float32
    ).cuda()
    vae.eval()

    text_encoder = T5EncoderModel.from_pretrained("./allegro_weights/text_encoder/", torch_dtype=dtype)
    text_encoder.eval()

    tokenizer = T5Tokenizer.from_pretrained("./allegro_weights/tokenizer/")

    scheduler = EulerAncestralDiscreteScheduler()

    transformer = AllegroTransformer3DModel.from_pretrained("./allegro_weights/transformer/", torch_dtype=dtype).cuda()
    transformer.eval()

    allegro_pipeline = AllegroPipeline(
        vae=vae,
        text_encoder=text_encoder,
        tokenizer=tokenizer,
        scheduler=scheduler,
        transformer=transformer
    ).to("cuda:0")

    positive_prompt = """
    (masterpiece), (best quality), (ultra-detailed), (unwatermarked), 
    {} 
    emotional, harmonious, vignette, 4k epic detailed, shot on kodak, 35mm photo, 
    sharp focus, high budget, cinemascope, moody, epic, gorgeous
    """

    negative_prompt = """
    nsfw, lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, 
    low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry.
    """

    # Process user prompt
    user_prompt = positive_prompt.format(user_prompt.lower().strip())

    if enable_cpu_offload:
        allegro_pipeline.enable_sequential_cpu_offload()

    out_video = allegro_pipeline(
        user_prompt, 
        negative_prompt=negative_prompt, 
        num_frames=88,
        height=720,
        width=1280,
        num_inference_steps=num_sampling_steps,
        guidance_scale=guidance_scale,
        max_sequence_length=512,
        generator=torch.Generator(device="cuda:0").manual_seed(seed)
    ).video[0]

    # Save video
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    imageio.mimwrite(save_path, out_video, fps=15, quality=8)

    return save_path


# Gradio interface function
def run_inference(user_prompt, guidance_scale, num_sampling_steps, seed, enable_cpu_offload, progress=gr.Progress(track_tqdm=True)):
    save_path = "./output_videos/generated_video.mp4"
    result_path = single_inference(user_prompt, save_path, guidance_scale, num_sampling_steps, seed, enable_cpu_offload)
    return result_path

css="""
#upl-dataset-group {background-color: none!important;}
div#warning-ready {
    background-color: #ecfdf5;
    padding: 0 16px 16px;
    margin: 20px 0;
}
div#warning-ready > .gr-prose > h2, div#warning-ready > .gr-prose > p {
    color: #057857!important;
}
div#warning-duplicate {
    background-color: #ebf5ff;
    padding: 0 16px 16px;
    margin: 20px 0;
}
div#warning-duplicate > .gr-prose > h2, div#warning-duplicate > .gr-prose > p {
    color: #0f4592!important;
}
div#warning-duplicate strong {
    color: #0f4592;
}
p.actions {
    display: flex;
    align-items: center;
    margin: 20px 0;
}
div#warning-duplicate .actions a {
    display: inline-block;
    margin-right: 10px;
}
div#warning-setgpu {
    background-color: #fff4eb;
    padding: 0 16px 16px;
    margin: 20px 0;
}
div#warning-setgpu > .gr-prose > h2, div#warning-setgpu > .gr-prose > p {
    color: #92220f!important;
}
div#warning-setgpu a, div#warning-setgpu b {
    color: #91230f;
}
div#warning-setgpu p.actions > a {
    display: inline-block;
    background: #1f1f23;
    border-radius: 40px;
    padding: 6px 24px;
    color: antiquewhite;
    text-decoration: none;
    font-weight: 600;
    font-size: 1.2em;
}
"""

# Create Gradio interface
with gr.Blocks(css=css) as demo:
    with gr.Column():
        gr.Markdown("# Allegro Video Generation")
        gr.Markdown("Generate a video based on a text prompt using the Allegro pipeline.")
        with gr.Row():
            with gr.Column():
                user_prompt=gr.Textbox(label="User Prompt")
                with gr.Row():
                    guidance_scale=gr.Slider(minimum=0, maximum=20, step=0.1, label="Guidance Scale", value=7.5)
                    num_sampling_steps=gr.Slider(minimum=10, maximum=100, step=1, label="Number of Sampling Steps", value=20)
                with gr.Row():
                    seed=gr.Slider(minimum=0, maximum=10000, step=1, label="Random Seed", value=42)
                    enable_cpu_offload=gr.Checkbox(label="Enable CPU Offload", value=False, scale=1)
                if is_shared_ui:
                    top_description = gr.HTML(f'''
                        <div class="gr-prose">
                            <h2><svg xmlns="http://www.w3.org/2000/svg" width="18px" height="18px" style="margin-right: 0px;display: inline-block;"fill="none"><path fill="#fff" d="M7 13.2a6.3 6.3 0 0 0 4.4-10.7A6.3 6.3 0 0 0 .6 6.9 6.3 6.3 0 0 0 7 13.2Z"/><path fill="#fff" fill-rule="evenodd" d="M7 0a6.9 6.9 0 0 1 4.8 11.8A6.9 6.9 0 0 1 0 7 6.9 6.9 0 0 1 7 0Zm0 0v.7V0ZM0 7h.6H0Zm7 6.8v-.6.6ZM13.7 7h-.6.6ZM9.1 1.7c-.7-.3-1.4-.4-2.2-.4a5.6 5.6 0 0 0-4 1.6 5.6 5.6 0 0 0-1.6 4 5.6 5.6 0 0 0 1.6 4 5.6 5.6 0 0 0 4 1.7 5.6 5.6 0 0 0 4-1.7 5.6 5.6 0 0 0 1.7-4 5.6 5.6 0 0 0-1.7-4c-.5-.5-1.1-.9-1.8-1.2Z" clip-rule="evenodd"/><path fill="#000" fill-rule="evenodd" d="M7 2.9a.8.8 0 1 1 0 1.5A.8.8 0 0 1 7 3ZM5.8 5.7c0-.4.3-.6.6-.6h.7c.3 0 .6.2.6.6v3.7h.5a.6.6 0 0 1 0 1.3H6a.6.6 0 0 1 0-1.3h.4v-3a.6.6 0 0 1-.6-.7Z" clip-rule="evenodd"/></svg>
                            Attention: this Space need to be duplicated to work</h2>
                            <p class="main-message">
                                To make it work, <strong>duplicate the Space</strong> and run it on your own profile using a <strong>private</strong> GPU.<br />
                                You'll be able to offload the model into CPU for less GPU memory cost (about 9.3G, compared to 27.5G if CPU offload is not enabled), but the inference time will increase significantly.
                            </p>
                            <p class="actions">
                                <a href="https://huggingface.co/spaces/{os.environ['SPACE_ID']}?duplicate=true">
                                    <img src="https://huggingface.co/datasets/huggingface/badges/resolve/main/duplicate-this-space-lg-dark.svg" alt="Duplicate this Space" />
                                </a>
                            </p>
                        </div>
                    ''', elem_id="warning-duplicate")
                else:
                    if(is_gpu_associated):
                        submit_btn = gr.Button("Generate Video", visible=False)
                    else:
                        top_description = gr.HTML(f'''
                                <div class="gr-prose">
                                <h2><svg xmlns="http://www.w3.org/2000/svg" width="18px" height="18px" style="margin-right: 0px;display: inline-block;"fill="none"><path fill="#fff" d="M7 13.2a6.3 6.3 0 0 0 4.4-10.7A6.3 6.3 0 0 0 .6 6.9 6.3 6.3 0 0 0 7 13.2Z"/><path fill="#fff" fill-rule="evenodd" d="M7 0a6.9 6.9 0 0 1 4.8 11.8A6.9 6.9 0 0 1 0 7 6.9 6.9 0 0 1 7 0Zm0 0v.7V0ZM0 7h.6H0Zm7 6.8v-.6.6ZM13.7 7h-.6.6ZM9.1 1.7c-.7-.3-1.4-.4-2.2-.4a5.6 5.6 0 0 0-4 1.6 5.6 5.6 0 0 0-1.6 4 5.6 5.6 0 0 0 1.6 4 5.6 5.6 0 0 0 4 1.7 5.6 5.6 0 0 0 4-1.7 5.6 5.6 0 0 0 1.7-4 5.6 5.6 0 0 0-1.7-4c-.5-.5-1.1-.9-1.8-1.2Z" clip-rule="evenodd"/><path fill="#000" fill-rule="evenodd" d="M7 2.9a.8.8 0 1 1 0 1.5A.8.8 0 0 1 7 3ZM5.8 5.7c0-.4.3-.6.6-.6h.7c.3 0 .6.2.6.6v3.7h.5a.6.6 0 0 1 0 1.3H6a.6.6 0 0 1 0-1.3h.4v-3a.6.6 0 0 1-.6-.7Z" clip-rule="evenodd"/></svg>
                                You have successfully duplicated the Allegro Video Generation Space 🎉</h2>
                                <p>There's only one step left before you can generate a video: <a href="https://huggingface.co/spaces/{os.environ['SPACE_ID']}/settings" style="text-decoration: underline" target="_blank">attribute a GPU</b> to it (via the Settings tab)</a>.
                                You will be billed by the minute from when you activate the GPU until when it is turned off.</p> 
                                <p class="actions">
                                    <a href="https://huggingface.co/spaces/{os.environ['SPACE_ID']}/settings">🔥 &nbsp; Set recommended GPU</a>
                                </p>
                                </div>
                        ''', elem_id="warning-setgpu")
                    
            with gr.Column():
                video_output=gr.Video(label="Generated Video")

submit_btn.click(
    fn=run_inference,
    inputs=[user_prompt, guidance_scale, num_sampling_steps, seed, enable_cpu_offload],
    outputs=video_output
)

# Launch the interface
demo.launch(show_error=True, show_api=False)