"""Gradio demo: animate a still image with the I2VGen-XL image-to-video pipeline."""

import functools
import tempfile

import gradio as gr
import numpy as np
import torch
import torchvision
from diffusers import I2VGenXLPipeline
from diffusers.utils.loading_utils import load_image
from PIL import Image

MODEL_ID = "ali-vilab/i2vgen-xl"
NEGATIVE_PROMPT = (
    "Distorted, discontinuous, Ugly, blurry, low resolution, motionless, "
    "static, disfigured, disconnected limbs, Ugly faces, incomplete arms"
)
SEED = 8888
FPS = 16


@functools.lru_cache(maxsize=1)
def _load_pipeline() -> I2VGenXLPipeline:
    """Build the I2VGen-XL pipeline once and reuse it for every request.

    Loading is deferred to the first call so that importing this module stays
    cheap; the cache then keeps the single configured instance alive.
    """
    pipeline = I2VGenXLPipeline.from_pretrained(
        MODEL_ID, torch_dtype=torch.float16, variant="fp16"
    )
    pipeline.enable_model_cpu_offload()
    pipeline.unet.enable_forward_chunking()
    return pipeline


def _frames_to_tensor(frames) -> torch.Tensor:
    """Stack per-frame images into a single (T, H, W, C) tensor.

    ``np.asarray`` accepts PIL images as well as arrays, so this works for the
    pipeline's default output (a list of PIL images per the diffusers docs).
    """
    return torch.from_numpy(np.stack([np.asarray(frame) for frame in frames]))


def generate(image: Image.Image, prompt: str):
    """Generate a short video from ``image`` guided by ``prompt``.

    Args:
        image: Source image supplied by the Gradio image input.
        prompt: Text describing the desired motion/content.

    Returns:
        Path to the rendered ``.mp4`` file.

    Raises:
        gr.Error: If no image was provided.
    """
    if image is None:
        # Gradio passes None when the user submits without uploading anything.
        raise gr.Error("Please upload an image first.")

    # Fixed seed: the same image/prompt pair yields the same video.
    generator = torch.manual_seed(SEED)
    image = image.convert("RGB")

    frames = _load_pipeline()(
        prompt=prompt,
        image=image,
        num_inference_steps=50,
        negative_prompt=NEGATIVE_PROMPT,
        guidance_scale=9.0,
        generator=generator,
        decode_chunk_size=6,
    ).frames[0]

    # Reserve a unique path per request so overlapping calls never overwrite
    # each other's output. The file is intentionally kept (delete=False)
    # because Gradio reads it after this function returns.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
        output_path = tmp.name

    torchvision.io.write_video(output_path, _frames_to_tensor(frames), fps=FPS)
    return output_path


app = gr.Interface(
    fn=generate,
    inputs=[gr.Image(type="pil"), "text"],
    outputs=gr.Video(),
)

if __name__ == "__main__":
    app.launch()