# Proyecto_1 / app.py
import gradio as gr
import torch
from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
from diffusers.utils import export_to_video
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
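
# Image-captioning model (ViT encoder + GPT-2 decoder) used to describe the uploaded image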
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
feature_extractor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
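
# Text-to-video diffusion pipeline; the fp16 variant assumes a CUDA GPU is available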
pipe = DiffusionPipeline.from_pretrained(
    "damo-vilab/text-to-video-ms-1.7b", torch_dtype=torch.float16, variant="fp16"
)
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
# Offloading manages GPU placement itself, so the pipeline must not also be moved with .to()
pipe.enable_model_cpu_offload()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
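
# Beam-search decoding settings for the caption generator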
max_length = 16
num_beams = 4
gen_kwargs = {"max_length": max_length, "num_beams": num_beams}

def image_to_text(image):
    # Gradio supplies a single PIL image; the processor expects a list of images
    pixel_values = feature_extractor(images=[image], return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)
    output_ids = model.generate(pixel_values, **gen_kwargs)
    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return preds[0].strip()

def text_to_video(image):
    prompt = image_to_text(image)
    # NOTE: on recent diffusers versions the output is batched and may need .frames[0]
    video_frames = pipe(prompt, num_inference_steps=25).frames
    video_path = export_to_video(video_frames)
    return video_path  # gr.Video expects a file path, not raw frames
title = ""
description = ""

interface = gr.Interface(
    fn=text_to_video,
    inputs=gr.Image(type="pil"),  # gr.inputs was removed in modern Gradio
    outputs=gr.Video(),
    title=title,
    description=description,
)
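# Video generation takes a while; queueing keeps long requests from timing out
interface.queue()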
interface.launch(debug=True)