# Proyecto_1 / app.py
# Last commit: "Update app.py" (6c08d2a) by LiquidoNoNewtoniano
# File size: 1.79 kB
import gradio as gr
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
import torch
import accelerate
from PIL import Image
from diffusers.utils import export_to_video
# Pick the compute device first so every later choice (dtype, offloading,
# explicit placement) can depend on it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Image-captioning stack: vision encoder / text decoder model, the image
# preprocessor that produces its pixel tensors, and the tokenizer that turns
# generated ids back into text.
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
feature_extractor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
model.to(device)

# Text-to-video pipeline. The fp16 checkpoint is always the one downloaded;
# it stays in half precision on the GPU and is upcast to float32 on the CPU
# fallback, where half-precision inference is not generally supported.
pipe = DiffusionPipeline.from_pretrained(
    "damo-vilab/text-to-video-ms-1.7b",
    torch_dtype=torch.float16 if device.type == "cuda" else torch.float32,
    variant="fp16",
)
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
if device.type == "cuda":
    # Model CPU offload moves sub-models to the GPU on demand. Do NOT also
    # call pipe.to("cuda"): that would pin everything on the GPU and throw
    # away the memory savings offloading is enabled for.
    pipe.enable_model_cpu_offload()
else:
    # No GPU to offload to; just keep the whole pipeline on the CPU.
    pipe = pipe.to(device)

# Caption generation settings, forwarded as keyword arguments to
# model.generate() by image_to_text().
max_length = 16
num_beams = 4
gen_kwargs = {"max_length": max_length, "num_beams": num_beams}
def image_to_text(image_paths):
    """Generate a short caption for a single image.

    Args:
        image_paths: Despite the plural "paths" name (kept for backward
            compatibility), this is ONE image object, which is wrapped in a
            one-element batch and passed to the feature extractor. The UI in
            this file supplies a PIL image.

    Returns:
        The caption for the image as a whitespace-stripped string.

    Raises:
        ValueError: If no image was supplied (``image_paths`` is ``None``).
    """
    if image_paths is None:
        # Happens when the form is submitted without an uploaded image; fail
        # with a clear message instead of an opaque preprocessing error.
        raise ValueError("image_to_text() needs an image, got None")

    # The captioning model works on 3-channel input; normalise PIL images in
    # other modes (grayscale, RGBA, palette...). Objects without a ``mode``
    # attribute are passed through untouched.
    if getattr(image_paths, "mode", "RGB") != "RGB":
        image_paths = image_paths.convert("RGB")

    images = [image_paths]
    pixel_values = feature_extractor(images=images, return_tensors="pt").pixel_values
    # Inputs must live on the same device the model was moved to.
    pixel_values = pixel_values.to(device)

    output_ids = model.generate(pixel_values, **gen_kwargs)
    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    # Single-image batch, so the first (only) decoded string is the caption.
    return preds[0].strip()
def text_to_video(image_paths):
    """Turn an image into a short video clip via an intermediate caption.

    The image is captioned with ``image_to_text``; the caption is then used
    as the prompt for the text-to-video pipeline and the resulting frames are
    written to a video file.

    Args:
        image_paths: The single input image, forwarded unchanged to
            ``image_to_text`` (the plural name is historical).

    Returns:
        The path of the exported video file, as returned by
        ``export_to_video``.
    """
    prompt = image_to_text(image_paths)
    # NOTE(review): some diffusers releases return ``frames`` with a leading
    # batch dimension (needing ``.frames[0]``) -- confirm against the
    # installed version.
    video_frames = pipe(prompt, num_inference_steps=25).frames
    video_path = export_to_video(video_frames)
    # Return the file path, not the raw frames: the video output component
    # needs a file it can serve.
    return video_path
# Heading and blurb for the demo page; both are currently left empty.
title = ""
description = ""

# All UI wiring in one place: a PIL image goes in, text_to_video runs on it,
# and the result is shown in a video component.
interface_options = {
    "fn": text_to_video,
    "inputs": gr.Image(type="pil"),
    "outputs": gr.Video(),
    "title": title,
    "description": description,
}
interface = gr.Interface(**interface_options)

# Start the web server; debug=True is passed through to launch() unchanged.
interface.launch(debug=True)