from decord import VideoReader
import torch
from transformers import AutoImageProcessor, AutoTokenizer, VisionEncoderDecoderModel
import gradio as gr
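
# Run on the GPU when available, otherwise fall back to the CPU.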
device = "cuda" if torch.cuda.is_available() else "cpu"
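
# Load the frame processor, the GPT-2 tokenizer, and the pretrained
# TimeSformer-GPT2 video captioning model.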
image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = VisionEncoderDecoderModel.from_pretrained(
    "Neleac/timesformer-gpt2-video-captioning"
).to(device)
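
# Gradio UI: video upload and a generate button on the left, the caption box
# and generation settings on the right.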
with gr.Blocks() as demo:
    demo.title = "Video Captioning"
    gr.Markdown("# Video Captioning, demo by AISEED")
    with gr.Row():
        with gr.Column(scale=2):
            video = gr.Video(label="Upload Video", format="mp4")
            generate = gr.Button(value="Generate Caption")
        with gr.Column(scale=1):
            text = gr.Textbox(label="Caption", placeholder="Caption will appear here")
            with gr.Accordion("Settings", open=True):
                with gr.Row():
                    max_length = gr.Slider(
                        label="Max Length", minimum=10, maximum=100, value=20, step=1
                    )
                    min_length = gr.Slider(
                        label="Min Length", minimum=1, maximum=10, value=10, step=1
                    )
                    beam_size = gr.Slider(
                        label="Beam Size", minimum=1, maximum=8, value=8, step=1
                    )
                    throughputs = gr.Radio(
                        label="Throughputs", choices=[1, 2, 3], value=1
                    )
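
    # Read the uploaded video, sample frames, and generate a caption.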
    def generate_caption(video, max_length, min_length, beam_size, throughputs):
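        # Sample roughly `throughputs * clip_len` frames at an even stride
        # across the video (the stride is clamped to at least 1 for short clips).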
        container = VideoReader(video)
        clip_len = model.config.encoder.num_frames
        stride = max(1, len(container) // (throughputs * clip_len))
        frames = container.get_batch(range(0, len(container), stride)).asnumpy()
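        # Convert the batch to a list of frames, dropping the final frame.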
        frames = [frame for frame in frames[:-1]]
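
        # Decoding options forwarded to model.generate().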
        gen_kwargs = {
            "min_length": min_length,
            "max_length": max_length,
            "num_beams": beam_size,
        }
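        # Preprocess the frames and move the tensor to the model's device.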
        pixel_values = image_processor(frames, return_tensors="pt").pixel_values.to(
            device
        )
        tokens = model.generate(pixel_values, **gen_kwargs)
        caption = tokenizer.batch_decode(tokens, skip_special_tokens=True)[0]
        return caption
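
    # Wire the button to the captioning function.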
    generate.click(
        generate_caption,
        inputs=[video, max_length, min_length, beam_size, throughputs],
        outputs=text,
    )
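
# Launch the Gradio app when the script is run directly.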
if __name__ == "__main__":
    demo.launch()