Kaori1707 committed on
Commit
76b1888
1 Parent(s): 1e1b948

Add application file

Files changed (2)
  1. app.py +71 -0
  2. requirements.txt +4 -0
app.py ADDED
@@ -0,0 +1,71 @@
+ from decord import VideoReader
+ import torch
+ from transformers import AutoImageProcessor, AutoTokenizer, VisionEncoderDecoderModel
+ import gradio as gr
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ # load the pretrained frame processor, tokenizer, and captioning model
+ image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")
+ tokenizer = AutoTokenizer.from_pretrained("gpt2")
+ model = VisionEncoderDecoderModel.from_pretrained(
+     "Neleac/timesformer-gpt2-video-captioning"
+ ).to(device)
+
+
+ with gr.Blocks() as demo:
+     demo.title = "Video Captioning"
+     gr.Markdown(
+         '<img src=file/assets/AISEEDlogo.png style="width: 20%; height: 20%"/> \n \
+         Video Captioning, demo by AISEED'
+     )
+     with gr.Row():
+         with gr.Column(scale=2):
+             video = gr.Video(label="Upload Video", format="mp4")
+             generate = gr.Button(value="Generate Caption")
+         with gr.Column(scale=1):
+             text = gr.Textbox(label="Caption", placeholder="Caption will appear here")
+             with gr.Accordion("Settings", open=True):
+                 with gr.Row():
+                     max_length = gr.Slider(
+                         label="Max Length", minimum=10, maximum=100, value=20, step=1
+                     )
+                     min_length = gr.Slider(
+                         label="Min Length", minimum=1, maximum=10, value=10, step=1
+                     )
+                 beam_size = gr.Slider(label="Beam size", minimum=1, maximum=8, value=8, step=1)
+                 throughputs = gr.Radio(
+                     label="Throughputs", choices=[1, 2, 3], value=1
+                 )
+
+     def generate_caption(video, max_length, min_length, beam_size, throughputs):
+         # sample evenly spaced frames from the uploaded video
+         container = VideoReader(video)
+         clip_len = model.config.encoder.num_frames
+         frames = container.get_batch(
+             range(0, len(container), len(container) // (throughputs * clip_len))
+         ).asnumpy()
+         frames = list(frames[: throughputs * clip_len])  # keep exactly the expected number of frames
+
+         # preprocess the sampled frames
+         # generate a caption with beam search
+         gen_kwargs = {
+             "min_length": min_length,
+             "max_length": max_length,
+             "num_beams": beam_size,
+         }
+         pixel_values = image_processor(frames, return_tensors="pt").pixel_values.to(
+             device
+         )
+         tokens = model.generate(pixel_values, **gen_kwargs)
+         caption = tokenizer.batch_decode(tokens, skip_special_tokens=True)[0]
+         return caption
+
+     generate.click(
+         generate_caption,
+         inputs=[video, max_length, min_length, beam_size, throughputs],
+         outputs=text,
+     )
+
+ if __name__ == "__main__":
+     demo.launch()
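
A quick sanity check of the frame-sampling arithmetic in generate_caption (a standalone sketch, not part of the commit; the 180-frame video length is a made-up example, and clip_len of 8 assumes the TimeSformer encoder's default num_frames):

    # hypothetical example: a 180-frame video, one clip of 8 frames
    n_frames, throughputs, clip_len = 180, 1, 8
    step = n_frames // (throughputs * clip_len)   # 180 // 8 == 22
    indices = list(range(0, n_frames, step))      # 9 indices: 0, 22, ..., 176
    # the sampler can overshoot by one frame, so app.py truncates to the expected count
    assert len(indices[: throughputs * clip_len]) == clip_len

Truncating to throughputs * clip_len keeps the frame count correct even when len(container) is an exact multiple of the step, in which case range() yields no surplus frame that could be dropped.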
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ transformers
+ decord
+ torch
+ gradio