Kaori1707 committed on
Commit
76b1888
1 Parent(s): 1e1b948

Add application file

Files changed (2)
  1. app.py +71 -0
  2. requirements.txt +4 -0
app.py ADDED
@@ -0,0 +1,71 @@
+ from decord import VideoReader
+ import torch
+ from transformers import AutoImageProcessor, AutoTokenizer, VisionEncoderDecoderModel
+ import gradio as gr
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ # load the pretrained frame processor, tokenizer, and captioning model
+ image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")
+ tokenizer = AutoTokenizer.from_pretrained("gpt2")
+ model = VisionEncoderDecoderModel.from_pretrained(
+     "Neleac/timesformer-gpt2-video-captioning"
+ ).to(device)
+
+
+ with gr.Blocks() as demo:
+     demo.title = "Video Captioning"
+     gr.Markdown(
+         '<img src=file/assets/AISEEDlogo.png style="width: 20%; height: 20%"/> \n \
+         Video Captioning, demo by AISEED'
+     )
+     with gr.Row():
+         with gr.Column(scale=2):
+             video = gr.Video(label="Upload Video", format="mp4")
+             generate = gr.Button(value="Generate Caption")
+         with gr.Column(scale=1):
+             text = gr.Textbox(label="Caption", placeholder="Caption will appear here")
+             with gr.Accordion("Settings", open=True):
+                 with gr.Row():
+                     max_length = gr.Slider(
+                         label="Max Length", minimum=10, maximum=100, value=20, step=1
+                     )
+                     min_length = gr.Slider(
+                         label="Min Length", minimum=1, maximum=10, value=10, step=1
+                     )
+                 beam_size = gr.Slider(label="Beam size", minimum=1, maximum=8, value=8, step=1)
+                 throughputs = gr.Radio(
+                     label="Throughputs", choices=[1, 2, 3], value=1
+                 )
+
+     def generate_caption(video, max_length, min_length, beam_size, throughputs):
+         # sample evenly spaced frames from the uploaded video
+         container = VideoReader(video)
+         clip_len = model.config.encoder.num_frames
+         frames = container.get_batch(
+             range(0, len(container), len(container) // (throughputs * clip_len))
+         ).asnumpy()
+         frames = list(frames[: throughputs * clip_len])  # keep exactly the expected number of frames
+
+         # preprocess the sampled frames
+         # generate a caption with beam search
+         gen_kwargs = {
+             "min_length": min_length,
+             "max_length": max_length,
+             "num_beams": beam_size,
+         }
+         pixel_values = image_processor(frames, return_tensors="pt").pixel_values.to(
+             device
+         )
+         tokens = model.generate(pixel_values, **gen_kwargs)
+         caption = tokenizer.batch_decode(tokens, skip_special_tokens=True)[0]
+         return caption
+
+     generate.click(
+         generate_caption,
+         inputs=[video, max_length, min_length, beam_size, throughputs],
+         outputs=text,
+     )
+
+ if __name__ == "__main__":
+     demo.launch()
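
A quick sanity check of the frame-sampling arithmetic in generate_caption (a standalone sketch, not part of the commit; the 180-frame video length is a made-up example, and clip_len of 8 assumes the TimeSformer encoder's default num_frames):

    # hypothetical example: a 180-frame video, one clip of 8 frames
    n_frames, throughputs, clip_len = 180, 1, 8
    step = n_frames // (throughputs * clip_len)   # 180 // 8 == 22
    indices = list(range(0, n_frames, step))      # 9 indices: 0, 22, ..., 176
    # the sampler can overshoot by one frame, so app.py truncates to the expected count
    assert len(indices[: throughputs * clip_len]) == clip_len

Truncating to throughputs * clip_len keeps the frame count correct even when len(container) is an exact multiple of the step, in which case range() yields no surplus frame that could be dropped.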
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ transformers
+ decord
+ torch
+ gradio