omermazig committed
Commit e0d185f
1 Parent(s): 04b5589

Change app code to use my own:


* model
* transformations
* inference method (multiple clips)
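
The "multiple clips" change is the behavioural one: instead of classifying a single clip per video, the app now samples several clips, runs the classifier on all of them, and sums the per-clip logits before the softmax. A minimal illustration of that aggregation step (hypothetical tensors, not code from this commit; 5 clips and 5 classes match the new defaults):

import torch

per_clip_logits = torch.randn(5, 5)  # (num_clips, num_classes): one row of logits per sampled clip
video_logits = per_clip_logits.sum(dim=0)  # aggregate the clips into a single prediction
confidences = torch.nn.functional.softmax(video_logits, dim=-1)  # per-label scores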

Files changed (1)
  1. app.py +65 -98
app.py CHANGED
@@ -1,119 +1,86 @@
-import cv2
 import gradio as gr
-import imutils
-import numpy as np
 import torch
+from pytorchvideo.data import make_clip_sampler
+from pytorchvideo.data.clip_sampling import ClipInfoList
+from pytorchvideo.data.encoded_video_pyav import EncodedVideoPyAV
+from pytorchvideo.data.video import VideoPathHandler
 from pytorchvideo.transforms import (
-    ApplyTransformToKey,
     Normalize,
-    RandomShortSideScale,
-    RemoveKey,
-    ShortSideScale,
-    UniformTemporalSubsample,
+    UniformTemporalSubsample, RandomShortSideScale,
 )
 from torchvision.transforms import (
     Compose,
     Lambda,
-    RandomCrop,
-    RandomHorizontalFlip,
-    Resize,
+    Resize, RandomCrop,
 )
-from transformers import VideoMAEFeatureExtractor, VideoMAEForVideoClassification
+from transformers import pipeline
 
-MODEL_CKPT = "sayakpaul/videomae-base-finetuned-kinetics-finetuned-ucf101-subset"
-DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
-MODEL = VideoMAEForVideoClassification.from_pretrained(MODEL_CKPT).to(DEVICE)
-PROCESSOR = VideoMAEFeatureExtractor.from_pretrained(MODEL_CKPT)
-
-RESIZE_TO = PROCESSOR.size["shortest_edge"]
-NUM_FRAMES_TO_SAMPLE = MODEL.config.num_frames
-IMAGE_STATS = {"image_mean": [0.485, 0.456, 0.406], "image_std": [0.229, 0.224, 0.225]}
-VAL_TRANSFORMS = Compose(
-    [
-        UniformTemporalSubsample(NUM_FRAMES_TO_SAMPLE),
-        Lambda(lambda x: x / 255.0),
-        Normalize(IMAGE_STATS["image_mean"], IMAGE_STATS["image_std"]),
-        Resize((RESIZE_TO, RESIZE_TO)),
-    ]
-)
-LABELS = list(MODEL.config.label2id.keys())
-
-
-def parse_video(video_file):
-    """A utility to parse the input videos.
-
-    Reference: https://pyimagesearch.com/2018/11/12/yolo-object-detection-with-opencv/
-    """
-    vs = cv2.VideoCapture(video_file)
-
-    # try to determine the total number of frames in the video file
-    try:
-        prop = (
-            cv2.cv.CV_CAP_PROP_FRAME_COUNT
-            if imutils.is_cv2()
-            else cv2.CAP_PROP_FRAME_COUNT
-        )
-        total = int(vs.get(prop))
-        print("[INFO] {} total frames in video".format(total))
-
-    # an error occurred while trying to determine the total
-    # number of frames in the video file
-    except:
-        print("[INFO] could not determine # of frames in video")
-        print("[INFO] no approx. completion time can be provided")
-        total = -1
-
-    frames = []
-
-    # loop over frames from the video file stream
-    while True:
-        # read the next frame from the file
-        (grabbed, frame) = vs.read()
-        if frame is not None:
-            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-            frames.append(frame)
-        # if the frame was not grabbed, then we have reached the end
-        # of the stream
-        if not grabbed:
-            break
-
-    return frames
-
-
-def preprocess_video(frames: list):
-    """Utility to apply preprocessing transformations to a video tensor."""
-    # Each frame in the `frames` list has the shape: (height, width, num_channels).
-    # Collated together the `frames` has the the shape: (num_frames, height, width, num_channels).
-    # So, after converting the `frames` list to a torch tensor, we permute the shape
-    # such that it becomes (num_channels, num_frames, height, width) to make
-    # the shape compatible with the preprocessing transformations. After applying the
-    # preprocessing chain, we permute the shape to (num_frames, num_channels, height, width)
-    # to make it compatible with the model. Finally, we add a batch dimension so that our video
-    # classification model can operate on it.
-    video_tensor = torch.tensor(np.array(frames).astype(frames[0].dtype))
-    video_tensor = video_tensor.permute(
-        3, 0, 1, 2
-    )  # (num_channels, num_frames, height, width)
-    video_tensor_pp = VAL_TRANSFORMS(video_tensor)
-    video_tensor_pp = video_tensor_pp.permute(
-        1, 0, 2, 3
-    )  # (num_frames, num_channels, height, width)
-    video_tensor_pp = video_tensor_pp.unsqueeze(0)
-    return video_tensor_pp.to(DEVICE)
+MODEL_CKPT = "omermazig/videomae-finetuned-nba-5-class-4-batch-8000-vid-multiclass"
+DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+CLIPS_FROM_SINGLE_VIDEO = 5
+
+pipe = pipeline("video-classification", model=MODEL_CKPT)
+trained_model = pipe.model
+image_processor = pipe.image_processor
+
+mean = image_processor.image_mean
+std = image_processor.image_std
+if "shortest_edge" in image_processor.size:
+    height = width = image_processor.size["shortest_edge"]
+else:
+    height = image_processor.size["height"]
+    width = image_processor.size["width"]
+resize_to = (height, width)
+
+num_frames_to_sample = trained_model.config.num_frames
+sample_rate = 4
+fps = 30
+clip_duration = num_frames_to_sample * sample_rate / fps
+
+# Validation and Test datasets' transformations.
+inference_transform = Compose(
+    [
+        UniformTemporalSubsample(num_frames_to_sample),
+        Lambda(lambda x: x / 255.0),
+        Normalize(mean, std),
+        RandomShortSideScale(min_size=256, max_size=320),
+        RandomCrop(resize_to),
+    ]
+)
+
+labels = list(trained_model.config.label2id.keys())
+
+
+def parse_video_to_clips(video_file):
+    """A utility to parse the input videos """
+    video_path_handler = VideoPathHandler()
+    video: EncodedVideoPyAV = video_path_handler.video_from_path(video_file)
+
+    clip_sampler = make_clip_sampler("random_multi", clip_duration, CLIPS_FROM_SINGLE_VIDEO)
+    # noinspection PyTypeChecker
+    clip_info: ClipInfoList = clip_sampler(0, video.duration, {})
+
+    video_clips_list = []
+    for clip_start, clip_end in zip(clip_info.clip_start_sec, clip_info.clip_end_sec):
+        video_clip = video.get_clip(clip_start, clip_end)["video"]
+        video_clips_list.append(inference_transform(video_clip))
+
+    videos_tensor = torch.stack([single_clip.permute(1, 0, 2, 3) for single_clip in video_clips_list])
+    return videos_tensor
 
 
 def infer(video_file):
-    frames = parse_video(video_file)
-    video_tensor = preprocess_video(frames)
-    inputs = {"pixel_values": video_tensor}
+    videos_tensor = parse_video_to_clips(video_file)
+    inputs = {"pixel_values": videos_tensor}
 
     # forward pass
     with torch.no_grad():
-        outputs = MODEL(**inputs)
-        logits = outputs.logits
+        outputs = trained_model(**inputs)
+        multiple_logits = outputs.logits
+        logits = multiple_logits.sum(dim=0)
     softmax_scores = torch.nn.functional.softmax(logits, dim=-1).squeeze(0)
-    confidences = {LABELS[i]: float(softmax_scores[i]) for i in range(len(LABELS))}
+    confidences = {labels[i]: float(softmax_scores[i]) for i in range(len(labels))}
     return confidences
 
 
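
The hunk accounts for all 65 additions and 98 deletions, so the Gradio wiring further down app.py is untouched by this commit. For context, a hypothetical minimal way to expose the new infer function (illustrative only, not part of this diff):

import gradio as gr

# Hypothetical wiring, for illustration only; the real interface lives below this hunk.
demo = gr.Interface(
    fn=infer,                             # returns a {label: confidence} dict
    inputs=gr.Video(),                    # path to the uploaded video file
    outputs=gr.Label(num_top_classes=5),  # renders the confidences
)

if __name__ == "__main__":
    demo.launch()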