#!/usr/bin/env python3
"""
Copyright (c) 2022, salesforce.com, inc.
All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""
import numbers
import random

from torchvision.transforms import (
    RandomCrop,
    RandomResizedCrop,
)

import video_llama.processors.functional_video as F

__all__ = [
    "RandomCropVideo",
    "RandomResizedCropVideo",
    "CenterCropVideo",
    "NormalizeVideo",
    "ToTensorVideo",
    "RandomHorizontalFlipVideo",
]


class RandomCropVideo(RandomCrop):
    def __init__(self, size):
        if isinstance(size, numbers.Number):
            self.size = (int(size), int(size))
        else:
            self.size = size

    def __call__(self, clip):
        """
        Args:
            clip (torch.tensor): Video clip to be cropped. Size is (C, T, H, W)
        Returns:
            torch.tensor: randomly cropped video clip.
                size is (C, T, OH, OW)
""" | |
i, j, h, w = self.get_params(clip, self.size) | |
return F.crop(clip, i, j, h, w) | |
def __repr__(self) -> str: | |
return f"{self.__class__.__name__}(size={self.size})" | |
class RandomResizedCropVideo(RandomResizedCrop): | |
def __init__( | |
self, | |
size, | |
scale=(0.08, 1.0), | |
ratio=(3.0 / 4.0, 4.0 / 3.0), | |
interpolation_mode="bilinear", | |
): | |
if isinstance(size, tuple): | |
if len(size) != 2: | |
raise ValueError( | |
f"size should be tuple (height, width), instead got {size}" | |
) | |
self.size = size | |
else: | |
self.size = (size, size) | |
self.interpolation_mode = interpolation_mode | |
self.scale = scale | |
self.ratio = ratio | |
def __call__(self, clip): | |
""" | |
Args: | |
clip (torch.tensor): Video clip to be cropped. Size is (C, T, H, W) | |
Returns: | |
            torch.tensor: randomly cropped and resized video clip.
                size is (C, T, size[0], size[1])
""" | |
i, j, h, w = self.get_params(clip, self.scale, self.ratio) | |
return F.resized_crop(clip, i, j, h, w, self.size, self.interpolation_mode) | |
def __repr__(self) -> str: | |
return f"{self.__class__.__name__}(size={self.size}, interpolation_mode={self.interpolation_mode}, scale={self.scale}, ratio={self.ratio})" | |
class CenterCropVideo: | |
def __init__(self, crop_size): | |
if isinstance(crop_size, numbers.Number): | |
self.crop_size = (int(crop_size), int(crop_size)) | |
else: | |
self.crop_size = crop_size | |
def __call__(self, clip): | |
""" | |
Args: | |
clip (torch.tensor): Video clip to be cropped. Size is (C, T, H, W) | |
Returns: | |
torch.tensor: central cropping of video clip. Size is | |
(C, T, crop_size, crop_size) | |
""" | |
return F.center_crop(clip, self.crop_size) | |
def __repr__(self) -> str: | |
return f"{self.__class__.__name__}(crop_size={self.crop_size})" | |
class NormalizeVideo: | |
""" | |
Normalize the video clip by mean subtraction and division by standard deviation | |
Args: | |
mean (3-tuple): pixel RGB mean | |
std (3-tuple): pixel RGB standard deviation | |
        inplace (boolean): whether to do in-place normalization
""" | |
def __init__(self, mean, std, inplace=False): | |
self.mean = mean | |
self.std = std | |
self.inplace = inplace | |
def __call__(self, clip): | |
""" | |
Args: | |
clip (torch.tensor): video clip to be normalized. Size is (C, T, H, W) | |
""" | |
return F.normalize(clip, self.mean, self.std, self.inplace) | |
def __repr__(self) -> str: | |
return f"{self.__class__.__name__}(mean={self.mean}, std={self.std}, inplace={self.inplace})" | |
class ToTensorVideo: | |
""" | |
Convert tensor data type from uint8 to float, divide value by 255.0 and | |
permute the dimensions of clip tensor | |
""" | |
def __init__(self): | |
pass | |
def __call__(self, clip): | |
""" | |
Args: | |
clip (torch.tensor, dtype=torch.uint8): Size is (T, H, W, C) | |
Return: | |
clip (torch.tensor, dtype=torch.float): Size is (C, T, H, W) | |
""" | |
return F.to_tensor(clip) | |
def __repr__(self) -> str: | |
return self.__class__.__name__ | |
class RandomHorizontalFlipVideo: | |
""" | |
    Flip the video clip along the horizontal direction with a given probability
    Args:
        p (float): probability of the clip being flipped. Default value is 0.5
    """

    def __init__(self, p=0.5):
        self.p = p

    def __call__(self, clip):
        """
        Args:
            clip (torch.tensor): Size is (C, T, H, W)
        Return:
            clip (torch.tensor): Size is (C, T, H, W)
        """
        if random.random() < self.p:
            clip = F.hflip(clip)
        return clip

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(p={self.p})"