Ryan-Pham committed
Commit 6445525
1 Parent(s): 1cb3514

Upload 94 files

This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
Files changed (50)
  1. .gitattributes +7 -0
  2. config/__init__.py +21 -0
  3. config/__pycache__/__init__.cpython-310.pyc +0 -0
  4. config/__pycache__/__init__.cpython-37.pyc +0 -0
  5. config/__pycache__/dataset_config.cpython-310.pyc +0 -0
  6. config/__pycache__/dataset_config.cpython-37.pyc +0 -0
  7. config/__pycache__/yowo_v2_config.cpython-310.pyc +0 -0
  8. config/__pycache__/yowo_v2_config.cpython-37.pyc +0 -0
  9. config/categories_count_32s2fpsnew.json +1 -0
  10. config/dataset_config.py +94 -0
  11. config/yowo_v2_config.py +84 -0
  12. dataset/__init__.py +0 -0
  13. dataset/__pycache__/__init__.cpython-310.pyc +0 -0
  14. dataset/__pycache__/__init__.cpython-37.pyc +0 -0
  15. dataset/__pycache__/ava.cpython-310.pyc +0 -0
  16. dataset/__pycache__/ava.cpython-37.pyc +0 -0
  17. dataset/__pycache__/ava_helper.cpython-310.pyc +0 -0
  18. dataset/__pycache__/ava_helper.cpython-37.pyc +0 -0
  19. dataset/__pycache__/transforms.cpython-310.pyc +0 -0
  20. dataset/__pycache__/transforms.cpython-37.pyc +0 -0
  21. dataset/__pycache__/ucf_jhmdb.cpython-310.pyc +0 -0
  22. dataset/__pycache__/ucf_jhmdb.cpython-37.pyc +0 -0
  23. dataset/ava.py +300 -0
  24. dataset/ava_helper.py +231 -0
  25. dataset/transforms.py +176 -0
  26. dataset/ucf24_demo/v_Basketball_g01_c02.mp4 +0 -0
  27. dataset/ucf24_demo/v_Basketball_g07_c04.mp4 +0 -0
  28. dataset/ucf24_demo/v_Biking_g01_c01.mp4 +3 -0
  29. dataset/ucf24_demo/v_CliffDiving_g03_c01.mp4 +3 -0
  30. dataset/ucf24_demo/v_Fencing_g01_c06.mp4 +3 -0
  31. dataset/ucf24_demo/v_HorseRiding_g01_c03.mp4 +3 -0
  32. dataset/ucf24_demo/v_IceDancing_g02_c05.mp4 +3 -0
  33. dataset/ucf24_demo/v_SalsaSpin_g03_c01.mp4 +3 -0
  34. dataset/ucf24_demo/v_SkateBoarding_g02_c01.mp4 +3 -0
  35. dataset/ucf_jhmdb.py +311 -0
  36. evaluator/__init__.py +0 -0
  37. evaluator/__pycache__/__init__.cpython-310.pyc +0 -0
  38. evaluator/__pycache__/__init__.cpython-37.pyc +0 -0
  39. evaluator/__pycache__/ava_eval_helper.cpython-310.pyc +0 -0
  40. evaluator/__pycache__/ava_eval_helper.cpython-37.pyc +0 -0
  41. evaluator/__pycache__/ava_evaluator.cpython-310.pyc +0 -0
  42. evaluator/__pycache__/ava_evaluator.cpython-37.pyc +0 -0
  43. evaluator/__pycache__/cal_frame_mAP.cpython-310.pyc +0 -0
  44. evaluator/__pycache__/cal_frame_mAP.cpython-37.pyc +0 -0
  45. evaluator/__pycache__/cal_video_mAP.cpython-310.pyc +0 -0
  46. evaluator/__pycache__/cal_video_mAP.cpython-37.pyc +0 -0
  47. evaluator/__pycache__/ucf_jhmdb_evaluator.cpython-310.pyc +0 -0
  48. evaluator/__pycache__/ucf_jhmdb_evaluator.cpython-37.pyc +0 -0
  49. evaluator/__pycache__/utils.cpython-310.pyc +0 -0
  50. evaluator/__pycache__/utils.cpython-37.pyc +0 -0
.gitattributes CHANGED
@@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ dataset/ucf24_demo/v_Biking_g01_c01.mp4 filter=lfs diff=lfs merge=lfs -text
+ dataset/ucf24_demo/v_CliffDiving_g03_c01.mp4 filter=lfs diff=lfs merge=lfs -text
+ dataset/ucf24_demo/v_Fencing_g01_c06.mp4 filter=lfs diff=lfs merge=lfs -text
+ dataset/ucf24_demo/v_HorseRiding_g01_c03.mp4 filter=lfs diff=lfs merge=lfs -text
+ dataset/ucf24_demo/v_IceDancing_g02_c05.mp4 filter=lfs diff=lfs merge=lfs -text
+ dataset/ucf24_demo/v_SalsaSpin_g03_c01.mp4 filter=lfs diff=lfs merge=lfs -text
+ dataset/ucf24_demo/v_SkateBoarding_g02_c01.mp4 filter=lfs diff=lfs merge=lfs -text
config/__init__.py ADDED
@@ -0,0 +1,21 @@
+ from .dataset_config import dataset_config
+ from .yowo_v2_config import yowo_v2_config
+
+
+ def build_model_config(args):
+     print('==============================')
+     print('Model Config: {} '.format(args.version.upper()))
+
+     if 'yowo_v2_' in args.version:
+         m_cfg = yowo_v2_config[args.version]
+
+     return m_cfg
+
+
+ def build_dataset_config(args):
+     print('==============================')
+     print('Dataset Config: {} '.format(args.dataset.upper()))
+
+     d_cfg = dataset_config[args.dataset]
+
+     return d_cfg
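A minimal usage sketch for these two builders; the SimpleNamespace stands in for the repo's argparse namespace, whose exact flags are not part of this diff:

    from types import SimpleNamespace
    from config import build_model_config, build_dataset_config  # assumes repo root on PYTHONPATH

    args = SimpleNamespace(version='yowo_v2_nano', dataset='ucf24')
    m_cfg = build_model_config(args)    # -> yowo_v2_config['yowo_v2_nano']
    d_cfg = build_dataset_config(args)  # -> dataset_config['ucf24']

Note that build_model_config leaves m_cfg unbound when args.version does not contain 'yowo_v2_', so an unknown version fails with a NameError rather than a clear error message.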
config/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (697 Bytes)
config/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (710 Bytes)
config/__pycache__/dataset_config.cpython-310.pyc ADDED
Binary file (1.72 kB)
config/__pycache__/dataset_config.cpython-37.pyc ADDED
Binary file (1.54 kB)
config/__pycache__/yowo_v2_config.cpython-310.pyc ADDED
Binary file (846 Bytes)
config/__pycache__/yowo_v2_config.cpython-37.pyc ADDED
Binary file (836 Bytes)
config/categories_count_32s2fpsnew.json ADDED
@@ -0,0 +1 @@
+ {"fighter": 9420, "threatener": 3964, "victim": 9264, "outsider": 14812}
config/dataset_config.py ADDED
@@ -0,0 +1,94 @@
+ # Dataset configuration
+
+
+ dataset_config = {
+     'ucf24': {
+         # dataset
+         'gt_folder': './evaluator/groundtruths_ucf_jhmdb/groundtruths_ucf/',
+         # input size
+         'train_size': 224,
+         'test_size': 224,
+         # transform
+         'jitter': 0.2,
+         'hue': 0.1,
+         'saturation': 1.5,
+         'exposure': 1.5,
+         'sampling_rate': 1,
+         # cls label
+         'multi_hot': False,  # one hot
+         # optimizer
+         'optimizer': 'adamw',
+         'momentum': 0.9,
+         'weight_decay': 5e-4,
+         # warmup strategy
+         'warmup': 'linear',
+         'warmup_factor': 0.00066667,
+         'wp_iter': 500,
+         # class names
+         'valid_num_classes': 24,
+         'label_map': (
+             'Basketball', 'BasketballDunk', 'Biking', 'CliffDiving',
+             'CricketBowling', 'Diving', 'Fencing', 'FloorGymnastics',
+             'GolfSwing', 'HorseRiding', 'IceDancing', 'LongJump',
+             'PoleVault', 'RopeClimbing', 'SalsaSpin', 'SkateBoarding',
+             'Skiing', 'Skijet', 'SoccerJuggling', 'Surfing',
+             'TennisSwing', 'TrampolineJumping', 'VolleyballSpiking', 'WalkingWithDog'
+         ),
+     },
+
+     'ava_v2.2': {
+         # dataset
+         'frames_dir': 'frames/',
+         'frame_list': 'frame_lists/',
+         'annotation_dir': 'annotations/',
+         'train_gt_box_list': 'train.csv',
+         'val_gt_box_list': 'val.csv',
+         'train_exclusion_file': 'ava_train_excluded_timestamps_v2.2.csv',
+         'val_exclusion_file': 'ava_val_excluded_timestamps_v2.2.csv',
+         'labelmap_file': 'ava_action_list_v2.1_for_activitynet_2018.pbtxt',  # 'ava_v2.2/ava_action_list_v2.2.pbtxt',
+         'class_ratio_file': 'categories_count_32s2fpsnew.json',
+         'backup_dir': 'C:/Users/Administrator/Downloads/YOWOv2/backup_dir',
+         # input size
+         'train_size': 224,
+         'test_size': 224,
+         # transform
+         'jitter': 0.2,
+         'hue': 0.1,
+         'saturation': 1.5,
+         'exposure': 1.5,
+         'sampling_rate': 1,
+         # cls label
+         'multi_hot': True,  # multi hot
+         # train config
+         'optimizer': 'adamw',
+         'momentum': 0.9,
+         'weight_decay': 5e-4,
+         # warmup strategy
+         'warmup': 'linear',
+         'warmup_factor': 0.00066667,
+         'wp_iter': 500,
+         # class names
+         'valid_num_classes': 3,
+         'label_map': ('bully', 'victim', 'outsider')
+         # 'valid_num_classes': 80,
+         # 'label_map': (
+         #     'bend/bow(at the waist)', 'crawl', 'crouch/kneel', 'dance', 'fall down',  # 1-5
+         #     'get up', 'jump/leap', 'lie/sleep', 'martial art', 'run/jog',  # 6-10
+         #     'sit', 'stand', 'swim', 'walk', 'answer phone',  # 11-15
+         #     'brush teeth', 'carry/hold (an object)', 'catch (an object)', 'chop', 'climb (e.g. a mountain)',  # 16-20
+         #     'clink glass', 'close (e.g., a door, a box)', 'cook', 'cut', 'dig',  # 21-25
+         #     'dress/put on clothing', 'drink', 'drive (e.g., a car, a truck)', 'eat', 'enter',  # 26-30
+         #     'exit', 'extract', 'fishing', 'hit (an object)', 'kick (an object)',  # 31-35
+         #     'lift/pick up', 'listen (e.g., to music)', 'open (e.g., a window, a car door)', 'paint', 'play board game',  # 36-40
+         #     'play musical instrument', 'play with pets', 'point to (an object)', 'press', 'pull (an object)',  # 41-45
+         #     'push (an object)', 'put down', 'read', 'ride (e.g., a bike, a car, a horse)', 'row boat',  # 46-50
+         #     'sail boat', 'shoot', 'shovel', 'smoke', 'stir',  # 51-55
+         #     'take a photo', 'text on/look at a cellphone', 'throw', 'touch (an object)', 'turn (e.g., a screwdriver)',  # 56-60
+         #     'watch (e.g., TV)', 'work on a computer', 'write', 'fight/hit (a person)', 'give/serve (an object) to (a person)',  # 61-65
+         #     'grab (a person)', 'hand clap', 'hand shake', 'hand wave', 'hug (a person)',  # 66-70
+         #     'kick (a person)', 'kiss (a person)', 'lift (a person)', 'listen to (a person)', 'play with kids',  # 71-75
+         #     'push (another person)', 'sing to (e.g., self, a person, a group)', 'take (an object) from (a person)',  # 76-78
+         #     'talk to (e.g., self, a person, a group)', 'watch (a person)'  # 79-80
+         # ),
+     }
+ }
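The `multi_hot` flag is the key behavioral switch between the two entries: UCF24 assigns one class per box, while this AVA-style dataset allows several labels per box. A sketch of the two encodings, for illustration only (it mirrors the multi_hot_label construction in dataset/ava.py below):

    import numpy as np

    num_classes = 3

    # one-hot (UCF24 style): a single class index per box
    cls_id = 1
    one_hot = np.eye(num_classes)[cls_id]  # [0., 1., 0.]

    # multi-hot (AVA style): several 1-indexed labels per box
    labels = [1, 3]
    multi_hot = np.zeros(1 + num_classes)
    multi_hot[labels] = 1.0
    multi_hot = multi_hot[1:]  # drop the dummy slot 0 -> [1., 0., 1.]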
config/yowo_v2_config.py ADDED
@@ -0,0 +1,84 @@
+ # Model configuration
+
+
+ yowo_v2_config = {
+     'yowo_v2_nano': {
+         # backbone
+         ## 2D
+         'backbone_2d': 'yolo_free_nano',
+         'pretrained_2d': True,
+         'stride': [8, 16, 32],
+         ## 3D
+         'backbone_3d': 'shufflenetv2',
+         'model_size': '1.0x',
+         'pretrained_3d': True,
+         'memory_momentum': 0.9,
+         # head
+         'head_dim': 64,
+         'head_norm': 'BN',
+         'head_act': 'lrelu',
+         'num_cls_heads': 2,
+         'num_reg_heads': 2,
+         'head_depthwise': True,
+     },
+
+     'yowo_v2_tiny': {
+         # backbone
+         ## 2D
+         'backbone_2d': 'yolo_free_tiny',
+         'pretrained_2d': True,
+         'stride': [8, 16, 32],
+         ## 3D
+         'backbone_3d': 'shufflenetv2',
+         'model_size': '2.0x',
+         'pretrained_3d': True,
+         'memory_momentum': 0.9,
+         # head
+         'head_dim': 64,
+         'head_norm': 'BN',
+         'head_act': 'lrelu',
+         'num_cls_heads': 2,
+         'num_reg_heads': 2,
+         'head_depthwise': False,
+     },
+
+     'yowo_v2_medium': {
+         # backbone
+         ## 2D
+         'backbone_2d': 'yolo_free_large',
+         'pretrained_2d': True,
+         'stride': [8, 16, 32],
+         ## 3D
+         'backbone_3d': 'shufflenetv2',
+         'model_size': '2.0x',
+         'pretrained_3d': True,
+         'memory_momentum': 0.9,
+         # head
+         'head_dim': 128,
+         'head_norm': 'BN',
+         'head_act': 'silu',
+         'num_cls_heads': 2,
+         'num_reg_heads': 2,
+         'head_depthwise': False,
+     },
+
+     'yowo_v2_large': {
+         # backbone
+         ## 2D
+         'backbone_2d': 'yolo_free_large',
+         'pretrained_2d': True,
+         'stride': [8, 16, 32],
+         ## 3D
+         'backbone_3d': 'resnext101',
+         'pretrained_3d': True,
+         'memory_momentum': 0.9,
+         # head
+         'head_dim': 256,
+         'head_norm': 'BN',
+         'head_act': 'silu',
+         'num_cls_heads': 2,
+         'num_reg_heads': 2,
+         'head_depthwise': False,
+     },
+
+ }
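The four variants differ only in backbone choice, head width, and activation; a quick sketch to tabulate them (nothing here beyond reading the dict above):

    from config.yowo_v2_config import yowo_v2_config  # assumes repo root on PYTHONPATH

    for name, cfg in yowo_v2_config.items():
        print('{:16s} 2D={:16s} 3D={:12s} head_dim={}'.format(
            name, cfg['backbone_2d'], cfg['backbone_3d'], cfg['head_dim']))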
dataset/__init__.py ADDED
File without changes
dataset/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (131 Bytes)
dataset/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (125 Bytes)
dataset/__pycache__/ava.cpython-310.pyc ADDED
Binary file (6.81 kB)
dataset/__pycache__/ava.cpython-37.pyc ADDED
Binary file (6.75 kB)
dataset/__pycache__/ava_helper.cpython-310.pyc ADDED
Binary file (5.98 kB)
dataset/__pycache__/ava_helper.cpython-37.pyc ADDED
Binary file (5.92 kB)
dataset/__pycache__/transforms.cpython-310.pyc ADDED
Binary file (5.23 kB)
dataset/__pycache__/transforms.cpython-37.pyc ADDED
Binary file (5.3 kB)
dataset/__pycache__/ucf_jhmdb.cpython-310.pyc ADDED
Binary file (6.49 kB)
dataset/__pycache__/ucf_jhmdb.cpython-37.pyc ADDED
Binary file (6.46 kB)
dataset/ava.py ADDED
@@ -0,0 +1,300 @@
+ #!/usr/bin/python
+ # encoding: utf-8
+
+ import os
+ import numpy as np
+
+ import torch
+ from torch.utils.data import Dataset
+ from PIL import Image
+
+ try:
+     import ava_helper
+ except ImportError:
+     from . import ava_helper
+
+
+ # Dataset for AVA
+ class AVA_Dataset(Dataset):
+     def __init__(self,
+                  cfg,
+                  data_root,
+                  is_train=False,
+                  img_size=224,
+                  transform=None,
+                  len_clip=8,
+                  sampling_rate=1):
+         self.num_classes = 3
+         self.data_root = data_root
+         self.frames_dir = os.path.join(data_root, cfg['frames_dir'])
+         self.frame_list = os.path.join(data_root, cfg['frame_list'])
+         self.annotation_dir = os.path.join(data_root, cfg['annotation_dir'])
+         self.labelmap_file = os.path.join(data_root, cfg['annotation_dir'], cfg['labelmap_file'])
+         if is_train:
+             self.gt_box_list = os.path.join(self.annotation_dir, cfg['train_gt_box_list'])
+             self.exclusion_file = os.path.join(self.annotation_dir, cfg['train_exclusion_file'])
+         else:
+             self.gt_box_list = os.path.join(self.annotation_dir, cfg['val_gt_box_list'])
+             self.exclusion_file = os.path.join(self.annotation_dir, cfg['val_exclusion_file'])
+
+         self.transform = transform
+         self.is_train = is_train
+
+         self.img_size = img_size
+         self.len_clip = len_clip
+         self.sampling_rate = sampling_rate
+         self.seq_len = self.len_clip * self.sampling_rate
+
+         # load ava data
+         self._load_data()
+
+     def _load_data(self):
+         # Loading frame paths.
+         (
+             self._image_paths,
+             self._video_idx_to_name,
+         ) = ava_helper.load_image_lists(
+             self.frames_dir,
+             self.frame_list,
+             self.is_train
+         )
+
+         # Loading annotations for boxes and labels.
+         # boxes_and_labels: {'<video_name>': {<frame_num>: a list of [box_i, box_i_labels]} }
+         boxes_and_labels = ava_helper.load_boxes_and_labels(
+             self.gt_box_list,
+             self.exclusion_file,
+             self.is_train,
+             full_test_on_val=False
+         )
+
+         assert len(boxes_and_labels) == len(self._image_paths)
+
+         # boxes_and_labels: a list of {<frame_num>: a list of [box_i, box_i_labels]}
+         boxes_and_labels = [
+             boxes_and_labels[self._video_idx_to_name[i]]
+             for i in range(len(self._image_paths))
+         ]
+
+         # Get indices of keyframes and corresponding boxes and labels.
+         # _keyframe_indices: [video_idx, sec_idx, sec, frame_index]
+         # _keyframe_boxes_and_labels: list[list[list]], outer is video_idx, middle is sec_idx,
+         # inner is a list of [box_i, box_i_labels]
+         (
+             self._keyframe_indices,
+             self._keyframe_boxes_and_labels,
+         ) = ava_helper.get_keyframe_data(boxes_and_labels)
+
+         # Calculate the number of used boxes.
+         self._num_boxes_used = ava_helper.get_num_boxes_used(
+             self._keyframe_indices, self._keyframe_boxes_and_labels
+         )
+
+         self._max_objs = ava_helper.get_max_objs(
+             self._keyframe_indices, self._keyframe_boxes_and_labels
+         )
+
+         print("=== AVA dataset summary ===")
+         print("Train: {}".format(self.is_train))
+         print("Number of videos: {}".format(len(self._image_paths)))
+         total_frames = sum(
+             len(video_img_paths) for video_img_paths in self._image_paths
+         )
+         print("Number of frames: {}".format(total_frames))
+         print("Number of key frames: {}".format(len(self)))
+         print("Number of boxes: {}.".format(self._num_boxes_used))
+
+     def __len__(self):
+         return len(self._keyframe_indices)
+
+     def get_sequence(self, center_idx, half_len, sample_rate, num_frames):
+         """
+         Sample frames among the corresponding clip.
+
+         Args:
+             center_idx (int): center frame idx for current clip
+             half_len (int): half of the clip length
+             sample_rate (int): sampling rate for sampling frames inside of the clip
+             num_frames (int): number of expected sampled frames
+
+         Returns:
+             seq (list): list of indexes of sampled frames in this clip.
+         """
+         # seq = list(range(center_idx - half_len, center_idx + half_len, sample_rate))
+         seq = list(range(center_idx - half_len*2 + 1*sample_rate, center_idx + 1*sample_rate, sample_rate))
+
+         # clamp indices to the valid frame range
+         for seq_idx in range(len(seq)):
+             if seq[seq_idx] < 0:
+                 seq[seq_idx] = 0
+             elif seq[seq_idx] >= num_frames:
+                 seq[seq_idx] = num_frames - 1
+         return seq
+
+     def get_frame_idx(self, latest_idx, sample_length, sample_rate, num_frames):
+         """
+         Sample frames among the corresponding clip, but treat the keyframe as the
+         latest frame instead of the center frame.
+         """
+         # seq = list(range(latest_idx - sample_length + 1, latest_idx + 1, sample_rate))
+         seq = list(range(latest_idx, latest_idx - sample_length, -sample_rate))
+         seq.reverse()
+         for seq_idx in range(len(seq)):
+             if seq[seq_idx] < 0:
+                 seq[seq_idx] = 0
+             elif seq[seq_idx] >= num_frames:
+                 seq[seq_idx] = num_frames - 1
+
+         return seq
+
+     def __getitem__(self, idx):
+         # load a data
+         frame_idx, video_clip, target = self.pull_item(idx)
+
+         return frame_idx, video_clip, target
+
+     def pull_item(self, idx):
+         # Get the frame idxs for current clip. We can use it as center or latest
+         video_idx, sec_idx, sec, frame_idx = self._keyframe_indices[idx]
+         clip_label_list = self._keyframe_boxes_and_labels[video_idx][sec_idx]
+
+         # check label list
+         assert len(clip_label_list) > 0
+         assert len(clip_label_list) <= self._max_objs
+
+         # get a sequence
+         seq = self.get_sequence(
+             frame_idx,
+             self.seq_len // 2,
+             self.sampling_rate,
+             num_frames=len(self._image_paths[video_idx]),
+         )
+         image_paths = [self._image_paths[video_idx][frame - 1] for frame in seq]
+         # print('video_idx: ', video_idx, 'frame_idx', frame_idx)
+         keyframe_info = self._image_paths[video_idx][frame_idx - 1]
+         # print(keyframe_info)
+
+         # load a video clip
+         video_clip = []
+         for img_path in image_paths:
+             frame = Image.open(img_path).convert('RGB')
+             video_clip.append(frame)
+             ow, oh = frame.width, frame.height
+
+         # Get boxes and labels for current clip.
+         boxes = []
+         labels = []
+         for box_labels in clip_label_list:
+             bbox = box_labels[0]
+             label = box_labels[1]
+             multi_hot_label = np.zeros(1 + self.num_classes)
+             multi_hot_label[..., label] = 1.0
+
+             boxes.append(bbox)
+             labels.append(multi_hot_label[..., 1:].tolist())
+
+         boxes = np.array(boxes).reshape(-1, 4)
+         # denormalize bbox from [0, 1] to pixel coordinates
+         boxes[..., [0, 2]] *= ow
+         boxes[..., [1, 3]] *= oh
+         labels = np.array(labels).reshape(-1, self.num_classes)
+
+         # target: [N, 4 + C]
+         target = np.concatenate([boxes, labels], axis=-1)
+
+         # transform
+         video_clip, target = self.transform(video_clip, target)
+         # List [T, 3, H, W] -> [3, T, H, W]
+         video_clip = torch.stack(video_clip, dim=1)
+
+         # reformat target
+         target = {
+             'boxes': target[:, :4].float(),   # [N, 4]
+             'labels': target[:, 4:].long(),   # [N, C]
+             'orig_size': [ow, oh],
+             'video_idx': video_idx,
+             'sec': sec,
+         }
+
+         return [video_idx, sec], video_clip, target
+
+
+ if __name__ == '__main__':
+     import cv2
+     from transforms import Augmentation, BaseTransform
+
+     is_train = False
+     img_size = 224
+     len_clip = 16
+     sampling_rate = 1
+     dataset_config = {
+         'data_root': 'C:/Users/Administrator/Downloads/YOWOv2/data/clip32s_2fps_new',
+         'frames_dir': 'frames/',
+         'frame_list': 'frame_lists/',
+         'annotation_dir': 'annotations/',
+         'train_gt_box_list': 'train.csv',
+         'val_gt_box_list': 'val.csv',
+         'train_exclusion_file': 'ava_train_excluded_timestamps_v2.2.csv',
+         'val_exclusion_file': 'ava_val_excluded_timestamps_v2.2.csv',
+         'labelmap_file': 'ava_action_list_v2.1_for_activitynet_2018.pbtxt',
+     }
+     trans_config = {
+         'jitter': 0.2,
+         'hue': 0.1,
+         'saturation': 1.5,
+         'exposure': 1.5
+     }
+     train_transform = Augmentation(
+         img_size=img_size,
+         jitter=trans_config['jitter'],
+         saturation=trans_config['saturation'],
+         exposure=trans_config['exposure']
+     )
+     val_transform = BaseTransform(img_size=img_size)
+
+     train_dataset = AVA_Dataset(
+         cfg=dataset_config,
+         data_root=dataset_config['data_root'],
+         is_train=is_train,
+         img_size=img_size,
+         transform=train_transform,
+         len_clip=len_clip,
+         sampling_rate=sampling_rate
+     )
+
+     print(len(train_dataset))
+     for i in range(len(train_dataset)):
+         frame_id, video_clip, target = train_dataset[i]
+         key_frame = video_clip[:, -1, :, :]
+
+         # to numpy
+         key_frame = key_frame.permute(1, 2, 0).numpy()
+         key_frame = key_frame.astype(np.uint8)
+
+         # to BGR
+         key_frame = key_frame[..., (2, 1, 0)]
+         H, W, C = key_frame.shape
+
+         key_frame = key_frame.copy()
+         bboxes = target['boxes']
+         labels = target['labels']
+
+         for box, cls_id in zip(bboxes, labels):
+             x1, y1, x2, y2 = box
+             x1 = int(x1 * W)
+             y1 = int(y1 * H)
+             x2 = int(x2 * W)
+             y2 = int(y2 * H)
+             key_frame = cv2.rectangle(key_frame, (x1, y1), (x2, y2), (255, 0, 0))
+
+         # cv2 show
+         cv2.imshow('key frame', key_frame)
+         cv2.waitKey(0)
+
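Because __getitem__ returns a dict target with a variable number of boxes, PyTorch's default collate cannot batch this dataset. A hedged sketch of a compatible loader, assuming the train_dataset instance from the __main__ block above (the collate function is an assumption; the repo's own collate, if any, is not part of this diff):

    import torch
    from torch.utils.data import DataLoader

    def collate_fn(batch):
        # keep per-sample targets as a list; stack clips into [B, 3, T, H, W]
        frame_ids = [b[0] for b in batch]
        clips = torch.stack([b[1] for b in batch], dim=0)
        targets = [b[2] for b in batch]
        return frame_ids, clips, targets

    loader = DataLoader(train_dataset, batch_size=8, shuffle=True,
                        collate_fn=collate_fn, num_workers=4)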
dataset/ava_helper.py ADDED
@@ -0,0 +1,231 @@
+ #!/usr/bin/env python3
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+ import logging
+ import os
+ import csv
+ from collections import defaultdict
+
+
+ logger = logging.getLogger(__name__)
+ FPS = 15
+ AVA_VALID_FRAMES = range(0, 64)
+ # AVA_VALID_FRAMES = range(902, 1799)
+
+
+ def make_image_key(video_id, timestamp):
+     """Returns a unique identifier for a video id & timestamp."""
+     return "%s,%04d" % (video_id, int(timestamp))
+
+
+ def read_exclusions(exclusions_file):
+     """Reads a CSV file of excluded timestamps.
+     Args:
+         exclusions_file: path to a csv of video-id,timestamp rows.
+     Returns:
+         A set of strings containing excluded image keys, e.g. "aaaaaaaaaaa,0904",
+         or an empty set if exclusions_file is None.
+     """
+     excluded = set()
+     if exclusions_file:
+         with open(exclusions_file, "r") as f:
+             reader = csv.reader(f)
+             for row in reader:
+                 assert len(row) == 2, "Expected only 2 columns, got: " + str(row)
+                 excluded.add(make_image_key(row[0], row[1]))
+     return excluded
+
+
+ def load_image_lists(frames_dir, frame_list, is_train):
+     """
+     Loading image paths from corresponding files.
+
+     Args:
+         frames_dir (str): path to frames dir.
+         frame_list (str): path to frame list.
+         is_train (bool): if it is training dataset or not.
+
+     Returns:
+         image_paths (list[list]): a list of items. Each item (also a list)
+             corresponds to one video and contains the paths of images for
+             this video.
+         video_idx_to_name (list): a list which stores video names.
+     """
+     # the frame_list dir contains 'train.csv' and 'val.csv'
+     if is_train:
+         list_name = "train.csv"
+     else:
+         list_name = "val.csv"
+
+     list_filename = os.path.join(frame_list, list_name)
+
+     image_paths = defaultdict(list)
+     video_name_to_idx = {}
+     video_idx_to_name = []
+     with open(list_filename, "r") as f:
+         f.readline()  # skip the header row
+         for line in f:
+             row = line.split()
+             row = row[0].split(',')
+             # The format of each row should follow:
+             # original_vido_id video_id frame_id path labels.
+             assert len(row) == 5
+             video_name = row[0]
+
+             if video_name not in video_name_to_idx:
+                 idx = len(video_name_to_idx)
+                 video_name_to_idx[video_name] = idx
+                 video_idx_to_name.append(video_name)
+
+             data_key = video_name_to_idx[video_name]
+
+             image_paths[data_key].append(os.path.join(frames_dir, row[3]))
+
+     image_paths = [image_paths[i] for i in range(len(image_paths))]
+
+     print("Finished loading image paths from: {}".format(list_filename))
+
+     return image_paths, video_idx_to_name
+
+
+ def load_boxes_and_labels(gt_box_list, exclusion_file, is_train=False, full_test_on_val=False):
+     """
+     Loading boxes and labels from csv files.
+
+     Args:
+         gt_box_list (str): path to the ground-truth box csv.
+         exclusion_file (str): path to the excluded-timestamps csv.
+         is_train (bool): whether this is the training split.
+         full_test_on_val (bool): evaluate on every validation keyframe
+             instead of every fourth one.
+     Returns:
+         all_boxes (dict): a dict which maps from `video_name` and
+             `frame_sec` to a list of `box`. Each `box` is a
+             [`box_coord`, `box_labels`] where `box_coord` is the
+             coordinates of box and `box_labels` are the corresponding
+             labels for the box.
+     """
+     ann_filename = gt_box_list
+     all_boxes = {}
+     count = 0
+     unique_box_count = 0
+     excluded_keys = read_exclusions(exclusion_file)
+
+     with open(ann_filename, 'r') as f:
+         for line in f:
+             row = line.strip().split(',')
+
+             video_name, frame_sec = row[0], int(row[1])
+             key = "%s,%04d" % (video_name, frame_sec)
+             # if mode == 'train' and key in excluded_keys:
+             if key in excluded_keys:
+                 print("Found {} to be excluded...".format(key))
+                 continue
+
+             # Only select frame_sec % 4 = 0 samples for validation if not
+             # set FULL_TEST_ON_VAL (default False)
+             if not is_train and not full_test_on_val and frame_sec % 4 != 0:
+                 continue
+             # Box with [x1, y1, x2, y2] with a range of [0, 1] as float
+             box_key = ",".join(row[2:6])
+             box = list(map(float, row[2:6]))
+             label = -1 if row[6] == "" else int(row[6])
+             if video_name not in all_boxes:
+                 all_boxes[video_name] = {}
+                 for sec in AVA_VALID_FRAMES:
+                     all_boxes[video_name][sec] = {}
+             if box_key not in all_boxes[video_name][frame_sec]:
+                 all_boxes[video_name][frame_sec][box_key] = [box, []]
+                 unique_box_count += 1
+
+             all_boxes[video_name][frame_sec][box_key][1].append(label)
+             if label != -1:
+                 count += 1
+
+     for video_name in all_boxes.keys():
+         for frame_sec in all_boxes[video_name].keys():
+             # Save in format of a list of [box_i, box_i_labels].
+             all_boxes[video_name][frame_sec] = list(
+                 all_boxes[video_name][frame_sec].values()
+             )
+
+     print("Finished loading annotations from: %s" % ", ".join([ann_filename]))
+     print("Number of unique boxes: %d" % unique_box_count)
+     print("Number of annotations: %d" % count)
+
+     return all_boxes
+
+
+ def get_keyframe_data(boxes_and_labels):
+     """
+     Getting keyframe indices, boxes and labels in the dataset.
+
+     Args:
+         boxes_and_labels (list[dict]): a list which maps from video_idx to a dict.
+             Each dict maps `frame_sec` to a list of boxes and corresponding labels.
+
+     Returns:
+         keyframe_indices (list): a list of indices of the keyframes.
+         keyframe_boxes_and_labels (list[list[list]]): a list of list which maps from
+             video_idx and sec_idx to a list of boxes and corresponding labels.
+     """
+
+     def sec_to_frame(sec):
+         """
+         Convert time index (in seconds) to frame index.
+         For the original AVA timing, sec 900 maps to frame 0 and sec 901 to
+         frame 30 (see the commented line); here each second maps to frame sec + 1.
+         """
+         # return (sec - 900) * FPS
+         return sec + 1
+
+     keyframe_indices = []
+     keyframe_boxes_and_labels = []
+     count = 0
+     for video_idx in range(len(boxes_and_labels)):
+         sec_idx = 0
+         keyframe_boxes_and_labels.append([])
+         for sec in boxes_and_labels[video_idx].keys():
+             if sec not in AVA_VALID_FRAMES:
+                 continue
+
+             if len(boxes_and_labels[video_idx][sec]) > 0:
+                 keyframe_indices.append(
+                     (video_idx, sec_idx, sec, sec_to_frame(sec))
+                 )
+                 keyframe_boxes_and_labels[video_idx].append(
+                     boxes_and_labels[video_idx][sec]
+                 )
+                 sec_idx += 1
+                 count += 1
+     logger.info("%d keyframes used." % count)
+
+     return keyframe_indices, keyframe_boxes_and_labels
+
+
+ def get_num_boxes_used(keyframe_indices, keyframe_boxes_and_labels):
+     """
+     Get total number of used boxes.
+
+     Args:
+         keyframe_indices (list): a list of indices of the keyframes.
+         keyframe_boxes_and_labels (list[list[list]]): a list of list which maps from
+             video_idx and sec_idx to a list of boxes and corresponding labels.
+
+     Returns:
+         count (int): total number of used boxes.
+     """
+
+     count = 0
+     for video_idx, sec_idx, _, _ in keyframe_indices:
+         count += len(keyframe_boxes_and_labels[video_idx][sec_idx])
+     return count
+
+
+ def get_max_objs(keyframe_indices, keyframe_boxes_and_labels):
+     # max_objs = 0
+     # for video_idx, sec_idx, _, _ in keyframe_indices:
+     #     num_boxes = len(keyframe_boxes_and_labels[video_idx][sec_idx])
+     #     if num_boxes > max_objs:
+     #         max_objs = num_boxes
+
+     # return max_objs
+     return 25  #### MODIFICATION FOR NOW! TODO: FIX LATER!
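A small self-contained check of the key/exclusion plumbing above, assuming `from dataset.ava_helper import make_image_key, read_exclusions`; the temp file is purely illustrative:

    import os
    import tempfile

    # write a two-row exclusion csv and read it back
    tmp = tempfile.NamedTemporaryFile('w', suffix='.csv', delete=False)
    tmp.write('vid_a,3\nvid_b,17\n')
    tmp.close()

    excluded = read_exclusions(tmp.name)
    assert make_image_key('vid_a', 3) in excluded   # keys are zero-padded: 'vid_a,0003'
    assert make_image_key('vid_b', 17) in excluded  # 'vid_b,0017'
    os.unlink(tmp.name)

Note also that get_max_objs currently ignores its arguments and hard-codes 25, as its own TODO comment admits.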
dataset/transforms.py ADDED
@@ -0,0 +1,176 @@
+ import random
+ import numpy as np
+ import torch
+ import torchvision.transforms.functional as F
+ from PIL import Image
+
+
+ # Augmentation for Training
+ class Augmentation(object):
+     def __init__(self, img_size=224, jitter=0.2, hue=0.1, saturation=1.5, exposure=1.5):
+         self.img_size = img_size
+         self.jitter = jitter
+         self.hue = hue
+         self.saturation = saturation
+         self.exposure = exposure
+
+     def rand_scale(self, s):
+         scale = random.uniform(1, s)
+
+         if random.randint(0, 1):
+             return scale
+
+         return 1. / scale
+
+     def random_distort_image(self, video_clip):
+         dhue = random.uniform(-self.hue, self.hue)
+         dsat = self.rand_scale(self.saturation)
+         dexp = self.rand_scale(self.exposure)
+
+         video_clip_ = []
+         for image in video_clip:
+             image = image.convert('HSV')
+             cs = list(image.split())
+             cs[1] = cs[1].point(lambda i: i * dsat)
+             cs[2] = cs[2].point(lambda i: i * dexp)
+
+             def change_hue(x):
+                 x += dhue * 255
+                 if x > 255:
+                     x -= 255
+                 if x < 0:
+                     x += 255
+                 return x
+
+             cs[0] = cs[0].point(change_hue)
+             image = Image.merge(image.mode, tuple(cs))
+
+             image = image.convert('RGB')
+
+             video_clip_.append(image)
+
+         return video_clip_
+
+     def random_crop(self, video_clip, width, height):
+         dw = int(width * self.jitter)
+         dh = int(height * self.jitter)
+
+         pleft = random.randint(-dw, dw)
+         pright = random.randint(-dw, dw)
+         ptop = random.randint(-dh, dh)
+         pbot = random.randint(-dh, dh)
+
+         swidth = width - pleft - pright
+         sheight = height - ptop - pbot
+
+         sx = float(swidth) / width
+         sy = float(sheight) / height
+
+         dx = (float(pleft) / width) / sx
+         dy = (float(ptop) / height) / sy
+
+         # random crop
+         cropped_clip = [img.crop((pleft, ptop, pleft + swidth - 1, ptop + sheight - 1)) for img in video_clip]
+
+         return cropped_clip, dx, dy, sx, sy
+
+     def apply_bbox(self, target, ow, oh, dx, dy, sx, sy):
+         sx, sy = 1. / sx, 1. / sy
+         # apply deltas on bbox
+         target[..., 0] = np.minimum(0.999, np.maximum(0, target[..., 0] / ow * sx - dx))
+         target[..., 1] = np.minimum(0.999, np.maximum(0, target[..., 1] / oh * sy - dy))
+         target[..., 2] = np.minimum(0.999, np.maximum(0, target[..., 2] / ow * sx - dx))
+         target[..., 3] = np.minimum(0.999, np.maximum(0, target[..., 3] / oh * sy - dy))
+
+         # refine target: drop boxes that became degenerate after cropping
+         refine_target = []
+         for i in range(target.shape[0]):
+             tgt = target[i]
+             bw = (tgt[2] - tgt[0]) * ow
+             bh = (tgt[3] - tgt[1]) * oh
+
+             if bw < 1. or bh < 1.:
+                 continue
+
+             refine_target.append(tgt)
+
+         refine_target = np.array(refine_target).reshape(-1, target.shape[-1])
+
+         return refine_target
+
+     def to_tensor(self, video_clip):
+         return [F.to_tensor(image) * 255. for image in video_clip]
+
+     def __call__(self, video_clip, target):
+         # Initialize Random Variables
+         oh = video_clip[0].height
+         ow = video_clip[0].width
+
+         # random crop
+         video_clip, dx, dy, sx, sy = self.random_crop(video_clip, ow, oh)
+
+         # resize
+         video_clip = [img.resize([self.img_size, self.img_size]) for img in video_clip]
+
+         # random flip
+         flip = random.randint(0, 1)
+         if flip:
+             video_clip = [img.transpose(Image.FLIP_LEFT_RIGHT) for img in video_clip]
+
+         # distort
+         video_clip = self.random_distort_image(video_clip)
+
+         # process target
+         if target is not None:
+             target = self.apply_bbox(target, ow, oh, dx, dy, sx, sy)
+             if flip:
+                 target[..., [0, 2]] = 1.0 - target[..., [2, 0]]
+         else:
+             target = np.array([])
+
+         # to tensor
+         video_clip = self.to_tensor(video_clip)
+         target = torch.as_tensor(target).float()
+
+         return video_clip, target
+
+
+ # Transform for Testing
+ class BaseTransform(object):
+     def __init__(self, img_size=224):
+         self.img_size = img_size
+
+     def to_tensor(self, video_clip):
+         return [F.to_tensor(image) * 255. for image in video_clip]
+
+     def __call__(self, video_clip, target=None, normalize=True):
+         oh = video_clip[0].height
+         ow = video_clip[0].width
+
+         # resize
+         video_clip = [img.resize([self.img_size, self.img_size]) for img in video_clip]
+
+         # normalize target (restored from the commented-out original: callers in
+         # dataset/ucf_jhmdb.py unpack two return values, so returning only the
+         # clip would crash them)
+         if target is not None:
+             if normalize:
+                 target[..., [0, 2]] /= ow
+                 target[..., [1, 3]] /= oh
+         else:
+             target = np.array([])
+
+         # to tensor
+         video_clip = self.to_tensor(video_clip)
+         target = torch.as_tensor(target).float()
+
+         return video_clip, target
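A quick smoke test of the training augmentation on a synthetic clip; every value here is made up for illustration, and the import path assumes the repo root is on PYTHONPATH:

    import numpy as np
    from PIL import Image
    from dataset.transforms import Augmentation

    # 16 dummy frames plus one box in absolute pixel coordinates [x1, y1, x2, y2, label]
    clip = [Image.new('RGB', (320, 240), color=(128, 128, 128)) for _ in range(16)]
    target = np.array([[40., 30., 200., 180., 1.]])

    aug = Augmentation(img_size=224, jitter=0.2, hue=0.1, saturation=1.5, exposure=1.5)
    clip_t, target_t = aug(clip, target)
    print(len(clip_t), clip_t[0].shape)  # 16 torch.Size([3, 224, 224])
    print(target_t)  # normalized box + label; may be empty if the crop dropped the box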
dataset/ucf24_demo/v_Basketball_g01_c02.mp4 ADDED
Binary file (514 kB)
dataset/ucf24_demo/v_Basketball_g07_c04.mp4 ADDED
Binary file (829 kB)
dataset/ucf24_demo/v_Biking_g01_c01.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:069914984bf53b5dbf4b24fbf7d79288f3697a35e494635b9ab48e3c800aea59
+ size 1703798
dataset/ucf24_demo/v_CliffDiving_g03_c01.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d8975e99f1199731d8d55eacb6e0d633275618e6b2225c26e64e1e42396beb47
+ size 1024051
dataset/ucf24_demo/v_Fencing_g01_c06.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:529c6738bc866a1ef2c3a14fb8e4538c91d3344298bee2d0714dca859099cc5d
+ size 1403751
dataset/ucf24_demo/v_HorseRiding_g01_c03.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e33ea8d44b15086ceb7f905f9c059d97d8518dedb49eff4718f64c746d463c1e
+ size 1527353
dataset/ucf24_demo/v_IceDancing_g02_c05.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5bb209ed584117352e8f6d9d0cd5587aaae0f8e23b323f11ff18dbf6fe179388
+ size 1503889
dataset/ucf24_demo/v_SalsaSpin_g03_c01.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:776bc11f53b18f8ec4a9b5bbbf9c01c7d95c7afce5dda13aec01dd1c9749a8e0
+ size 1477281
dataset/ucf24_demo/v_SkateBoarding_g02_c01.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7c428046991043c8fb4736d27356a3b2ea59a512d17fc821ed5333a7058ecfc7
+ size 3293243
dataset/ucf_jhmdb.py ADDED
@@ -0,0 +1,311 @@
+ #!/usr/bin/python
+ # encoding: utf-8
+
+ import os
+ import random
+ import numpy as np
+ import glob
+
+ import torch
+ from torch.utils.data import Dataset
+ from PIL import Image
+
+
+ # Dataset for UCF24 & JHMDB
+ class UCF_JHMDB_Dataset(Dataset):
+     def __init__(self,
+                  data_root,
+                  dataset='ucf24',
+                  img_size=224,
+                  transform=None,
+                  is_train=False,
+                  len_clip=16,
+                  sampling_rate=1):
+         self.data_root = data_root
+         self.dataset = dataset
+         self.transform = transform
+         self.is_train = is_train
+
+         self.img_size = img_size
+         self.len_clip = len_clip
+         self.sampling_rate = sampling_rate
+
+         if self.is_train:
+             self.split_list = 'trainlist.txt'
+         else:
+             self.split_list = 'testlist.txt'
+
+         # load data
+         with open(os.path.join(data_root, self.split_list), 'r') as file:
+             self.file_names = file.readlines()
+         self.num_samples = len(self.file_names)
+
+         if dataset == 'ucf24':
+             self.num_classes = 24
+         elif dataset == 'jhmdb21':
+             self.num_classes = 21
+
+     def __len__(self):
+         return self.num_samples
+
+     def __getitem__(self, index):
+         # load a data
+         frame_idx, video_clip, target = self.pull_item(index)
+
+         return frame_idx, video_clip, target
+
+     def pull_item(self, index):
+         """ load a data """
+         assert index < len(self), 'index range error'
+         image_path = self.file_names[index].rstrip()
+
+         img_split = image_path.split('/')  # ex. ['labels', 'Basketball', 'v_Basketball_g08_c01', '00070.txt']
+         # image name
+         img_id = int(img_split[-1][:5])
+
+         # path to label
+         label_path = os.path.join(self.data_root, img_split[0], img_split[1], img_split[2], '{:05d}.txt'.format(img_id))
+
+         # image folder
+         img_folder = os.path.join(self.data_root, 'rgb-images', img_split[1], img_split[2])
+
+         # frame numbers
+         if self.dataset == 'ucf24':
+             max_num = len(os.listdir(img_folder))
+         elif self.dataset == 'jhmdb21':
+             max_num = len(os.listdir(img_folder)) - 1
+
+         # sampling rate
+         if self.is_train:
+             d = random.randint(1, 2)
+         else:
+             d = self.sampling_rate
+
+         # load images
+         video_clip = []
+         for i in reversed(range(self.len_clip)):
+             # clamp frame ids at the clip boundaries
+             img_id_temp = img_id - i * d
+             if img_id_temp < 1:
+                 img_id_temp = 1
+             elif img_id_temp > max_num:
+                 img_id_temp = max_num
+
+             # load a frame
+             if self.dataset == 'ucf24':
+                 path_tmp = os.path.join(self.data_root, 'rgb-images', img_split[1], img_split[2], '{:05d}.jpg'.format(img_id_temp))
+             elif self.dataset == 'jhmdb21':
+                 path_tmp = os.path.join(self.data_root, 'rgb-images', img_split[1], img_split[2], '{:05d}.png'.format(img_id_temp))
+             frame = Image.open(path_tmp).convert('RGB')
+             ow, oh = frame.width, frame.height
+
+             video_clip.append(frame)
+
+         frame_id = img_split[1] + '_' + img_split[2] + '_' + img_split[3]
+
+         # load an annotation
+         if os.path.getsize(label_path):
+             target = np.loadtxt(label_path)
+         else:
+             target = np.zeros((0, 5))  # empty label file: no boxes
+
+         # [label, x1, y1, x2, y2] -> [x1, y1, x2, y2, label]
+         label = target[..., :1]
+         boxes = target[..., 1:]
+         target = np.concatenate([boxes, label], axis=-1).reshape(-1, 5)
+
+         # transform
+         video_clip, target = self.transform(video_clip, target)
+         # List [T, 3, H, W] -> [3, T, H, W]
+         video_clip = torch.stack(video_clip, dim=1)
+
+         # reformat target
+         target = {
+             'boxes': target[:, :4].float(),       # [N, 4]
+             'labels': target[:, -1].long() - 1,   # [N,]
+             'orig_size': [ow, oh],
+             'video_idx': frame_id[:-10]
+         }
+
+         return frame_id, video_clip, target
+
+     def pull_anno(self, index):
+         """ load a data """
+         assert index < len(self), 'index range error'
+         image_path = self.file_names[index].rstrip()
+
+         img_split = image_path.split('/')  # ex. ['labels', 'Basketball', 'v_Basketball_g08_c01', '00070.txt']
+         # image name
+         img_id = int(img_split[-1][:5])
+
+         # path to label
+         label_path = os.path.join(self.data_root, img_split[0], img_split[1], img_split[2], '{:05d}.txt'.format(img_id))
+
+         # load an annotation
+         target = np.loadtxt(label_path)
+         target = target.reshape(-1, 5)
+
+         return target
+
+
+ # Video Dataset for UCF24 & JHMDB
+ class UCF_JHMDB_VIDEO_Dataset(Dataset):
+     def __init__(self,
+                  data_root,
+                  dataset='ucf24',
+                  img_size=224,
+                  transform=None,
+                  len_clip=16,
+                  sampling_rate=1):
+         self.data_root = data_root
+         self.dataset = dataset
+         self.transform = transform
+
+         self.img_size = img_size
+         self.len_clip = len_clip
+         self.sampling_rate = sampling_rate
+
+         if dataset == 'ucf24':
+             self.num_classes = 24
+         elif dataset == 'jhmdb21':
+             self.num_classes = 21
+
+     def set_video_data(self, line):
+         self.line = line
+
+         # load a video
+         self.img_folder = os.path.join(self.data_root, 'rgb-images', self.line)
+
+         if self.dataset == 'ucf24':
+             self.label_paths = sorted(glob.glob(os.path.join(self.img_folder, '*.jpg')))
+         elif self.dataset == 'jhmdb21':
+             self.label_paths = sorted(glob.glob(os.path.join(self.img_folder, '*.png')))
+
+     def __len__(self):
+         return len(self.label_paths)
+
+     def __getitem__(self, index):
+         return self.pull_item(index)
+
+     def pull_item(self, index):
+         image_path = self.label_paths[index]
+
+         video_split = self.line.split('/')
+         video_class = video_split[0]
+         video_file = video_split[1]
+         # for windows:
+         # img_split = image_path.split('\\')  # ex. [..., 'Basketball', 'v_Basketball_g08_c01', '00070.txt']
+         # for linux
+         img_split = image_path.split('/')  # ex. [..., 'Basketball', 'v_Basketball_g08_c01', '00070.txt']
+
+         # image name
+         img_id = int(img_split[-1][:5])
+         max_num = len(os.listdir(self.img_folder))
+         if self.dataset == 'ucf24':
+             img_name = os.path.join(video_class, video_file, '{:05d}.jpg'.format(img_id))
+         elif self.dataset == 'jhmdb21':
+             img_name = os.path.join(video_class, video_file, '{:05d}.png'.format(img_id))
+
+         # load video clip
+         video_clip = []
+         for i in reversed(range(self.len_clip)):
+             # clamp frame ids at the clip boundaries
+             img_id_temp = img_id - i
+             if img_id_temp < 1:
+                 img_id_temp = 1
+             elif img_id_temp > max_num:
+                 img_id_temp = max_num
+
+             # load a frame
+             if self.dataset == 'ucf24':
+                 path_tmp = os.path.join(self.data_root, 'rgb-images', video_class, video_file, '{:05d}.jpg'.format(img_id_temp))
+             elif self.dataset == 'jhmdb21':
+                 path_tmp = os.path.join(self.data_root, 'rgb-images', video_class, video_file, '{:05d}.png'.format(img_id_temp))
+             frame = Image.open(path_tmp).convert('RGB')
+             ow, oh = frame.width, frame.height
+
+             video_clip.append(frame)
+
+         # transform
+         video_clip, _ = self.transform(video_clip, normalize=False)
+         # List [T, 3, H, W] -> [3, T, H, W]
+         video_clip = torch.stack(video_clip, dim=1)
+         orig_size = [ow, oh]  # width, height
+
+         target = {'orig_size': [ow, oh]}
+
+         return img_name, video_clip, target
+
+
+ if __name__ == '__main__':
+     import cv2
+     from transforms import Augmentation, BaseTransform
+
+     data_root = 'D:/python_work/spatial-temporal_action_detection/dataset/ucf24'
+     dataset = 'ucf24'
+     is_train = True
+     img_size = 224
+     len_clip = 16
+     trans_config = {
+         'jitter': 0.2,
+         'hue': 0.1,
+         'saturation': 1.5,
+         'exposure': 1.5
+     }
+     train_transform = Augmentation(
+         img_size=img_size,
+         jitter=trans_config['jitter'],
+         saturation=trans_config['saturation'],
+         exposure=trans_config['exposure']
+     )
+     val_transform = BaseTransform(img_size=img_size)
+
+     train_dataset = UCF_JHMDB_Dataset(
+         data_root=data_root,
+         dataset=dataset,
+         img_size=img_size,
+         transform=train_transform,
+         is_train=is_train,
+         len_clip=len_clip,
+         sampling_rate=1
+     )
+
+     print(len(train_dataset))
+     for i in range(len(train_dataset)):
+         frame_id, video_clip, target = train_dataset[i]
+         key_frame = video_clip[:, -1, :, :]
+
+         # to numpy
+         key_frame = key_frame.permute(1, 2, 0).numpy()
+         key_frame = key_frame.astype(np.uint8)
+
+         # to BGR
+         key_frame = key_frame[..., (2, 1, 0)]
+         H, W, C = key_frame.shape
+
+         key_frame = key_frame.copy()
+         bboxes = target['boxes']
+         labels = target['labels']
+
+         for box, cls_id in zip(bboxes, labels):
+             x1, y1, x2, y2 = box
+             x1 = int(x1 * W)
+             y1 = int(y1 * H)
+             x2 = int(x2 * W)
+             y2 = int(y2 * H)
+             key_frame = cv2.rectangle(key_frame, (x1, y1), (x2, y2), (255, 0, 0))
+
+         # cv2 show
+         cv2.imshow('key frame', key_frame)
+         cv2.waitKey(0)
+
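A hedged sketch of how the video dataset is meant to be driven for whole-video inference; the data_root and video line are placeholders, and imports assume the repo root is on PYTHONPATH:

    from dataset.transforms import BaseTransform
    from dataset.ucf_jhmdb import UCF_JHMDB_VIDEO_Dataset

    video_dataset = UCF_JHMDB_VIDEO_Dataset(
        data_root='path/to/ucf24',  # placeholder
        dataset='ucf24',
        img_size=224,
        transform=BaseTransform(img_size=224),
        len_clip=16,
    )
    # point the dataset at one video before iterating
    video_dataset.set_video_data('Basketball/v_Basketball_g01_c02')
    for img_name, clip, target in video_dataset:
        print(img_name, clip.shape)  # e.g. Basketball/.../00001.jpg torch.Size([3, 16, 224, 224])

Note that set_video_data must be called before len() or indexing, since label_paths does not exist until then.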
evaluator/__init__.py ADDED
File without changes
evaluator/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (133 Bytes)
evaluator/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (127 Bytes)
evaluator/__pycache__/ava_eval_helper.cpython-310.pyc ADDED
Binary file (7.23 kB)
evaluator/__pycache__/ava_eval_helper.cpython-37.pyc ADDED
Binary file (7.07 kB)
evaluator/__pycache__/ava_evaluator.cpython-310.pyc ADDED
Binary file (6.75 kB)
evaluator/__pycache__/ava_evaluator.cpython-37.pyc ADDED
Binary file (6.67 kB)
evaluator/__pycache__/cal_frame_mAP.cpython-310.pyc ADDED
Binary file (26.6 kB)
evaluator/__pycache__/cal_frame_mAP.cpython-37.pyc ADDED
Binary file (27.3 kB)
evaluator/__pycache__/cal_video_mAP.cpython-310.pyc ADDED
Binary file (7.81 kB)
evaluator/__pycache__/cal_video_mAP.cpython-37.pyc ADDED
Binary file (8.03 kB)
evaluator/__pycache__/ucf_jhmdb_evaluator.cpython-310.pyc ADDED
Binary file (5.99 kB)
evaluator/__pycache__/ucf_jhmdb_evaluator.cpython-37.pyc ADDED
Binary file (5.89 kB)
evaluator/__pycache__/utils.cpython-310.pyc ADDED
Binary file (3.82 kB)
evaluator/__pycache__/utils.cpython-37.pyc ADDED
Binary file (3.82 kB)