Upload 94 files
This view is limited to 50 files because the commit contains too many changes; see the raw diff for the complete change set.
- .gitattributes +7 -0
- config/__init__.py +21 -0
- config/__pycache__/__init__.cpython-310.pyc +0 -0
- config/__pycache__/__init__.cpython-37.pyc +0 -0
- config/__pycache__/dataset_config.cpython-310.pyc +0 -0
- config/__pycache__/dataset_config.cpython-37.pyc +0 -0
- config/__pycache__/yowo_v2_config.cpython-310.pyc +0 -0
- config/__pycache__/yowo_v2_config.cpython-37.pyc +0 -0
- config/categories_count_32s2fpsnew.json +1 -0
- config/dataset_config.py +94 -0
- config/yowo_v2_config.py +84 -0
- dataset/__init__.py +0 -0
- dataset/__pycache__/__init__.cpython-310.pyc +0 -0
- dataset/__pycache__/__init__.cpython-37.pyc +0 -0
- dataset/__pycache__/ava.cpython-310.pyc +0 -0
- dataset/__pycache__/ava.cpython-37.pyc +0 -0
- dataset/__pycache__/ava_helper.cpython-310.pyc +0 -0
- dataset/__pycache__/ava_helper.cpython-37.pyc +0 -0
- dataset/__pycache__/transforms.cpython-310.pyc +0 -0
- dataset/__pycache__/transforms.cpython-37.pyc +0 -0
- dataset/__pycache__/ucf_jhmdb.cpython-310.pyc +0 -0
- dataset/__pycache__/ucf_jhmdb.cpython-37.pyc +0 -0
- dataset/ava.py +300 -0
- dataset/ava_helper.py +231 -0
- dataset/transforms.py +176 -0
- dataset/ucf24_demo/v_Basketball_g01_c02.mp4 +0 -0
- dataset/ucf24_demo/v_Basketball_g07_c04.mp4 +0 -0
- dataset/ucf24_demo/v_Biking_g01_c01.mp4 +3 -0
- dataset/ucf24_demo/v_CliffDiving_g03_c01.mp4 +3 -0
- dataset/ucf24_demo/v_Fencing_g01_c06.mp4 +3 -0
- dataset/ucf24_demo/v_HorseRiding_g01_c03.mp4 +3 -0
- dataset/ucf24_demo/v_IceDancing_g02_c05.mp4 +3 -0
- dataset/ucf24_demo/v_SalsaSpin_g03_c01.mp4 +3 -0
- dataset/ucf24_demo/v_SkateBoarding_g02_c01.mp4 +3 -0
- dataset/ucf_jhmdb.py +311 -0
- evaluator/__init__.py +0 -0
- evaluator/__pycache__/__init__.cpython-310.pyc +0 -0
- evaluator/__pycache__/__init__.cpython-37.pyc +0 -0
- evaluator/__pycache__/ava_eval_helper.cpython-310.pyc +0 -0
- evaluator/__pycache__/ava_eval_helper.cpython-37.pyc +0 -0
- evaluator/__pycache__/ava_evaluator.cpython-310.pyc +0 -0
- evaluator/__pycache__/ava_evaluator.cpython-37.pyc +0 -0
- evaluator/__pycache__/cal_frame_mAP.cpython-310.pyc +0 -0
- evaluator/__pycache__/cal_frame_mAP.cpython-37.pyc +0 -0
- evaluator/__pycache__/cal_video_mAP.cpython-310.pyc +0 -0
- evaluator/__pycache__/cal_video_mAP.cpython-37.pyc +0 -0
- evaluator/__pycache__/ucf_jhmdb_evaluator.cpython-310.pyc +0 -0
- evaluator/__pycache__/ucf_jhmdb_evaluator.cpython-37.pyc +0 -0
- evaluator/__pycache__/utils.cpython-310.pyc +0 -0
- evaluator/__pycache__/utils.cpython-37.pyc +0 -0
.gitattributes
CHANGED
@@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+dataset/ucf24_demo/v_Biking_g01_c01.mp4 filter=lfs diff=lfs merge=lfs -text
+dataset/ucf24_demo/v_CliffDiving_g03_c01.mp4 filter=lfs diff=lfs merge=lfs -text
+dataset/ucf24_demo/v_Fencing_g01_c06.mp4 filter=lfs diff=lfs merge=lfs -text
+dataset/ucf24_demo/v_HorseRiding_g01_c03.mp4 filter=lfs diff=lfs merge=lfs -text
+dataset/ucf24_demo/v_IceDancing_g02_c05.mp4 filter=lfs diff=lfs merge=lfs -text
+dataset/ucf24_demo/v_SalsaSpin_g03_c01.mp4 filter=lfs diff=lfs merge=lfs -text
+dataset/ucf24_demo/v_SkateBoarding_g02_c01.mp4 filter=lfs diff=lfs merge=lfs -text
config/__init__.py
ADDED
@@ -0,0 +1,21 @@
from .dataset_config import dataset_config
from .yowo_v2_config import yowo_v2_config


def build_model_config(args):
    print('==============================')
    print('Model Config: {} '.format(args.version.upper()))

    if 'yowo_v2_' in args.version:
        m_cfg = yowo_v2_config[args.version]

    return m_cfg


def build_dataset_config(args):
    print('==============================')
    print('Dataset Config: {} '.format(args.dataset.upper()))

    d_cfg = dataset_config[args.dataset]

    return d_cfg
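For reference, a minimal sketch of how these two builders are typically driven from a training script run at the repository root; the argparse flag names below are illustrative assumptions, not something defined in this commit.

# Hypothetical driver sketch; flag names are assumed for illustration.
import argparse
from config import build_model_config, build_dataset_config

parser = argparse.ArgumentParser()
parser.add_argument('--version', default='yowo_v2_tiny')   # key into yowo_v2_config
parser.add_argument('--dataset', default='ava_v2.2')       # key into dataset_config
args = parser.parse_args()

m_cfg = build_model_config(args)    # e.g. m_cfg['backbone_2d'] == 'yolo_free_tiny'
d_cfg = build_dataset_config(args)  # e.g. d_cfg['multi_hot'] is True for 'ava_v2.2'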
config/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (697 Bytes).
config/__pycache__/__init__.cpython-37.pyc
ADDED
Binary file (710 Bytes).
config/__pycache__/dataset_config.cpython-310.pyc
ADDED
Binary file (1.72 kB).
config/__pycache__/dataset_config.cpython-37.pyc
ADDED
Binary file (1.54 kB).
config/__pycache__/yowo_v2_config.cpython-310.pyc
ADDED
Binary file (846 Bytes).
config/__pycache__/yowo_v2_config.cpython-37.pyc
ADDED
Binary file (836 Bytes).
config/categories_count_32s2fpsnew.json
ADDED
@@ -0,0 +1 @@
{"fighter": 9420, "threatener": 3964, "victim": 9264, "outsider": 14812}
config/dataset_config.py
ADDED
@@ -0,0 +1,94 @@
# Dataset configuration


dataset_config = {
    'ucf24': {
        # dataset
        'gt_folder': './evaluator/groundtruths_ucf_jhmdb/groundtruths_ucf/',
        # input size
        'train_size': 224,
        'test_size': 224,
        # transform
        'jitter': 0.2,
        'hue': 0.1,
        'saturation': 1.5,
        'exposure': 1.5,
        'sampling_rate': 1,
        # cls label
        'multi_hot': False,  # one hot
        # optimizer
        'optimizer': 'adamw',
        'momentum': 0.9,
        'weight_decay': 5e-4,
        # warmup strategy
        'warmup': 'linear',
        'warmup_factor': 0.00066667,
        'wp_iter': 500,
        # class names
        'valid_num_classes': 24,
        'label_map': (
            'Basketball', 'BasketballDunk', 'Biking', 'CliffDiving',
            'CricketBowling', 'Diving', 'Fencing', 'FloorGymnastics',
            'GolfSwing', 'HorseRiding', 'IceDancing', 'LongJump',
            'PoleVault', 'RopeClimbing', 'SalsaSpin', 'SkateBoarding',
            'Skiing', 'Skijet', 'SoccerJuggling', 'Surfing',
            'TennisSwing', 'TrampolineJumping', 'VolleyballSpiking', 'WalkingWithDog'
        ),
    },

    'ava_v2.2': {
        # dataset
        'frames_dir': 'frames/',
        'frame_list': 'frame_lists/',
        'annotation_dir': 'annotations/',
        'train_gt_box_list': 'train.csv',
        'val_gt_box_list': 'val.csv',
        'train_exclusion_file': 'ava_train_excluded_timestamps_v2.2.csv',
        'val_exclusion_file': 'ava_val_excluded_timestamps_v2.2.csv',
        'labelmap_file': 'ava_action_list_v2.1_for_activitynet_2018.pbtxt',  # 'ava_v2.2/ava_action_list_v2.2.pbtxt',
        'class_ratio_file': 'categories_count_32s2fpsnew.json',
        'backup_dir': 'C:/Users/Administrator/Downloads/YOWOv2/backup_dir',
        # input size
        'train_size': 224,
        'test_size': 224,
        # transform
        'jitter': 0.2,
        'hue': 0.1,
        'saturation': 1.5,
        'exposure': 1.5,
        'sampling_rate': 1,
        # cls label
        'multi_hot': True,  # multi hot
        # train config
        'optimizer': 'adamw',
        'momentum': 0.9,
        'weight_decay': 5e-4,
        # warmup strategy
        'warmup': 'linear',
        'warmup_factor': 0.00066667,
        'wp_iter': 500,
        # class names
        'valid_num_classes': 3,
        'label_map': ('bully', 'victim', 'outsider')
        # 'valid_num_classes': 80,
        # 'label_map': (
        #     'bend/bow(at the waist)', 'crawl', 'crouch/kneel', 'dance', 'fall down',  # 1-5
        #     'get up', 'jump/leap', 'lie/sleep', 'martial art', 'run/jog',  # 6-10
        #     'sit', 'stand', 'swim', 'walk', 'answer phone',  # 11-15
        #     'brush teeth', 'carry/hold (an object)', 'catch (an object)', 'chop', 'climb (e.g. a mountain)',  # 16-20
        #     'clink glass', 'close (e.g., a door, a box)', 'cook', 'cut', 'dig',  # 21-25
        #     'dress/put on clothing', 'drink', 'drive (e.g., a car, a truck)', 'eat', 'enter',  # 26-30
        #     'exit', 'extract', 'fishing', 'hit (an object)', 'kick (an object)',  # 31-35
        #     'lift/pick up', 'listen (e.g., to music)', 'open (e.g., a window, a car door)', 'paint', 'play board game',  # 36-40
        #     'play musical instrument', 'play with pets', 'point to (an object)', 'press', 'pull (an object)',  # 41-45
        #     'push (an object)', 'put down', 'read', 'ride (e.g., a bike, a car, a horse)', 'row boat',  # 46-50
        #     'sail boat', 'shoot', 'shovel', 'smoke', 'stir',  # 51-55
        #     'take a photo', 'text on/look at a cellphone', 'throw', 'touch (an object)', 'turn (e.g., a screwdriver)',  # 56-60
        #     'watch (e.g., TV)', 'work on a computer', 'write', 'fight/hit (a person)', 'give/serve (an object) to (a person)',  # 61-65
        #     'grab (a person)', 'hand clap', 'hand shake', 'hand wave', 'hug (a person)',  # 66-70
        #     'kick (a person)', 'kiss (a person)', 'lift (a person)', 'listen to (a person)', 'play with kids',  # 71-75
        #     'push (another person)', 'sing to (e.g., self, a person, a group)', 'take (an object) from (a person)',  # 76-78
        #     'talk to (e.g., self, a person, a group)', 'watch (a person)'  # 79-80
        # ),
    }
}
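Because 'ava_v2.2' sets 'multi_hot' while 'ucf24' stays one-hot, each AVA-style box carries an independent 0/1 flag per class. A small sketch of that encoding using the three-class label_map above; class ids are treated as 1-based, mirroring how dataset/ava.py builds its per-box vector.

import numpy as np

label_map = ('bully', 'victim', 'outsider')
num_classes = len(label_map)

# a box annotated with class ids 1 and 3 (1-based, as in the AVA-style csv rows)
class_ids = [1, 3]
multi_hot = np.zeros(1 + num_classes)   # slot 0 is a dummy for the 1-based ids
multi_hot[class_ids] = 1.0
print(multi_hot[1:])                    # [1. 0. 1.] -> 'bully' and 'outsider'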
config/yowo_v2_config.py
ADDED
@@ -0,0 +1,84 @@
# Model configuration


yowo_v2_config = {
    'yowo_v2_nano': {
        # backbone
        ## 2D
        'backbone_2d': 'yolo_free_nano',
        'pretrained_2d': True,
        'stride': [8, 16, 32],
        ## 3D
        'backbone_3d': 'shufflenetv2',
        'model_size': '1.0x',
        'pretrained_3d': True,
        'memory_momentum': 0.9,
        # head
        'head_dim': 64,
        'head_norm': 'BN',
        'head_act': 'lrelu',
        'num_cls_heads': 2,
        'num_reg_heads': 2,
        'head_depthwise': True,
    },

    'yowo_v2_tiny': {
        # backbone
        ## 2D
        'backbone_2d': 'yolo_free_tiny',
        'pretrained_2d': True,
        'stride': [8, 16, 32],
        ## 3D
        'backbone_3d': 'shufflenetv2',
        'model_size': '2.0x',
        'pretrained_3d': True,
        'memory_momentum': 0.9,
        # head
        'head_dim': 64,
        'head_norm': 'BN',
        'head_act': 'lrelu',
        'num_cls_heads': 2,
        'num_reg_heads': 2,
        'head_depthwise': False,
    },

    'yowo_v2_medium': {
        # backbone
        ## 2D
        'backbone_2d': 'yolo_free_large',
        'pretrained_2d': True,
        'stride': [8, 16, 32],
        ## 3D
        'backbone_3d': 'shufflenetv2',
        'model_size': '2.0x',
        'pretrained_3d': True,
        'memory_momentum': 0.9,
        # head
        'head_dim': 128,
        'head_norm': 'BN',
        'head_act': 'silu',
        'num_cls_heads': 2,
        'num_reg_heads': 2,
        'head_depthwise': False,
    },

    'yowo_v2_large': {
        # backbone
        ## 2D
        'backbone_2d': 'yolo_free_large',
        'pretrained_2d': True,
        'stride': [8, 16, 32],
        ## 3D
        'backbone_3d': 'resnext101',
        'pretrained_3d': True,
        'memory_momentum': 0.9,
        # head
        'head_dim': 256,
        'head_norm': 'BN',
        'head_act': 'silu',
        'num_cls_heads': 2,
        'num_reg_heads': 2,
        'head_depthwise': False,
    },

}
dataset/__init__.py
ADDED
File without changes
dataset/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (131 Bytes).
dataset/__pycache__/__init__.cpython-37.pyc
ADDED
Binary file (125 Bytes).
dataset/__pycache__/ava.cpython-310.pyc
ADDED
Binary file (6.81 kB).
dataset/__pycache__/ava.cpython-37.pyc
ADDED
Binary file (6.75 kB).
dataset/__pycache__/ava_helper.cpython-310.pyc
ADDED
Binary file (5.98 kB).
dataset/__pycache__/ava_helper.cpython-37.pyc
ADDED
Binary file (5.92 kB).
dataset/__pycache__/transforms.cpython-310.pyc
ADDED
Binary file (5.23 kB).
dataset/__pycache__/transforms.cpython-37.pyc
ADDED
Binary file (5.3 kB).
dataset/__pycache__/ucf_jhmdb.cpython-310.pyc
ADDED
Binary file (6.49 kB).
dataset/__pycache__/ucf_jhmdb.cpython-37.pyc
ADDED
Binary file (6.46 kB).
dataset/ava.py
ADDED
@@ -0,0 +1,300 @@
#!/usr/bin/python
# encoding: utf-8

import os
import numpy as np

import torch
from torch.utils.data import Dataset
from PIL import Image

try:
    import ava_helper
except:
    from . import ava_helper


# Dataset for AVA
class AVA_Dataset(Dataset):
    def __init__(self,
                 cfg,
                 data_root,
                 is_train=False,
                 img_size=224,
                 transform=None,
                 len_clip=8,
                 sampling_rate=1):
        self.num_classes = 3
        self.data_root = data_root
        self.frames_dir = os.path.join(data_root, cfg['frames_dir'])
        self.frame_list = os.path.join(data_root, cfg['frame_list'])
        self.annotation_dir = os.path.join(data_root, cfg['annotation_dir'])
        self.labelmap_file = os.path.join(data_root, cfg['annotation_dir'], cfg['labelmap_file'])
        if is_train:
            self.gt_box_list = os.path.join(self.annotation_dir, cfg['train_gt_box_list'])
            self.exclusion_file = os.path.join(self.annotation_dir, cfg['train_exclusion_file'])
        else:
            self.gt_box_list = os.path.join(self.annotation_dir, cfg['val_gt_box_list'])
            self.exclusion_file = os.path.join(self.annotation_dir, cfg['val_exclusion_file'])

        self.transform = transform
        self.is_train = is_train

        self.img_size = img_size
        self.len_clip = len_clip
        self.sampling_rate = sampling_rate
        self.seq_len = self.len_clip * self.sampling_rate

        # load ava data
        self._load_data()


    def _load_data(self):
        # Loading frame paths.
        (
            self._image_paths,
            self._video_idx_to_name,
        ) = ava_helper.load_image_lists(
            self.frames_dir,
            self.frame_list,
            self.is_train
        )

        # Loading annotations for boxes and labels.
        # boxes_and_labels: {'<video_name>': {<frame_num>: a list of [box_i, box_i_labels]} }
        boxes_and_labels = ava_helper.load_boxes_and_labels(
            self.gt_box_list,
            self.exclusion_file,
            self.is_train,
            full_test_on_val=False
        )

        assert len(boxes_and_labels) == len(self._image_paths)

        # boxes_and_labels: a list of {<frame_num>: a list of [box_i, box_i_labels]}
        boxes_and_labels = [
            boxes_and_labels[self._video_idx_to_name[i]]
            for i in range(len(self._image_paths))
        ]

        # Get indices of keyframes and corresponding boxes and labels.
        # _keyframe_indices: [video_idx, sec_idx, sec, frame_index]
        # _keyframe_boxes_and_labels: list[list[list]], outer is video_idx, middle is sec_idx,
        #     inner is a list of [box_i, box_i_labels]
        (
            self._keyframe_indices,
            self._keyframe_boxes_and_labels,
        ) = ava_helper.get_keyframe_data(boxes_and_labels)

        # Calculate the number of used boxes.
        self._num_boxes_used = ava_helper.get_num_boxes_used(
            self._keyframe_indices, self._keyframe_boxes_and_labels
        )

        self._max_objs = ava_helper.get_max_objs(
            self._keyframe_indices, self._keyframe_boxes_and_labels
        )

        print("=== AVA dataset summary ===")
        print("Train: {}".format(self.is_train))
        print("Number of videos: {}".format(len(self._image_paths)))
        total_frames = sum(
            len(video_img_paths) for video_img_paths in self._image_paths
        )
        print("Number of frames: {}".format(total_frames))
        print("Number of key frames: {}".format(len(self)))
        print("Number of boxes: {}.".format(self._num_boxes_used))


    def __len__(self):
        return len(self._keyframe_indices)


    def get_sequence(self, center_idx, half_len, sample_rate, num_frames):
        """
        Sample frames among the corresponding clip.

        Args:
            center_idx (int): center frame idx for current clip
            half_len (int): half of the clip length
            sample_rate (int): sampling rate for sampling frames inside of the clip
            num_frames (int): number of expected sampled frames

        Returns:
            seq (list): list of indexes of sampled frames in this clip.
        """
        # seq = list(range(center_idx - half_len, center_idx + half_len, sample_rate))
        seq = list(range(center_idx - half_len*2 + 1*sample_rate, center_idx + 1*sample_rate, sample_rate))

        for seq_idx in range(len(seq)):
            if seq[seq_idx] < 0:
                seq[seq_idx] = 0
            elif seq[seq_idx] >= num_frames:
                seq[seq_idx] = num_frames - 1
        return seq


    def get_frame_idx(self, latest_idx, sample_length, sample_rate, num_frames):
        """
        Sample frames among the corresponding clip. But see keyframe as the latest frame,
        instead of viewing it in center
        """
        # seq = list(range(latest_idx - sample_length + 1, latest_idx + 1, sample_rate))
        seq = list(range(latest_idx, latest_idx - sample_length, -sample_rate))
        seq.reverse()
        for seq_idx in range(len(seq)):
            if seq[seq_idx] < 0:
                seq[seq_idx] = 0
            elif seq[seq_idx] >= num_frames:
                seq[seq_idx] = num_frames - 1

        return seq


    def __getitem__(self, idx):
        # load a data
        frame_idx, video_clip, target = self.pull_item(idx)

        return frame_idx, video_clip, target


    def pull_item(self, idx):
        # Get the frame idxs for current clip. We can use it as center or latest
        video_idx, sec_idx, sec, frame_idx = self._keyframe_indices[idx]
        clip_label_list = self._keyframe_boxes_and_labels[video_idx][sec_idx]

        # check label list
        assert len(clip_label_list) > 0
        assert len(clip_label_list) <= self._max_objs

        # get a sequence
        seq = self.get_sequence(
            frame_idx,
            self.seq_len // 2,
            self.sampling_rate,
            num_frames=len(self._image_paths[video_idx]),
        )
        image_paths = [self._image_paths[video_idx][frame - 1] for frame in seq]
        #print('video_idx: ', video_idx, 'frame_idx', frame_idx)
        keyframe_info = self._image_paths[video_idx][frame_idx - 1]
        #print(keyframe_info)

        # load a video clip
        video_clip = []
        for img_path in image_paths:
            frame = Image.open(img_path).convert('RGB')
            video_clip.append(frame)
            ow, oh = frame.width, frame.height

        # Get boxes and labels for current clip.
        boxes = []
        labels = []
        for box_labels in clip_label_list:
            bbox = box_labels[0]
            label = box_labels[1]
            multi_hot_label = np.zeros(1 + self.num_classes)
            multi_hot_label[..., label] = 1.0

            boxes.append(bbox)
            labels.append(multi_hot_label[..., 1:].tolist())

        boxes = np.array(boxes).reshape(-1, 4)
        # renormalize bbox
        boxes[..., [0, 2]] *= ow
        boxes[..., [1, 3]] *= oh
        labels = np.array(labels).reshape(-1, self.num_classes)

        # target: [N, 4 + C]
        target = np.concatenate([boxes, labels], axis=-1)

        # transform
        video_clip, target = self.transform(video_clip, target)
        # List [T, 3, H, W] -> [3, T, H, W]
        video_clip = torch.stack(video_clip, dim=1)

        # reformat target
        target = {
            'boxes': target[:, :4].float(),   # [N, 4]
            'labels': target[:, 4:].long(),   # [N, C]
            'orig_size': [ow, oh],
            'video_idx': video_idx,
            'sec': sec,
        }

        return [video_idx, sec], video_clip, target



if __name__ == '__main__':
    import cv2
    from transforms import Augmentation, BaseTransform

    is_train = False
    img_size = 224
    len_clip = 16
    sampling_rate = 1
    dataset_config = {
        'data_root': 'C:/Users/Administrator/Downloads/YOWOv2/data/clip32s_2fps_new',
        'frames_dir': 'frames/',
        'frame_list': 'frame_lists/',
        'annotation_dir': 'annotations/',
        'train_gt_box_list': 'train.csv',
        'val_gt_box_list': 'val.csv',
        'train_exclusion_file': 'ava_train_excluded_timestamps_v2.2.csv',
        'val_exclusion_file': 'ava_val_excluded_timestamps_v2.2.csv',
        'labelmap_file': 'ava_action_list_v2.1_for_activitynet_2018.pbtxt',
    }
    trans_config = {
        'jitter': 0.2,
        'hue': 0.1,
        'saturation': 1.5,
        'exposure': 1.5
    }
    train_transform = Augmentation(
        img_size=img_size,
        jitter=trans_config['jitter'],
        saturation=trans_config['saturation'],
        exposure=trans_config['exposure']
    )
    val_transform = BaseTransform(img_size=img_size)

    train_dataset = AVA_Dataset(
        cfg=dataset_config,
        data_root=dataset_config['data_root'],
        is_train=is_train,
        img_size=img_size,
        transform=train_transform,
        len_clip=len_clip,
        sampling_rate=sampling_rate
    )

    print(len(train_dataset))
    for i in range(len(train_dataset)):
        frame_id, video_clip, target = train_dataset[i]
        key_frame = video_clip[:, -1, :, :]

        # to numpy
        key_frame = key_frame.permute(1, 2, 0).numpy()
        key_frame = key_frame.astype(np.uint8)

        # to BGR
        key_frame = key_frame[..., (2, 1, 0)]
        H, W, C = key_frame.shape

        key_frame = key_frame.copy()
        bboxes = target['boxes']
        labels = target['labels']

        for box, cls_id in zip(bboxes, labels):
            x1, y1, x2, y2 = box
            x1 = int(x1 * W)
            y1 = int(y1 * H)
            x2 = int(x2 * W)
            y2 = int(y2 * H)
            key_frame = cv2.rectangle(key_frame, (x1, y1), (x2, y2), (255, 0, 0))

        # cv2 show
        cv2.imshow('key frame', key_frame)
        cv2.waitKey(0)
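Since every sample is a (frame id, clip tensor, dict target) triple and the number of boxes varies per keyframe, PyTorch's default collation cannot batch the targets directly. A list-preserving collate function along these lines is one way to wrap the dataset in a DataLoader; this is a sketch, not part of the commit, and the batch size is an arbitrary example.

import torch
from torch.utils.data import DataLoader

def ava_collate(batch):
    # keep per-sample target dicts in a list; only stack the clip tensors
    frame_ids = [b[0] for b in batch]
    clips = torch.stack([b[1] for b in batch], dim=0)   # [B, 3, T, H, W]
    targets = [b[2] for b in batch]
    return frame_ids, clips, targets

# loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=ava_collate)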
dataset/ava_helper.py
ADDED
@@ -0,0 +1,231 @@
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.

import logging
import os
import csv
from collections import defaultdict


logger = logging.getLogger(__name__)
FPS = 15
AVA_VALID_FRAMES = range(0, 64)
#AVA_VALID_FRAMES = range(902, 1799)


def make_image_key(video_id, timestamp):
    """Returns a unique identifier for a video id & timestamp."""
    return "%s,%04d" % (video_id, int(timestamp))


def read_exclusions(exclusions_file):
    """Reads a CSV file of excluded timestamps.
    Args:
        exclusions_file: A file object containing a csv of video-id,timestamp.
    Returns:
        A set of strings containing excluded image keys, e.g. "aaaaaaaaaaa,0904",
        or an empty set if exclusions file is None.
    """
    excluded = set()
    if exclusions_file:
        with open(exclusions_file, "r") as f:
            reader = csv.reader(f)
            for row in reader:
                assert len(row) == 2, "Expected only 2 columns, got: " + row
                excluded.add(make_image_key(row[0], row[1]))
    return excluded


def load_image_lists(frames_dir, frame_list, is_train):
    """
    Loading image paths from corresponding files.

    Args:
        frames_dir (str): path to frames dir.
        frame_list (str): path to frame list.
        is_train (bool): if it is training dataset or not.

    Returns:
        image_paths (list[list]): a list of items. Each item (also a list)
            corresponds to one video and contains the paths of images for
            this video.
        video_idx_to_name (list): a list which stores video names.
    """
    # frame_list_dir is /data3/ava/frame_lists/
    # contains 'train.csv' and 'val.csv'
    if is_train:
        list_name = "train.csv"
    else:
        list_name = "val.csv"

    list_filename = os.path.join(frame_list, list_name)

    image_paths = defaultdict(list)
    video_name_to_idx = {}
    video_idx_to_name = []
    with open(list_filename, "r") as f:
        f.readline()
        for line in f:
            row = line.split()
            row = row[0].split(',')
            # The format of each row should follow:
            # original_vido_id video_id frame_id path labels.
            assert len(row) == 5
            video_name = row[0]

            if video_name not in video_name_to_idx:
                idx = len(video_name_to_idx)
                video_name_to_idx[video_name] = idx
                video_idx_to_name.append(video_name)

            data_key = video_name_to_idx[video_name]

            image_paths[data_key].append(os.path.join(frames_dir, row[3]))

    image_paths = [image_paths[i] for i in range(len(image_paths))]

    print("Finished loading image paths from: {}".format(list_filename))

    return image_paths, video_idx_to_name


def load_boxes_and_labels(gt_box_list, exclusion_file, is_train=False, full_test_on_val=False):
    """
    Loading boxes and labels from csv files.

    Args:
        cfg (CfgNode): config.
        mode (str): 'train', 'val', or 'test' mode.
    Returns:
        all_boxes (dict): a dict which maps from `video_name` and
            `frame_sec` to a list of `box`. Each `box` is a
            [`box_coord`, `box_labels`] where `box_coord` is the
            coordinates of box and 'box_labels` are the corresponding
            labels for the box.
    """
    ann_filename = gt_box_list
    all_boxes = {}
    count = 0
    unique_box_count = 0
    excluded_keys = read_exclusions(exclusion_file)

    with open(ann_filename, 'r') as f:
        for line in f:
            row = line.strip().split(',')

            video_name, frame_sec = row[0], int(row[1])
            key = "%s,%04d" % (video_name, frame_sec)
            # if mode == 'train' and key in excluded_keys:
            if key in excluded_keys:
                print("Found {} to be excluded...".format(key))
                continue

            # Only select frame_sec % 4 = 0 samples for validation if not
            # set FULL_TEST_ON_VAL (default False)
            if not is_train and not full_test_on_val and frame_sec % 4 != 0:
                continue
            # Box with [x1, y1, x2, y2] with a range of [0, 1] as float
            box_key = ",".join(row[2:6])
            box = list(map(float, row[2:6]))
            label = -1 if row[6] == "" else int(row[6])
            if video_name not in all_boxes:
                all_boxes[video_name] = {}
                for sec in AVA_VALID_FRAMES:
                    all_boxes[video_name][sec] = {}
            if box_key not in all_boxes[video_name][frame_sec]:
                all_boxes[video_name][frame_sec][box_key] = [box, []]
                unique_box_count += 1

            all_boxes[video_name][frame_sec][box_key][1].append(label)
            if label != -1:
                count += 1

    for video_name in all_boxes.keys():
        for frame_sec in all_boxes[video_name].keys():
            # Save in format of a list of [box_i, box_i_labels].
            all_boxes[video_name][frame_sec] = list(
                all_boxes[video_name][frame_sec].values()
            )

    print("Finished loading annotations from: %s" % ", ".join([ann_filename]))
    print("Number of unique boxes: %d" % unique_box_count)
    print("Number of annotations: %d" % count)

    return all_boxes


def get_keyframe_data(boxes_and_labels):
    """
    Getting keyframe indices, boxes and labels in the dataset.

    Args:
        boxes_and_labels (list[dict]): a list which maps from video_idx to a dict.
            Each dict `frame_sec` to a list of boxes and corresponding labels.

    Returns:
        keyframe_indices (list): a list of indices of the keyframes.
        keyframe_boxes_and_labels (list[list[list]]): a list of list which maps from
            video_idx and sec_idx to a list of boxes and corresponding labels.
    """

    def sec_to_frame(sec):
        """
        Convert time index (in second) to frame index.
        0: 900
        30: 901
        """
        #return (sec - 900) * FPS
        return sec + 1

    keyframe_indices = []
    keyframe_boxes_and_labels = []
    count = 0
    for video_idx in range(len(boxes_and_labels)):
        sec_idx = 0
        keyframe_boxes_and_labels.append([])
        for sec in boxes_and_labels[video_idx].keys():
            if sec not in AVA_VALID_FRAMES:
                continue

            if len(boxes_and_labels[video_idx][sec]) > 0:
                keyframe_indices.append(
                    (video_idx, sec_idx, sec, sec_to_frame(sec))
                )
                keyframe_boxes_and_labels[video_idx].append(
                    boxes_and_labels[video_idx][sec]
                )
                sec_idx += 1
                count += 1
    logger.info("%d keyframes used." % count)

    return keyframe_indices, keyframe_boxes_and_labels


def get_num_boxes_used(keyframe_indices, keyframe_boxes_and_labels):
    """
    Get total number of used boxes.

    Args:
        keyframe_indices (list): a list of indices of the keyframes.
        keyframe_boxes_and_labels (list[list[list]]): a list of list which maps from
            video_idx and sec_idx to a list of boxes and corresponding labels.

    Returns:
        count (int): total number of used boxes.
    """

    count = 0
    for video_idx, sec_idx, _, _ in keyframe_indices:
        count += len(keyframe_boxes_and_labels[video_idx][sec_idx])
    return count


def get_max_objs(keyframe_indices, keyframe_boxes_and_labels):
    # max_objs = 0
    # for video_idx, sec_idx, _, _ in keyframe_indices:
    #     num_boxes = len(keyframe_boxes_and_labels[video_idx][sec_idx])
    #     if num_boxes > max_objs:
    #         max_objs = num_boxes

    # return max_objs
    return 25  #### MODIFICATION FOR NOW! TODO: FIX LATER!
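To make the returned structures concrete, here is a toy call of get_keyframe_data and get_num_boxes_used on a hand-built boxes_and_labels list (one video with two annotated seconds); the import path is an assumption about how the package is laid out.

from dataset import ava_helper   # assumed import path

toy = [{
    0: [[[0.1, 0.2, 0.5, 0.9], [1]]],                                # sec 0: one box, class 1
    4: [[[0.3, 0.1, 0.7, 0.8], [2]], [[0.0, 0.0, 0.4, 0.5], [3]]],   # sec 4: two boxes
}]

indices, boxes = ava_helper.get_keyframe_data(toy)
print(indices)                                         # [(0, 0, 0, 1), (0, 1, 4, 5)] -> (video_idx, sec_idx, sec, sec + 1)
print(ava_helper.get_num_boxes_used(indices, boxes))   # 3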
dataset/transforms.py
ADDED
@@ -0,0 +1,176 @@
import random
import numpy as np
import torch
import torchvision.transforms.functional as F
from PIL import Image


# Augmentation for Training
class Augmentation(object):
    def __init__(self, img_size=224, jitter=0.2, hue=0.1, saturation=1.5, exposure=1.5):
        self.img_size = img_size
        self.jitter = jitter
        self.hue = hue
        self.saturation = saturation
        self.exposure = exposure


    def rand_scale(self, s):
        scale = random.uniform(1, s)

        if random.randint(0, 1):
            return scale

        return 1./scale


    def random_distort_image(self, video_clip):
        dhue = random.uniform(-self.hue, self.hue)
        dsat = self.rand_scale(self.saturation)
        dexp = self.rand_scale(self.exposure)

        video_clip_ = []
        for image in video_clip:
            image = image.convert('HSV')
            cs = list(image.split())
            cs[1] = cs[1].point(lambda i: i * dsat)
            cs[2] = cs[2].point(lambda i: i * dexp)

            def change_hue(x):
                x += dhue * 255
                if x > 255:
                    x -= 255
                if x < 0:
                    x += 255
                return x

            cs[0] = cs[0].point(change_hue)
            image = Image.merge(image.mode, tuple(cs))

            image = image.convert('RGB')

            video_clip_.append(image)

        return video_clip_


    def random_crop(self, video_clip, width, height):
        dw = int(width * self.jitter)
        dh = int(height * self.jitter)

        pleft = random.randint(-dw, dw)
        pright = random.randint(-dw, dw)
        ptop = random.randint(-dh, dh)
        pbot = random.randint(-dh, dh)

        swidth = width - pleft - pright
        sheight = height - ptop - pbot

        sx = float(swidth) / width
        sy = float(sheight) / height

        dx = (float(pleft) / width) / sx
        dy = (float(ptop) / height) / sy

        # random crop
        cropped_clip = [img.crop((pleft, ptop, pleft + swidth - 1, ptop + sheight - 1)) for img in video_clip]

        return cropped_clip, dx, dy, sx, sy


    def apply_bbox(self, target, ow, oh, dx, dy, sx, sy):
        sx, sy = 1./sx, 1./sy
        # apply deltas on bbox
        target[..., 0] = np.minimum(0.999, np.maximum(0, target[..., 0] / ow * sx - dx))
        target[..., 1] = np.minimum(0.999, np.maximum(0, target[..., 1] / oh * sy - dy))
        target[..., 2] = np.minimum(0.999, np.maximum(0, target[..., 2] / ow * sx - dx))
        target[..., 3] = np.minimum(0.999, np.maximum(0, target[..., 3] / oh * sy - dy))

        # refine target
        refine_target = []
        for i in range(target.shape[0]):
            tgt = target[i]
            bw = (tgt[2] - tgt[0]) * ow
            bh = (tgt[3] - tgt[1]) * oh

            if bw < 1. or bh < 1.:
                continue

            refine_target.append(tgt)

        refine_target = np.array(refine_target).reshape(-1, target.shape[-1])

        return refine_target


    def to_tensor(self, video_clip):
        return [F.to_tensor(image) * 255. for image in video_clip]


    def __call__(self, video_clip, target):
        # Initialize Random Variables
        oh = video_clip[0].height
        ow = video_clip[0].width

        # random crop
        video_clip, dx, dy, sx, sy = self.random_crop(video_clip, ow, oh)

        # resize
        video_clip = [img.resize([self.img_size, self.img_size]) for img in video_clip]

        # random flip
        flip = random.randint(0, 1)
        if flip:
            video_clip = [img.transpose(Image.FLIP_LEFT_RIGHT) for img in video_clip]

        # distort
        video_clip = self.random_distort_image(video_clip)

        # process target
        if target is not None:
            target = self.apply_bbox(target, ow, oh, dx, dy, sx, sy)
            if flip:
                target[..., [0, 2]] = 1.0 - target[..., [2, 0]]
        else:
            target = np.array([])

        # to tensor
        video_clip = self.to_tensor(video_clip)
        target = torch.as_tensor(target).float()

        return video_clip, target


# Transform for Testing
class BaseTransform(object):
    def __init__(self, img_size=224, ):
        self.img_size = img_size


    def to_tensor(self, video_clip):
        return [F.to_tensor(image) * 255. for image in video_clip]


    def __call__(self, video_clip, target=None, normalize=True):
        oh = video_clip[0].height
        ow = video_clip[0].width

        # resize
        video_clip = [img.resize([self.img_size, self.img_size]) for img in video_clip]

        # normalize target
        # if target is not None:
        #     if normalize:
        #         target[..., [0, 2]] /= ow
        #         target[..., [1, 3]] /= oh

        # else:
        #     target = np.array([])

        # to tensor
        video_clip = self.to_tensor(video_clip)
        #target = torch.as_tensor(target).float()

        #return video_clip, target
        return video_clip
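A short sketch of how the two transforms are meant to be called on a clip of PIL frames; the synthetic frames and the pixel-space box row below are made up purely to show the expected shapes, and the import path is an assumption.

import numpy as np
from PIL import Image
from dataset.transforms import Augmentation, BaseTransform   # assumed module path

clip = [Image.new('RGB', (320, 240)) for _ in range(16)]      # dummy 16-frame clip
boxes = np.array([[32., 24., 160., 200., 0., 1., 0.]])        # [x1, y1, x2, y2, cls...] in pixels

aug = Augmentation(img_size=224, jitter=0.2, hue=0.1, saturation=1.5, exposure=1.5)
clip_t, target_t = aug(clip, boxes.copy())   # list of 16 [3, 224, 224] tensors; box coords now in [0, 1]

base = BaseTransform(img_size=224)
clip_v = base(clip)                          # as committed, BaseTransform returns only the tensor list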
dataset/ucf24_demo/v_Basketball_g01_c02.mp4
ADDED
Binary file (514 kB).
dataset/ucf24_demo/v_Basketball_g07_c04.mp4
ADDED
Binary file (829 kB).
dataset/ucf24_demo/v_Biking_g01_c01.mp4
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:069914984bf53b5dbf4b24fbf7d79288f3697a35e494635b9ab48e3c800aea59
size 1703798
dataset/ucf24_demo/v_CliffDiving_g03_c01.mp4
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d8975e99f1199731d8d55eacb6e0d633275618e6b2225c26e64e1e42396beb47
size 1024051
dataset/ucf24_demo/v_Fencing_g01_c06.mp4
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:529c6738bc866a1ef2c3a14fb8e4538c91d3344298bee2d0714dca859099cc5d
size 1403751
dataset/ucf24_demo/v_HorseRiding_g01_c03.mp4
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e33ea8d44b15086ceb7f905f9c059d97d8518dedb49eff4718f64c746d463c1e
size 1527353
dataset/ucf24_demo/v_IceDancing_g02_c05.mp4
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5bb209ed584117352e8f6d9d0cd5587aaae0f8e23b323f11ff18dbf6fe179388
size 1503889
dataset/ucf24_demo/v_SalsaSpin_g03_c01.mp4
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:776bc11f53b18f8ec4a9b5bbbf9c01c7d95c7afce5dda13aec01dd1c9749a8e0
size 1477281
dataset/ucf24_demo/v_SkateBoarding_g02_c01.mp4
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7c428046991043c8fb4736d27356a3b2ea59a512d17fc821ed5333a7058ecfc7
size 3293243
dataset/ucf_jhmdb.py
ADDED
@@ -0,0 +1,311 @@
#!/usr/bin/python
# encoding: utf-8

import os
import random
import numpy as np
import glob

import torch
from torch.utils.data import Dataset
from PIL import Image


# Dataset for UCF24 & JHMDB
class UCF_JHMDB_Dataset(Dataset):
    def __init__(self,
                 data_root,
                 dataset='ucf24',
                 img_size=224,
                 transform=None,
                 is_train=False,
                 len_clip=16,
                 sampling_rate=1):
        self.data_root = data_root
        self.dataset = dataset
        self.transform = transform
        self.is_train = is_train

        self.img_size = img_size
        self.len_clip = len_clip
        self.sampling_rate = sampling_rate

        if self.is_train:
            self.split_list = 'trainlist.txt'
        else:
            self.split_list = 'testlist.txt'

        # load data
        with open(os.path.join(data_root, self.split_list), 'r') as file:
            self.file_names = file.readlines()
        self.num_samples = len(self.file_names)

        if dataset == 'ucf24':
            self.num_classes = 24
        elif dataset == 'jhmdb21':
            self.num_classes = 21


    def __len__(self):
        return self.num_samples


    def __getitem__(self, index):
        # load a data
        frame_idx, video_clip, target = self.pull_item(index)

        return frame_idx, video_clip, target


    def pull_item(self, index):
        """ load a data """
        assert index <= len(self), 'index range error'
        image_path = self.file_names[index].rstrip()

        img_split = image_path.split('/')  # ex. ['labels', 'Basketball', 'v_Basketball_g08_c01', '00070.txt']
        # image name
        img_id = int(img_split[-1][:5])

        # path to label
        label_path = os.path.join(self.data_root, img_split[0], img_split[1], img_split[2], '{:05d}.txt'.format(img_id))

        # image folder
        img_folder = os.path.join(self.data_root, 'rgb-images', img_split[1], img_split[2])

        # frame numbers
        if self.dataset == 'ucf24':
            max_num = len(os.listdir(img_folder))
        elif self.dataset == 'jhmdb21':
            max_num = len(os.listdir(img_folder)) - 1

        # sampling rate
        if self.is_train:
            d = random.randint(1, 2)
        else:
            d = self.sampling_rate

        # load images
        video_clip = []
        for i in reversed(range(self.len_clip)):
            # make it as a loop
            img_id_temp = img_id - i * d
            if img_id_temp < 1:
                img_id_temp = 1
            elif img_id_temp > max_num:
                img_id_temp = max_num

            # load a frame
            if self.dataset == 'ucf24':
                path_tmp = os.path.join(self.data_root, 'rgb-images', img_split[1], img_split[2], '{:05d}.jpg'.format(img_id_temp))
            elif self.dataset == 'jhmdb21':
                path_tmp = os.path.join(self.data_root, 'rgb-images', img_split[1], img_split[2], '{:05d}.png'.format(img_id_temp))
            frame = Image.open(path_tmp).convert('RGB')
            ow, oh = frame.width, frame.height

            video_clip.append(frame)

        frame_id = img_split[1] + '_' + img_split[2] + '_' + img_split[3]

        # load an annotation
        if os.path.getsize(label_path):
            target = np.loadtxt(label_path)
        else:
            target = None

        # [label, x1, y1, x2, y2] -> [x1, y1, x2, y2, label]
        label = target[..., :1]
        boxes = target[..., 1:]
        target = np.concatenate([boxes, label], axis=-1).reshape(-1, 5)

        # transform
        video_clip, target = self.transform(video_clip, target)
        # List [T, 3, H, W] -> [3, T, H, W]
        video_clip = torch.stack(video_clip, dim=1)

        # reformat target
        target = {
            'boxes': target[:, :4].float(),       # [N, 4]
            'labels': target[:, -1].long() - 1,   # [N,]
            'orig_size': [ow, oh],
            'video_idx': frame_id[:-10]
        }

        return frame_id, video_clip, target


    def pull_anno(self, index):
        """ load a data """
        assert index <= len(self), 'index range error'
        image_path = self.file_names[index].rstrip()

        img_split = image_path.split('/')  # ex. ['labels', 'Basketball', 'v_Basketball_g08_c01', '00070.txt']
        # image name
        img_id = int(img_split[-1][:5])

        # path to label
        label_path = os.path.join(self.data_root, img_split[0], img_split[1], img_split[2], '{:05d}.txt'.format(img_id))

        # load an annotation
        target = np.loadtxt(label_path)
        target = target.reshape(-1, 5)

        return target


# Video Dataset for UCF24 & JHMDB
class UCF_JHMDB_VIDEO_Dataset(Dataset):
    def __init__(self,
                 data_root,
                 dataset='ucf24',
                 img_size=224,
                 transform=None,
                 len_clip=16,
                 sampling_rate=1):
        self.data_root = data_root
        self.dataset = dataset
        self.transform = transform

        self.img_size = img_size
        self.len_clip = len_clip
        self.sampling_rate = sampling_rate

        if dataset == 'ucf24':
            self.num_classes = 24
        elif dataset == 'jhmdb21':
            self.num_classes = 21


    def set_video_data(self, line):
        self.line = line

        # load a video
        self.img_folder = os.path.join(self.data_root, 'rgb-images', self.line)

        if self.dataset == 'ucf24':
            self.label_paths = sorted(glob.glob(os.path.join(self.img_folder, '*.jpg')))
        elif self.dataset == 'jhmdb21':
            self.label_paths = sorted(glob.glob(os.path.join(self.img_folder, '*.png')))


    def __len__(self):
        return len(self.label_paths)


    def __getitem__(self, index):
        return self.pull_item(index)


    def pull_item(self, index):
        image_path = self.label_paths[index]

        video_split = self.line.split('/')
        video_class = video_split[0]
        video_file = video_split[1]
        # for windows:
        # img_split = image_path.split('\\')  # ex. [..., 'Basketball', 'v_Basketball_g08_c01', '00070.txt']
        # for linux
        img_split = image_path.split('/')  # ex. [..., 'Basketball', 'v_Basketball_g08_c01', '00070.txt']

        # image name
        img_id = int(img_split[-1][:5])
        max_num = len(os.listdir(self.img_folder))
        if self.dataset == 'ucf24':
            img_name = os.path.join(video_class, video_file, '{:05d}.jpg'.format(img_id))
        elif self.dataset == 'jhmdb21':
            img_name = os.path.join(video_class, video_file, '{:05d}.png'.format(img_id))

        # load video clip
        video_clip = []
        for i in reversed(range(self.len_clip)):
            # make it as a loop
            img_id_temp = img_id - i
            if img_id_temp < 1:
                img_id_temp = 1
            elif img_id_temp > max_num:
                img_id_temp = max_num

            # load a frame
            if self.dataset == 'ucf24':
                path_tmp = os.path.join(self.data_root, 'rgb-images', video_class, video_file, '{:05d}.jpg'.format(img_id_temp))
            elif self.dataset == 'jhmdb21':
                path_tmp = os.path.join(self.data_root, 'rgb-images', video_class, video_file, '{:05d}.png'.format(img_id_temp))
            frame = Image.open(path_tmp).convert('RGB')
            ow, oh = frame.width, frame.height

            video_clip.append(frame)

        # transform
        video_clip, _ = self.transform(video_clip, normalize=False)
        # List [T, 3, H, W] -> [3, T, H, W]
        video_clip = torch.stack(video_clip, dim=1)
        orig_size = [ow, oh]  # width, height

        target = {'orig_size': [ow, oh]}

        return img_name, video_clip, target




if __name__ == '__main__':
    import cv2
    from transforms import Augmentation, BaseTransform

    data_root = 'D:/python_work/spatial-temporal_action_detection/dataset/ucf24'
    dataset = 'ucf24'
    is_train = True
    img_size = 224
    len_clip = 16
    trans_config = {
        'jitter': 0.2,
        'hue': 0.1,
        'saturation': 1.5,
        'exposure': 1.5
    }
    train_transform = Augmentation(
        img_size=img_size,
        jitter=trans_config['jitter'],
        saturation=trans_config['saturation'],
        exposure=trans_config['exposure']
    )
    val_transform = BaseTransform(img_size=img_size)

    train_dataset = UCF_JHMDB_Dataset(
        data_root=data_root,
        dataset=dataset,
        img_size=img_size,
        transform=train_transform,
        is_train=is_train,
        len_clip=len_clip,
        sampling_rate=1
    )

    print(len(train_dataset))
    for i in range(len(train_dataset)):
        frame_id, video_clip, target = train_dataset[i]
        key_frame = video_clip[:, -1, :, :]

        # to numpy
        key_frame = key_frame.permute(1, 2, 0).numpy()
        key_frame = key_frame.astype(np.uint8)

        # to BGR
        key_frame = key_frame[..., (2, 1, 0)]
        H, W, C = key_frame.shape

        key_frame = key_frame.copy()
        bboxes = target['boxes']
        labels = target['labels']

        for box, cls_id in zip(bboxes, labels):
            x1, y1, x2, y2 = box
            x1 = int(x1 * W)
            y1 = int(y1 * H)
            x2 = int(x2 * W)
            y2 = int(y2 * H)
            key_frame = cv2.rectangle(key_frame, (x1, y1), (x2, y2), (255, 0, 0))

        # cv2 show
        cv2.imshow('key frame', key_frame)
        cv2.waitKey(0)
evaluator/__init__.py
ADDED
File without changes
evaluator/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (133 Bytes).
evaluator/__pycache__/__init__.cpython-37.pyc
ADDED
Binary file (127 Bytes).
evaluator/__pycache__/ava_eval_helper.cpython-310.pyc
ADDED
Binary file (7.23 kB).
evaluator/__pycache__/ava_eval_helper.cpython-37.pyc
ADDED
Binary file (7.07 kB).
evaluator/__pycache__/ava_evaluator.cpython-310.pyc
ADDED
Binary file (6.75 kB).
evaluator/__pycache__/ava_evaluator.cpython-37.pyc
ADDED
Binary file (6.67 kB).
evaluator/__pycache__/cal_frame_mAP.cpython-310.pyc
ADDED
Binary file (26.6 kB).
evaluator/__pycache__/cal_frame_mAP.cpython-37.pyc
ADDED
Binary file (27.3 kB).
evaluator/__pycache__/cal_video_mAP.cpython-310.pyc
ADDED
Binary file (7.81 kB).
evaluator/__pycache__/cal_video_mAP.cpython-37.pyc
ADDED
Binary file (8.03 kB).
evaluator/__pycache__/ucf_jhmdb_evaluator.cpython-310.pyc
ADDED
Binary file (5.99 kB).
evaluator/__pycache__/ucf_jhmdb_evaluator.cpython-37.pyc
ADDED
Binary file (5.89 kB).
evaluator/__pycache__/utils.cpython-310.pyc
ADDED
Binary file (3.82 kB).
evaluator/__pycache__/utils.cpython-37.pyc
ADDED
Binary file (3.82 kB).