# x3d / config.yml
# zhong-al
# Add model + config files
# 2c26ac8
# AUG section. Child keys are nested two spaces under AUG so they no longer
# collide with same-named keys of other sections (e.g. ENABLE).
# ENABLE is false in this file.
# NOTE(review): the remaining values presumably only take effect when ENABLE
# is true — confirm against the consumer.
AUG:
  AA_TYPE: rand-m9-mstd0.5-inc1
  COLOR_JITTER: 0.4
  ENABLE: false
  GEN_MASK_LOADER: false
  INTERPOLATION: bicubic
  MASK_FRAMES: false
  MASK_RATIO: 0.0
  MASK_TUBE: false
  # Three-element sequence, written flush with its parent key.
  MASK_WINDOW_SIZE:
  - 8
  - 7
  - 7
  MAX_MASK_PATCHES_PER_BLOCK: null
  NUM_SAMPLE: 1
  RE_COUNT: 1
  RE_MODE: pixel
  RE_PROB: 0.25
  RE_SPLIT: false
# AVA section. Child keys are nested two spaces under AVA.
# NOTE(review): ANNOTATION_DIR, FRAME_DIR and FRAME_LIST_DIR are absolute,
# machine-specific paths (/mnt/...); the rest of this file uses relative
# kabr/KABR paths. Presumably this section is unused for this model — confirm.
AVA:
  ANNOTATION_DIR: /mnt/vol/gfsai-flash3-east/ai-group/users/haoqifan/ava/frame_list/
  BGR: false
  DETECTION_SCORE_THRESH: 0.9
  EXCLUSION_FILE: ava_val_excluded_timestamps_v2.2.csv
  FRAME_DIR: /mnt/fair-flash3-east/ava_trainval_frames.img/
  FRAME_LIST_DIR: /mnt/vol/gfsai-flash3-east/ai-group/users/haoqifan/ava/frame_list/
  FULL_TEST_ON_VAL: false
  GROUNDTRUTH_FILE: ava_val_v2.2.csv
  IMG_PROC_BACKEND: cv2
  LABEL_MAP_FILE: ava_action_list_v2.2_for_activitynet_2019.pbtxt
  TEST_FORCE_FLIP: false
  TEST_LISTS:
  - val.csv
  TEST_PREDICT_BOX_LISTS:
  - ava_val_predicted_boxes.csv
  TRAIN_GT_BOX_LISTS:
  - ava_train_v2.2.csv
  TRAIN_LISTS:
  - train.csv
  TRAIN_PCA_JITTER_ONLY: true
  TRAIN_PREDICT_BOX_LISTS: []
  TRAIN_USE_COLOR_AUGMENTATION: false
# BENCHMARK section. Child keys are nested two spaces under BENCHMARK.
# LOG_PERIOD here (100) is this section's own key and is distinct from the
# top-level LOG_PERIOD that appears elsewhere in the file.
BENCHMARK:
  LOG_PERIOD: 100
  NUM_EPOCHS: 5
  SHUFFLE: true
# BN section. Child keys are nested two spaces under BN.
# NORM_TYPE is sync_batchnorm while NUM_SYNC_DEVICES is 1 and GLOBAL_SYNC is
# false. NOTE(review): confirm this combination is intended.
# WEIGHT_DECAY here (0.0) is this section's own key, separate from
# SOLVER.WEIGHT_DECAY.
BN:
  GLOBAL_SYNC: false
  NORM_TYPE: sync_batchnorm
  NUM_BATCHES_PRECISE: 200
  NUM_SPLITS: 1
  NUM_SYNC_DEVICES: 1
  USE_PRECISE_STATS: true
  WEIGHT_DECAY: 0.0
# CONTRASTIVE section. Child keys are nested two spaces under CONTRASTIVE.
# DELTA_CLIPS_MAX / DELTA_CLIPS_MIN are the YAML float infinities (.inf and
# -.inf) and must stay unquoted to remain floats.
# The key SWAV_QEUE_LEN is kept exactly as spelled; NOTE(review): the
# spelling presumably matches the consumer's schema — do not "fix" it here.
CONTRASTIVE:
  BN_MLP: false
  BN_SYNC_MLP: false
  DELTA_CLIPS_MAX: .inf
  DELTA_CLIPS_MIN: -.inf
  DIM: 128
  INTERP_MEMORY: false
  KNN_ON: true
  LENGTH: 239975
  LOCAL_SHUFFLE_BN: true
  MEM_TYPE: 1d
  MLP_DIM: 2048
  MOCO_MULTI_VIEW_QUEUE: false
  MOMENTUM: 0.5
  MOMENTUM_ANNEALING: false
  NUM_CLASSES_DOWNSTREAM: 400
  NUM_MLP_LAYERS: 1
  PREDICTOR_DEPTHS: []
  QUEUE_LEN: 65536
  SEQUENTIAL: false
  SIMCLR_DIST_ON: true
  SWAV_QEUE_LEN: 0
  T: 0.07
  TYPE: mem
# DATA section. Child keys are nested two spaces under DATA; sequences are
# written flush with their parent key.
# PATH_PREFIX and PATH_TO_DATA_DIR are relative paths under kabr/KABR.
# TRAIN_CROP_SIZE and TEST_CROP_SIZE are both 300, and TRAIN_JITTER_SCALES is
# the pair 300 / 400.
DATA:
  COLOR_RND_GRAYSCALE: 0.0
  DECODING_BACKEND: torchvision
  DECODING_SHORT_SIZE: 256
  DUMMY_LOAD: false
  ENSEMBLE_METHOD: max
  IN22K_TRAINVAL: false
  IN22k_VAL_IN1K: ''
  INPUT_CHANNEL_NUM:
  - 3
  INV_UNIFORM_SAMPLE: true
  IN_VAL_CROP_RATIO: 0.875
  LOADER_CHUNK_OVERALL_SIZE: 0
  LOADER_CHUNK_SIZE: 0
  MEAN:
  - 0.45
  - 0.45
  - 0.45
  MULTI_LABEL: true
  NUM_FRAMES: 16
  # A single space character; the quotes are required to keep it.
  PATH_LABEL_SEPARATOR: ' '
  PATH_PREFIX: kabr/KABR/dataset/image
  PATH_TO_DATA_DIR: kabr/KABR/annotation
  PATH_TO_PRELOAD_IMDB: ''
  RANDOM_FLIP: true
  REVERSE_INPUT_CHANNEL: true
  SAMPLING_RATE: 5
  SKIP_ROWS: 0
  SSL_BLUR_SIGMA_MAX:
  - 0.0
  - 2.0
  SSL_BLUR_SIGMA_MIN:
  - 0.0
  - 0.1
  SSL_COLOR_BRI_CON_SAT:
  - 0.2
  - 0.2
  - 0.2
  SSL_COLOR_HUE: 0.1
  SSL_COLOR_JITTER: true
  SSL_MOCOV2_AUG: false
  STD:
  - 0.225
  - 0.225
  - 0.225
  TARGET_FPS: 30
  TEST_CROP_SIZE: 300
  TIME_DIFF_PROB: 0.0
  TRAIN_CROP_NUM_SPATIAL: 1
  TRAIN_CROP_NUM_TEMPORAL: 1
  TRAIN_CROP_SIZE: 300
  TRAIN_JITTER_ASPECT_RELATIVE: []
  TRAIN_JITTER_FPS: 0.0
  TRAIN_JITTER_MOTION_SHIFT: false
  TRAIN_JITTER_SCALES:
  - 300
  - 400
  TRAIN_JITTER_SCALES_RELATIVE: []
  TRAIN_PCA_EIGVAL:
  - 0.225
  - 0.224
  - 0.229
  # Sequence of three rows, each a sequence of three numbers.
  TRAIN_PCA_EIGVEC:
  - - -0.5675
    - 0.7192
    - 0.4009
  - - -0.5808
    - -0.0045
    - -0.814
  - - -0.5836
    - -0.6948
    - 0.4203
  USE_OFFSET_SAMPLING: false
# DATA_LOADER section. Child keys are nested two spaces under DATA_LOADER.
DATA_LOADER:
  ENABLE_MULTI_THREAD_DECODE: false
  NUM_WORKERS: 8
  PIN_MEMORY: true
# DEMO section. Child keys are nested two spaces under DEMO.
# ENABLE is false in this file. INPUT_VIDEO, LABEL_FILE_PATH and OUTPUT_FILE
# are relative paths under kabr/KABR.
# NOTE(review): COMMON_CLASS_NAMES lists human-action labels ("watch (a
# person)", ...) while LABEL_FILE_PATH points at the KABR classes file —
# presumably leftover values; confirm whether they matter when DEMO is enabled.
DEMO:
  BUFFER_SIZE: 0
  CLIP_VIS_SIZE: 10
  COMMON_CLASS_NAMES:
  - watch (a person)
  - talk to (e.g., self, a person, a group)
  - listen to (a person)
  - touch (an object)
  - carry/hold (an object)
  - walk
  - sit
  - lie/sleep
  - bend/bow (at the waist)
  COMMON_CLASS_THRES: 0.7
  DETECTRON2_CFG: COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml
  DETECTRON2_THRESH: 0.9
  DETECTRON2_WEIGHTS: detectron2://COCO-Detection/faster_rcnn_R_50_FPN_3x/137849458/model_final_280758.pkl
  DISPLAY_HEIGHT: 0
  DISPLAY_WIDTH: 0
  ENABLE: false
  FPS: 30
  GT_BOXES: ''
  INPUT_FORMAT: BGR
  INPUT_VIDEO: kabr/KABR/dataset/video/G0103.mp4
  LABEL_FILE_PATH: kabr/KABR/annotation/classes.json
  NUM_CLIPS_SKIP: 1
  NUM_VIS_INSTANCES: 1
  OUTPUT_FILE: kabr/KABR/dataset/predict/G0103.mp4
  OUTPUT_FPS: -1
  PREDS_BOXES: ''
  SLOWMO: 1
  STARTING_SECOND: 900
  THREAD_ENABLE: false
  UNCOMMON_CLASS_THRES: 0.3
  VIS_MODE: thres
  WEBCAM: -1
# DETECTION section. Child keys are nested two spaces under DETECTION.
# ENABLE is false in this file.
DETECTION:
  ALIGNED: true
  ENABLE: false
  ROI_XFORM_RESOLUTION: 7
  SPATIAL_SCALE_FACTOR: 16
# Top-level scalar keys (they carry no child keys of their own).
# LOG_PERIOD here is 10; the BENCHMARK section has a separate LOG_PERIOD (100).
DIST_BACKEND: nccl
LOG_MODEL_INFO: true
LOG_PERIOD: 10
# MASK section. Child keys are nested two spaces under MASK.
# ENABLE and MAE_ON are both false in this file.
MASK:
  DECODER_DEPTH: 0
  DECODER_EMBED_DIM: 512
  DECODER_SEP_POS_EMBED: false
  DEC_KV_KERNEL: []
  DEC_KV_STRIDE: []
  ENABLE: false
  HEAD_TYPE: separate
  MAE_ON: false
  MAE_RND_MASK: false
  NORM_PRED_PIXEL: true
  PER_FRAME_MASKING: false
  PRED_HOG: false
  PRETRAIN_DEPTH:
  - 15
  SCALE_INIT_BY_DEPTH: false
  TIME_STRIDE_LOSS: true
# MIXUP section. Child keys are nested two spaces under MIXUP.
# ENABLE is false in this file.
MIXUP:
  ALPHA: 0.8
  CUTMIX_ALPHA: 1.0
  ENABLE: false
  LABEL_SMOOTH_VALUE: 0.1
  PROB: 1.0
  SWITCH_PROB: 0.5
# MODEL section. Child keys are nested two spaces under MODEL.
# ARCH is x3d, which is one of the entries of SINGLE_PATHWAY_ARCH below.
# NUM_CLASSES is 8, HEAD_ACT is sigmoid and LOSS_FUNC is EQL.
# NOTE(review): EQL presumably names a loss registered by the consuming
# project — confirm it exists there.
MODEL:
  ACT_CHECKPOINT: false
  ARCH: x3d
  DETACH_FINAL_FC: false
  DROPCONNECT_RATE: 0.0
  DROPOUT_RATE: 0.5
  FC_INIT_STD: 0.01
  FP16_ALLREDUCE: false
  FROZEN_BN: false
  HEAD_ACT: sigmoid
  LOSS_FUNC: EQL
  MODEL_NAME: X3D
  MULTI_PATHWAY_ARCH:
  - slowfast
  NUM_CLASSES: 8
  SINGLE_PATHWAY_ARCH:
  - 2d
  - c2d
  - i3d
  - slow
  - x3d
  - mvit
  - maskmvit
# MULTIGRID section. Child keys are nested two spaces under MULTIGRID.
# LONG_CYCLE and SHORT_CYCLE are both false in this file.
MULTIGRID:
  BN_BASE_SIZE: 8
  DEFAULT_B: 0
  DEFAULT_S: 0
  DEFAULT_T: 0
  EPOCH_FACTOR: 1.5
  EVAL_FREQ: 3
  LONG_CYCLE: false
  # Sequence of four two-element pairs.
  LONG_CYCLE_FACTORS:
  - - 0.25
    - 0.7071067811865476
  - - 0.5
    - 0.7071067811865476
  - - 0.5
    - 1
  - - 1
    - 1
  LONG_CYCLE_SAMPLING_RATE: 0
  SHORT_CYCLE: false
  SHORT_CYCLE_FACTORS:
  - 0.5
  - 0.7071067811865476
# MVIT section. Child keys are nested two spaces under MVIT, and REV is a
# nested mapping one level deeper.
# NOTE(review): the extent of REV (BUFFER_LAYERS through RES_PATH) was
# reconstructed from the alphabetical key order; verify against the
# consumer's schema.
MVIT:
  CLS_EMBED_ON: true
  DEPTH: 16
  DIM_MUL: []
  DIM_MUL_IN_ATT: false
  DROPOUT_RATE: 0.0
  DROPPATH_RATE: 0.1
  EMBED_DIM: 96
  HEAD_INIT_SCALE: 1.0
  HEAD_MUL: []
  LAYER_SCALE_INIT_VALUE: 0.0
  MLP_RATIO: 4.0
  MODE: conv
  NORM: layernorm
  NORM_STEM: false
  NUM_HEADS: 1
  PATCH_2D: false
  PATCH_KERNEL:
  - 3
  - 7
  - 7
  PATCH_PADDING:
  - 2
  - 4
  - 4
  PATCH_STRIDE:
  - 2
  - 4
  - 4
  POOL_FIRST: false
  POOL_KVQ_KERNEL: null
  POOL_KV_STRIDE: []
  POOL_KV_STRIDE_ADAPTIVE: null
  POOL_Q_STRIDE: []
  QKV_BIAS: true
  REL_POS_SPATIAL: false
  REL_POS_TEMPORAL: false
  REL_POS_ZERO_INIT: false
  RESIDUAL_POOLING: false
  REV:
    BUFFER_LAYERS: []
    ENABLE: false
    PRE_Q_FUSION: avg
    RESPATH_FUSE: concat
    RES_PATH: conv
  SEPARATE_QKV: false
  SEP_POS_EMBED: false
  USE_ABS_POS: true
  USE_FIXED_SINCOS_POS: false
  USE_MEAN_POOLING: false
  ZERO_DECAY_POS_CLS: true
# NONLOCAL section. Child keys are nested two spaces under NONLOCAL.
# The nested sequences are rebuilt from the "- -" / "- - -" row markers:
#   GROUP    -> four one-element sequences
#   LOCATION -> four sequences, each holding one empty sequence
#   POOL     -> four entries, each a pair of three-element sequences
NONLOCAL:
  GROUP:
  - - 1
  - - 1
  - - 1
  - - 1
  INSTANTIATION: dot_product
  LOCATION:
  - - []
  - - []
  - - []
  - - []
  POOL:
  - - - 1
      - 2
      - 2
    - - 1
      - 2
      - 2
  - - - 1
      - 2
      - 2
    - - 1
      - 2
      - 2
  - - - 1
      - 2
      - 2
    - - 1
      - 2
      - 2
  - - - 1
      - 2
      - 2
    - - 1
      - 2
      - 2
# Top-level scalar keys (they carry no child keys of their own).
# OUTPUT_DIR is a relative path; TEST.SAVE_RESULTS_PATH in this file points
# to results.txt inside this same directory.
NUM_GPUS: 8
NUM_SHARDS: 1
OUTPUT_DIR: kabr/KABR/logs/x3d-l-kabr
# RESNET section. Child keys are nested two spaces under RESNET.
# NUM_BLOCK_TEMP_KERNEL, SPATIAL_DILATIONS and SPATIAL_STRIDES are each a
# sequence of four one-element sequences. TRANS_FUNC is x3d_transform.
RESNET:
  DEPTH: 50
  INPLACE_RELU: true
  NUM_BLOCK_TEMP_KERNEL:
  - - 3
  - - 4
  - - 6
  - - 3
  NUM_GROUPS: 1
  SPATIAL_DILATIONS:
  - - 1
  - - 1
  - - 1
  - - 1
  SPATIAL_STRIDES:
  - - 1
  - - 2
  - - 2
  - - 2
  STRIDE_1X1: false
  TRANS_FUNC: x3d_transform
  WIDTH_PER_GROUP: 64
  ZERO_INIT_FINAL_BN: true
  ZERO_INIT_FINAL_CONV: false
# Top-level scalar keys; both are set to 0 in this file.
RNG_SEED: 0
SHARD_ID: 0
# SLOWFAST section. Child keys are nested two spaces under SLOWFAST.
# NOTE(review): MODEL.ARCH in this file is x3d, so these values are
# presumably unused — confirm.
SLOWFAST:
  ALPHA: 8
  BETA_INV: 8
  FUSION_CONV_CHANNEL_RATIO: 2
  FUSION_KERNEL_SZ: 5
# SOLVER section. Child keys are nested two spaces under SOLVER.
# OPTIMIZING_METHOD is sgd with MOMENTUM 0.9 and NESTEROV true; LR_POLICY is
# cosine with BASE_LR 0.05 over MAX_EPOCH 120.
# NOTE(review): WARMUP_EPOCHS (35.0) is roughly 29% of MAX_EPOCH (120) —
# confirm this ratio is intended.
SOLVER:
  BASE_LR: 0.05
  BASE_LR_SCALE_NUM_SHARDS: true
  BETAS:
  - 0.9
  - 0.999
  CLIP_GRAD_L2NORM: null
  CLIP_GRAD_VAL: null
  COSINE_AFTER_WARMUP: false
  COSINE_END_LR: 0.0
  DAMPENING: 0.0
  GAMMA: 0.1
  LARS_ON: false
  LAYER_DECAY: 1.0
  LRS: []
  LR_POLICY: cosine
  MAX_EPOCH: 120
  MOMENTUM: 0.9
  NESTEROV: true
  OPTIMIZING_METHOD: sgd
  STEPS: []
  STEP_SIZE: 1
  WARMUP_EPOCHS: 35.0
  WARMUP_FACTOR: 0.1
  WARMUP_START_LR: 0.01
  WEIGHT_DECAY: 5.0e-05
  ZERO_WD_1D_PARAM: false
# Top-level scalar key, set to the empty string in this file.
TASK: ''
# TENSORBOARD section. Three levels of nesting are restored:
#   TENSORBOARD -> CONFUSION_MATRIX / HISTOGRAM / MODEL_VIS / WRONG_PRED_VIS
#   MODEL_VIS   -> GRAD_CAM
# Nesting keeps the repeated ENABLE, FIGSIZE, SUBSET_PATH, COLORMAP and
# LAYER_LIST keys in separate mappings instead of colliding.
# NOTE(review): the sub-mapping boundaries were reconstructed from the
# alphabetical key order; verify against the consumer's schema.
# CLASS_NAMES_PATH points at classes.json while both SUBSET_PATH values point
# at classes.txt in the same directory.
TENSORBOARD:
  CATEGORIES_PATH: ''
  CLASS_NAMES_PATH: kabr/KABR/annotation/classes.json
  CONFUSION_MATRIX:
    ENABLE: true
    FIGSIZE:
    - 8
    - 8
    SUBSET_PATH: kabr/KABR/annotation/classes.txt
  ENABLE: true
  HISTOGRAM:
    ENABLE: true
    FIGSIZE:
    - 8
    - 8
    SUBSET_PATH: kabr/KABR/annotation/classes.txt
    TOPK: 3
  LOG_DIR: ''
  MODEL_VIS:
    ACTIVATIONS: true
    COLORMAP: Pastel2
    ENABLE: true
    GRAD_CAM:
      COLORMAP: viridis
      ENABLE: true
      LAYER_LIST:
      - s5/pathway0_res14
      USE_TRUE_LABEL: false
    INPUT_VIDEO: true
    LAYER_LIST:
    - s5/pathway0_res14
    MODEL_WEIGHTS: true
    TOPK_PREDS: 1
  PREDICTIONS_PATH: ''
  WRONG_PRED_VIS:
    ENABLE: false
    SUBSET_PATH: ''
    TAG: Incorrectly classified videos.
# TEST section. Child keys are nested two spaces under TEST.
# ENABLE is false and CHECKPOINT_FILE_PATH is empty in this file.
# NOTE(review): DATASET is charades while the DATA paths point under
# kabr/KABR — presumably the charades loader is reused for this data; confirm.
TEST:
  BATCH_SIZE: 64
  CHECKPOINT_FILE_PATH: ''
  CHECKPOINT_TYPE: pytorch
  DATASET: charades
  ENABLE: false
  NUM_ENSEMBLE_VIEWS: 2
  NUM_SPATIAL_CROPS: 1
  NUM_TEMPORAL_CLIPS: []
  SAVE_RESULTS_PATH: kabr/KABR/logs/x3d-l-kabr/results.txt
# TRAIN section. Child keys are nested two spaces under TRAIN.
# ENABLE is true. CHECKPOINT_FILE_PATH is a relative path to x3d_l.pyth and
# CHECKPOINT_EPOCH_RESET is true.
# NOTE(review): the checkpoint path is presumably resolved from the working
# directory — confirm.
# NOTE(review): DATASET is charades while the DATA paths point under
# kabr/KABR — presumably the charades loader is reused for this data; confirm.
TRAIN:
  AUTO_RESUME: true
  BATCH_SIZE: 64
  CHECKPOINT_CLEAR_NAME_PATTERN: []
  CHECKPOINT_EPOCH_RESET: true
  CHECKPOINT_FILE_PATH: slowfast/projects/x3d/x3d_l.pyth
  CHECKPOINT_INFLATE: false
  CHECKPOINT_IN_INIT: false
  CHECKPOINT_PERIOD: 5
  CHECKPOINT_TYPE: pytorch
  DATASET: charades
  ENABLE: true
  EVAL_PERIOD: 5
  KILL_LOSS_EXPLOSION_FACTOR: 0.0
  MIXED_PRECISION: false
# VIS_MASK section. Its single child key is nested two spaces under VIS_MASK.
VIS_MASK:
  ENABLE: false
# X3D section. Child keys are nested two spaces under X3D.
# MODEL.ARCH and MODEL.MODEL_NAME in this file select x3d / X3D, so these
# factors presumably parameterize the active model — confirm with the consumer.
X3D:
  BN_LIN5: false
  BOTTLENECK_FACTOR: 2.25
  CHANNELWISE_3x3x3: true
  DEPTH_FACTOR: 5.0
  DIM_C1: 12
  DIM_C5: 2048
  SCALE_RES2: false
  WIDTH_FACTOR: 2.0