{ "architectures": [ "X3DModel" ], "auto_map": { "AutoConfig": "configuration_x3d.X3DConfig", "AutoModel": "modeling_x3d.X3DModel" }, "cfg": { "AUG": { "AA_TYPE": "rand-m9-mstd0.5-inc1", "COLOR_JITTER": 0.4, "ENABLE": false, "GEN_MASK_LOADER": false, "INTERPOLATION": "bicubic", "MASK_FRAMES": false, "MASK_RATIO": 0.0, "MASK_TUBE": false, "MASK_WINDOW_SIZE": [ 8, 7, 7 ], "MAX_MASK_PATCHES_PER_BLOCK": null, "NUM_SAMPLE": 1, "RE_COUNT": 1, "RE_MODE": "pixel", "RE_PROB": 0.25, "RE_SPLIT": false }, "AVA": { "ANNOTATION_DIR": "/mnt/vol/gfsai-flash3-east/ai-group/users/haoqifan/ava/frame_list/", "BGR": false, "DETECTION_SCORE_THRESH": 0.9, "EXCLUSION_FILE": "ava_val_excluded_timestamps_v2.2.csv", "FRAME_DIR": "/mnt/fair-flash3-east/ava_trainval_frames.img/", "FRAME_LIST_DIR": "/mnt/vol/gfsai-flash3-east/ai-group/users/haoqifan/ava/frame_list/", "FULL_TEST_ON_VAL": false, "GROUNDTRUTH_FILE": "ava_val_v2.2.csv", "IMG_PROC_BACKEND": "cv2", "LABEL_MAP_FILE": "ava_action_list_v2.2_for_activitynet_2019.pbtxt", "TEST_FORCE_FLIP": false, "TEST_LISTS": [ "val.csv" ], "TEST_PREDICT_BOX_LISTS": [ "ava_val_predicted_boxes.csv" ], "TRAIN_GT_BOX_LISTS": [ "ava_train_v2.2.csv" ], "TRAIN_LISTS": [ "train.csv" ], "TRAIN_PCA_JITTER_ONLY": true, "TRAIN_PREDICT_BOX_LISTS": [], "TRAIN_USE_COLOR_AUGMENTATION": false }, "BENCHMARK": { "LOG_PERIOD": 100, "NUM_EPOCHS": 5, "SHUFFLE": true }, "BN": { "GLOBAL_SYNC": false, "NORM_TYPE": "sync_batchnorm", "NUM_BATCHES_PRECISE": 200, "NUM_SPLITS": 1, "NUM_SYNC_DEVICES": 1, "USE_PRECISE_STATS": true, "WEIGHT_DECAY": 0.0 }, "CONTRASTIVE": { "BN_MLP": false, "BN_SYNC_MLP": false, "DELTA_CLIPS_MAX": 1e999, "DELTA_CLIPS_MIN": -1e999, "DIM": 128, "INTERP_MEMORY": false, "KNN_ON": true, "LENGTH": 239975, "LOCAL_SHUFFLE_BN": true, "MEM_TYPE": "1d", "MLP_DIM": 2048, "MOCO_MULTI_VIEW_QUEUE": false, "MOMENTUM": 0.5, "MOMENTUM_ANNEALING": false, "NUM_CLASSES_DOWNSTREAM": 400, "NUM_MLP_LAYERS": 1, "PREDICTOR_DEPTHS": [], "QUEUE_LEN": 65536, "SEQUENTIAL": 
false, "SIMCLR_DIST_ON": true, "SWAV_QEUE_LEN": 0, "T": 0.07, "TYPE": "mem" }, "DATA": { "COLOR_RND_GRAYSCALE": 0.0, "DECODING_BACKEND": "torchvision", "DECODING_SHORT_SIZE": 256, "DUMMY_LOAD": false, "ENSEMBLE_METHOD": "max", "IN22K_TRAINVAL": false, "IN22k_VAL_IN1K": "", "INPUT_CHANNEL_NUM": [ 3 ], "INV_UNIFORM_SAMPLE": true, "IN_VAL_CROP_RATIO": 0.875, "LOADER_CHUNK_OVERALL_SIZE": 0, "LOADER_CHUNK_SIZE": 0, "MEAN": [ 0.45, 0.45, 0.45 ], "MULTI_LABEL": true, "NUM_FRAMES": 16, "PATH_LABEL_SEPARATOR": " ", "PATH_PREFIX": "kabr/KABR/dataset/image", "PATH_TO_DATA_DIR": "kabr/KABR/annotation", "PATH_TO_PRELOAD_IMDB": "", "RANDOM_FLIP": true, "REVERSE_INPUT_CHANNEL": true, "SAMPLING_RATE": 5, "SKIP_ROWS": 0, "SSL_BLUR_SIGMA_MAX": [ 0.0, 2.0 ], "SSL_BLUR_SIGMA_MIN": [ 0.0, 0.1 ], "SSL_COLOR_BRI_CON_SAT": [ 0.2, 0.2, 0.2 ], "SSL_COLOR_HUE": 0.1, "SSL_COLOR_JITTER": true, "SSL_MOCOV2_AUG": false, "STD": [ 0.225, 0.225, 0.225 ], "TARGET_FPS": 30, "TEST_CROP_SIZE": 300, "TIME_DIFF_PROB": 0.0, "TRAIN_CROP_NUM_SPATIAL": 1, "TRAIN_CROP_NUM_TEMPORAL": 1, "TRAIN_CROP_SIZE": 300, "TRAIN_JITTER_ASPECT_RELATIVE": [], "TRAIN_JITTER_FPS": 0.0, "TRAIN_JITTER_MOTION_SHIFT": false, "TRAIN_JITTER_SCALES": [ 300, 400 ], "TRAIN_JITTER_SCALES_RELATIVE": [], "TRAIN_PCA_EIGVAL": [ 0.225, 0.224, 0.229 ], "TRAIN_PCA_EIGVEC": [ [ -0.5675, 0.7192, 0.4009 ], [ -0.5808, -0.0045, -0.814 ], [ -0.5836, -0.6948, 0.4203 ] ], "USE_OFFSET_SAMPLING": false }, "DATA_LOADER": { "ENABLE_MULTI_THREAD_DECODE": false, "NUM_WORKERS": 8, "PIN_MEMORY": true }, "DEMO": { "BUFFER_SIZE": 0, "CLIP_VIS_SIZE": 10, "COMMON_CLASS_NAMES": [ "watch (a person)", "talk to (e.g., self, a person, a group)", "listen to (a person)", "touch (an object)", "carry/hold (an object)", "walk", "sit", "lie/sleep", "bend/bow (at the waist)" ], "COMMON_CLASS_THRES": 0.7, "DETECTRON2_CFG": "COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml", "DETECTRON2_THRESH": 0.9, "DETECTRON2_WEIGHTS": 
"detectron2://COCO-Detection/faster_rcnn_R_50_FPN_3x/137849458/model_final_280758.pkl", "DISPLAY_HEIGHT": 0, "DISPLAY_WIDTH": 0, "ENABLE": false, "FPS": 30, "GT_BOXES": "", "INPUT_FORMAT": "BGR", "INPUT_VIDEO": "kabr/KABR/dataset/video/G0103.mp4", "LABEL_FILE_PATH": "kabr/KABR/annotation/classes.json", "NUM_CLIPS_SKIP": 1, "NUM_VIS_INSTANCES": 1, "OUTPUT_FILE": "kabr/KABR/dataset/predict/G0103.mp4", "OUTPUT_FPS": -1, "PREDS_BOXES": "", "SLOWMO": 1, "STARTING_SECOND": 900, "THREAD_ENABLE": false, "UNCOMMON_CLASS_THRES": 0.3, "VIS_MODE": "thres", "WEBCAM": -1 }, "DETECTION": { "ALIGNED": true, "ENABLE": false, "ROI_XFORM_RESOLUTION": 7, "SPATIAL_SCALE_FACTOR": 16 }, "DIST_BACKEND": "nccl", "LOG_MODEL_INFO": true, "LOG_PERIOD": 10, "MASK": { "DECODER_DEPTH": 0, "DECODER_EMBED_DIM": 512, "DECODER_SEP_POS_EMBED": false, "DEC_KV_KERNEL": [], "DEC_KV_STRIDE": [], "ENABLE": false, "HEAD_TYPE": "separate", "MAE_ON": false, "MAE_RND_MASK": false, "NORM_PRED_PIXEL": true, "PER_FRAME_MASKING": false, "PRED_HOG": false, "PRETRAIN_DEPTH": [ 15 ], "SCALE_INIT_BY_DEPTH": false, "TIME_STRIDE_LOSS": true }, "MIXUP": { "ALPHA": 0.8, "CUTMIX_ALPHA": 1.0, "ENABLE": false, "LABEL_SMOOTH_VALUE": 0.1, "PROB": 1.0, "SWITCH_PROB": 0.5 }, "MODEL": { "ACT_CHECKPOINT": false, "ARCH": "x3d", "DETACH_FINAL_FC": false, "DROPCONNECT_RATE": 0.0, "DROPOUT_RATE": 0.5, "FC_INIT_STD": 0.01, "FP16_ALLREDUCE": false, "FROZEN_BN": false, "HEAD_ACT": "sigmoid", "LOSS_FUNC": "EQL", "MODEL_NAME": "X3D", "MULTI_PATHWAY_ARCH": [ "slowfast" ], "NUM_CLASSES": 8, "SINGLE_PATHWAY_ARCH": [ "2d", "c2d", "i3d", "slow", "x3d", "mvit", "maskmvit" ] }, "MULTIGRID": { "BN_BASE_SIZE": 8, "DEFAULT_B": 0, "DEFAULT_S": 0, "DEFAULT_T": 0, "EPOCH_FACTOR": 1.5, "EVAL_FREQ": 3, "LONG_CYCLE": false, "LONG_CYCLE_FACTORS": [ [ 0.25, 0.7071067811865476 ], [ 0.5, 0.7071067811865476 ], [ 0.5, 1 ], [ 1, 1 ] ], "LONG_CYCLE_SAMPLING_RATE": 0, "SHORT_CYCLE": false, "SHORT_CYCLE_FACTORS": [ 0.5, 0.7071067811865476 ] }, "MVIT": { 
"CLS_EMBED_ON": true, "DEPTH": 16, "DIM_MUL": [], "DIM_MUL_IN_ATT": false, "DROPOUT_RATE": 0.0, "DROPPATH_RATE": 0.1, "EMBED_DIM": 96, "HEAD_INIT_SCALE": 1.0, "HEAD_MUL": [], "LAYER_SCALE_INIT_VALUE": 0.0, "MLP_RATIO": 4.0, "MODE": "conv", "NORM": "layernorm", "NORM_STEM": false, "NUM_HEADS": 1, "PATCH_2D": false, "PATCH_KERNEL": [ 3, 7, 7 ], "PATCH_PADDING": [ 2, 4, 4 ], "PATCH_STRIDE": [ 2, 4, 4 ], "POOL_FIRST": false, "POOL_KVQ_KERNEL": null, "POOL_KV_STRIDE": [], "POOL_KV_STRIDE_ADAPTIVE": null, "POOL_Q_STRIDE": [], "QKV_BIAS": true, "REL_POS_SPATIAL": false, "REL_POS_TEMPORAL": false, "REL_POS_ZERO_INIT": false, "RESIDUAL_POOLING": false, "REV": { "BUFFER_LAYERS": [], "ENABLE": false, "PRE_Q_FUSION": "avg", "RESPATH_FUSE": "concat", "RES_PATH": "conv" }, "SEPARATE_QKV": false, "SEP_POS_EMBED": false, "USE_ABS_POS": true, "USE_FIXED_SINCOS_POS": false, "USE_MEAN_POOLING": false, "ZERO_DECAY_POS_CLS": true }, "NONLOCAL": { "GROUP": [ [ 1 ], [ 1 ], [ 1 ], [ 1 ] ], "INSTANTIATION": "dot_product", "LOCATION": [ [ [] ], [ [] ], [ [] ], [ [] ] ], "POOL": [ [ [ 1, 2, 2 ], [ 1, 2, 2 ] ], [ [ 1, 2, 2 ], [ 1, 2, 2 ] ], [ [ 1, 2, 2 ], [ 1, 2, 2 ] ], [ [ 1, 2, 2 ], [ 1, 2, 2 ] ] ] }, "NUM_GPUS": 0, "NUM_SHARDS": 1, "OUTPUT_DIR": "kabr/KABR/logs/x3d-l-kabr", "RESNET": { "DEPTH": 50, "INPLACE_RELU": true, "NUM_BLOCK_TEMP_KERNEL": [ [ 3 ], [ 4 ], [ 6 ], [ 3 ] ], "NUM_GROUPS": 1, "SPATIAL_DILATIONS": [ [ 1 ], [ 1 ], [ 1 ], [ 1 ] ], "SPATIAL_STRIDES": [ [ 1 ], [ 2 ], [ 2 ], [ 2 ] ], "STRIDE_1X1": false, "TRANS_FUNC": "x3d_transform", "WIDTH_PER_GROUP": 64, "ZERO_INIT_FINAL_BN": true, "ZERO_INIT_FINAL_CONV": false }, "RNG_SEED": 0, "SHARD_ID": 0, "SLOWFAST": { "ALPHA": 8, "BETA_INV": 8, "FUSION_CONV_CHANNEL_RATIO": 2, "FUSION_KERNEL_SZ": 5 }, "SOLVER": { "BASE_LR": 0.05, "BASE_LR_SCALE_NUM_SHARDS": true, "BETAS": [ 0.9, 0.999 ], "CLIP_GRAD_L2NORM": null, "CLIP_GRAD_VAL": null, "COSINE_AFTER_WARMUP": false, "COSINE_END_LR": 0.0, "DAMPENING": 0.0, "GAMMA": 0.1, "LARS_ON": false, 
"LAYER_DECAY": 1.0, "LRS": [], "LR_POLICY": "cosine", "MAX_EPOCH": 120, "MOMENTUM": 0.9, "NESTEROV": true, "OPTIMIZING_METHOD": "sgd", "STEPS": [], "STEP_SIZE": 1, "WARMUP_EPOCHS": 35.0, "WARMUP_FACTOR": 0.1, "WARMUP_START_LR": 0.01, "WEIGHT_DECAY": 5e-05, "ZERO_WD_1D_PARAM": false }, "TASK": "", "TENSORBOARD": { "CATEGORIES_PATH": "", "CLASS_NAMES_PATH": "kabr/KABR/annotation/classes.json", "CONFUSION_MATRIX": { "ENABLE": true, "FIGSIZE": [ 8, 8 ], "SUBSET_PATH": "kabr/KABR/annotation/classes.txt" }, "ENABLE": true, "HISTOGRAM": { "ENABLE": true, "FIGSIZE": [ 8, 8 ], "SUBSET_PATH": "kabr/KABR/annotation/classes.txt", "TOPK": 3 }, "LOG_DIR": "", "MODEL_VIS": { "ACTIVATIONS": true, "COLORMAP": "Pastel2", "ENABLE": true, "GRAD_CAM": { "COLORMAP": "viridis", "ENABLE": true, "LAYER_LIST": [ "s5/pathway0_res14" ], "USE_TRUE_LABEL": false }, "INPUT_VIDEO": true, "LAYER_LIST": [ "s5/pathway0_res14" ], "MODEL_WEIGHTS": true, "TOPK_PREDS": 1 }, "PREDICTIONS_PATH": "", "WRONG_PRED_VIS": { "ENABLE": false, "SUBSET_PATH": "", "TAG": "Incorrectly classified videos." 
} }, "TEST": { "BATCH_SIZE": 64, "CHECKPOINT_FILE_PATH": "", "CHECKPOINT_TYPE": "pytorch", "DATASET": "charades", "ENABLE": false, "NUM_ENSEMBLE_VIEWS": 2, "NUM_SPATIAL_CROPS": 1, "NUM_TEMPORAL_CLIPS": [], "SAVE_RESULTS_PATH": "kabr/KABR/logs/x3d-l-kabr/results.txt" }, "TRAIN": { "AUTO_RESUME": true, "BATCH_SIZE": 64, "CHECKPOINT_CLEAR_NAME_PATTERN": [], "CHECKPOINT_EPOCH_RESET": true, "CHECKPOINT_FILE_PATH": "slowfast/projects/x3d/x3d_l.pyth", "CHECKPOINT_INFLATE": false, "CHECKPOINT_IN_INIT": false, "CHECKPOINT_PERIOD": 5, "CHECKPOINT_TYPE": "pytorch", "DATASET": "charades", "ENABLE": true, "EVAL_PERIOD": 5, "KILL_LOSS_EXPLOSION_FACTOR": 0.0, "MIXED_PRECISION": false }, "VIS_MASK": { "ENABLE": false }, "X3D": { "BN_LIN5": false, "BOTTLENECK_FACTOR": 2.25, "CHANNELWISE_3x3x3": true, "DEPTH_FACTOR": 5.0, "DIM_C1": 12, "DIM_C5": 2048, "SCALE_RES2": false, "WIDTH_FACTOR": 2.0 } }, "model_type": "x3d", "torch_dtype": "float32", "transformers_version": "4.46.0" }