# Base config this file builds on. The keys below are meant to override it;
# how the path is resolved is up to the config loader (presumably relative
# to this file -- confirm).
_BASE_: config.yaml
# Model definition: meta-architecture, Swin backbone, input normalization and
# the segmentation head.
# NOTE(review): nesting was reconstructed after the indentation was lost
# (every line had a stray trailing "|"); verify key placement against the
# base config schema.
MODEL:
  META_ARCHITECTURE: "CATSeg"
  BACKBONE:
    FREEZE_AT: 0
    NAME: "D2SwinTransformer"
  # Swin settings; three stages are configured (DEPTHS / NUM_HEADS /
  # OUT_FEATURES each list three entries).
  SWIN:
    EMBED_DIM: 128
    DEPTHS: [2, 2, 18]
    NUM_HEADS: [4, 8, 16]
    WINDOW_SIZE: 12
    APE: False
    DROP_PATH_RATE: 0.3
    PATCH_NORM: True
    PRETRAIN_IMG_SIZE: 384
    OUT_FEATURES: ["res2", "res3", "res4"]
  # Checkpoint file name; presumably the pretrained backbone weights.
  WEIGHTS: "swin_base_patch4_window12_384_22k.pkl"
  # Per-channel normalization constants (three values each).
  PIXEL_MEAN: [123.675, 116.280, 103.530]
  PIXEL_STD: [58.395, 57.120, 57.375]
  SEM_SEG_HEAD:
    NAME: "CATSegHead"
    # Same feature names as SWIN.OUT_FEATURES above.
    IN_FEATURES: ["res2", "res3", "res4"]
    IGNORE_VALUE: 255
    NUM_CLASSES: 171
    # Train and test read the same class-list file.
    TRAIN_CLASS_JSON: "datasets/coco.json"
    TEST_CLASS_JSON: "datasets/coco.json"
    CLIP_PRETRAINED: "ViT-L/14@336px"
    PROMPT_DEPTH: 0
    PROMPT_LENGTH: 0
    TEXT_AFFINITY_DIM: 768
    TEXT_AFFINITY_PROJ_DIM: 128
    APPEARANCE_AFFINITY_DIM: 512
    APPEARANCE_AFFINITY_PROJ_DIM: 128
    DECODER_DIMS: [64, 32]
    DECODER_AFFINITY_DIMS: [256, 128]
    DECODER_AFFINITY_PROJ_DIMS: [32, 16]
    NUM_LAYERS: 2
    NUM_HEADS: 4
    HIDDEN_DIMS: 128
    POOLING_SIZES: [2, 2]
    FEATURE_RESOLUTION: [24, 24]
    WINDOW_SIZES: 12
    ATTENTION_TYPE: "linear"
    CLIP_FINETUNE: "attention"
  # NOTE(review): placed directly under MODEL (not SEM_SEG_HEAD) on the
  # assumption that the schema defines MODEL.PROMPT_ENSEMBLE_TYPE -- confirm.
  PROMPT_ENSEMBLE_TYPE: "imagenet"
# Input pipeline: resize, crop and dataset-mapper selection.
# Parenthesised values such as (384, ) are kept verbatim; plain YAML reads
# them as strings, so the config loader is presumably what turns them into
# tuples -- confirm before reformatting them.
INPUT:
  MIN_SIZE_TRAIN: (384, )
  MIN_SIZE_TRAIN_SAMPLING: "choice"
  MIN_SIZE_TEST: 640
  CROP:
    ENABLED: True
    TYPE: "absolute"
    SIZE: (384, 384)
  # NOTE(review): the three keys below are assumed to belong to INPUT rather
  # than INPUT.CROP (the original indentation was lost) -- confirm.
  SIZE_DIVISIBILITY: 384
  FORMAT: "RGB"
  DATASET_MAPPER_NAME: "mask_former_semantic"
# Optimization schedule and learning-rate settings.
SOLVER:
  IMS_PER_BATCH: 4
  LR_SCHEDULER_NAME: WarmupCosineLR
  BASE_LR: 0.0002
  MAX_ITER: 80000
  # Multipliers are presumably applied to BASE_LR for the backbone and CLIP
  # parameter groups; 0.0 would then mean the backbone is not updated --
  # confirm against the optimizer construction code.
  BACKBONE_MULTIPLIER: 0.0
  CLIP_MULTIPLIER: 0.01
# Evaluation settings; the period is presumably counted in training
# iterations (MAX_ITER is 80000) -- confirm.
TEST:
  EVAL_PERIOD: 5000
|