# Hydra/OmegaConf model configuration: every `_target_` names the class that
# is constructed from the sibling keys of the same mapping.
#
# NOTE(review): this file was recovered from a copy whose indentation and
# comments had been stripped. The nesting below is reconstructed from key
# order, duplicate-key constraints and the `_target_` of each sub-mapping;
# confirm it against the constructor signatures of the target classes.
# NOTE(review): the original leading comment lines were lost. If this config
# is composed from a Hydra config group, it may need a package header on the
# first line - TODO confirm against the loader.

model:
  _target_: sam2.modeling.sam2_base.SAM2Base

  # Image encoder: a Hiera trunk followed by an FPN neck.
  image_encoder:
    _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
    scalp: 1
    trunk:
      _target_: sam2.modeling.backbones.hieradet.Hiera
      embed_dim: 96
      num_heads: 1
      stages: [1, 2, 7, 2]
      global_att_blocks: [5, 7, 9]
      window_pos_embed_bkg_spatial_size: [7, 7]
    neck:
      _target_: sam2.modeling.backbones.image_encoder.FpnNeck
      position_encoding:
        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
        num_pos_feats: 256
        normalize: true
        scale: null
        temperature: 10000
      d_model: 256
      backbone_channel_list: [768, 384, 192, 96]
      fpn_top_down_levels: [2, 3]
      fpn_interp_model: nearest

  # Memory attention: `num_layers` copies of `layer`, each holding one
  # self-attention and one cross-attention module.
  memory_attention:
    _target_: sam2.modeling.memory_attention.MemoryAttention
    d_model: 256
    pos_enc_at_input: true
    layer:
      _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
      activation: relu
      dim_feedforward: 2048
      dropout: 0.1
      pos_enc_at_attn: false
      self_attention:
        _target_: sam2.modeling.sam.transformer.RoPEAttention
        rope_theta: 10000.0
        feat_sizes: [32, 32]
        embedding_dim: 256
        num_heads: 1
        downsample_rate: 1
        dropout: 0.1
      d_model: 256
      pos_enc_at_cross_attn_keys: true
      pos_enc_at_cross_attn_queries: false
      cross_attention:
        _target_: sam2.modeling.sam.transformer.RoPEAttention
        rope_theta: 10000.0
        feat_sizes: [32, 32]
        rope_k_repeat: true
        embedding_dim: 256
        num_heads: 1
        downsample_rate: 1
        dropout: 0.1
        # Same value as memory_encoder.out_dim below; presumably the two must
        # stay in sync - TODO confirm.
        kv_in_dim: 64
    num_layers: 4

  # Memory encoder: mask downsampler + fuser, with its own position encoding.
  memory_encoder:
    _target_: sam2.modeling.memory_encoder.MemoryEncoder
    out_dim: 64
    position_encoding:
      _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
      num_pos_feats: 64
      normalize: true
      scale: null
      temperature: 10000
    mask_downsampler:
      _target_: sam2.modeling.memory_encoder.MaskDownSampler
      kernel_size: 3
      stride: 2
      padding: 1
    fuser:
      _target_: sam2.modeling.memory_encoder.Fuser
      layer:
        _target_: sam2.modeling.memory_encoder.CXBlock
        dim: 256
        kernel_size: 7
        padding: 3
        # Written with an explicit mantissa dot so YAML 1.1 loaders (e.g.
        # plain PyYAML) resolve it as a float rather than a string.
        layer_scale_init_value: 1.0e-6
        use_dwconv: true
      num_layers: 2

  # Remaining keys are passed directly to the top-level `_target_` (SAM2Base).
  num_maskmem: 7
  image_size: 1024

  sigmoid_scale_for_mem_enc: 20.0
  sigmoid_bias_for_mem_enc: -10.0
  use_mask_input_as_output_without_sam: true

  directly_add_no_mem_embed: true

  use_high_res_features_in_sam: true

  multimask_output_in_sam: true

  iou_prediction_use_sigmoid: true

  use_obj_ptrs_in_encoder: true
  add_tpos_enc_to_obj_ptrs: false
  only_obj_ptrs_in_the_past_for_eval: true

  pred_obj_scores: true
  pred_obj_scores_mlp: true
  fixed_no_obj_ptr: true

  multimask_output_for_tracking: true
  use_multimask_token_for_obj_ptr: true
  multimask_min_pt_num: 0
  multimask_max_pt_num: 1
  use_mlp_for_obj_ptr_proj: true

  compile_image_encoder: false