from transformers import PretrainedConfig


class PATHViTConfig(PretrainedConfig):
    """Configuration for the PATH ViT-B/16 backbone.

    Stores the hyper-parameters of a plain Vision Transformer with optional
    windowed attention, absolute/relative position embeddings, and
    task-specific parameter lists.
    """

    model_type = "vit-b16"

    def __init__(
        self,
        img_size=224,
        patch_size=16,
        in_chans=3,
        num_classes=80,
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4.0,
        qkv_bias=True,
        drop_path_rate=0.1,
        norm_layer=None,
        norm_layer_eps=1e-6,
        window=True,
        use_abs_pos_emb=True,
        interval=3,
        test_pos_mode=False,
        task_sp_list=(),
        neck_sp_list=(),
        learnable_pos=False,
        rel_pos_spatial=False,
        lms_checkpoint_train=False,
        prompt=None,
        pad_attn_mask=False,
        freeze_iters=0,
        act_layer="GELU",
        pre_ln=False,
        mask_input=False,
        ending_norm=True,
        round_padding=False,
        compat=False,
        use_cls_token=False,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Patch embedding and transformer geometry.
        self.img_size = img_size
        self.patch_size = patch_size
        self.in_chans = in_chans
        self.num_classes = num_classes
        self.embed_dim = embed_dim
        self.depth = depth
        self.num_heads = num_heads
        self.mlp_ratio = mlp_ratio
        self.qkv_bias = qkv_bias
        self.drop_path_rate = drop_path_rate

        # Normalization layer and its epsilon.
        self.norm_layer = norm_layer
        self.norm_layer_eps = norm_layer_eps

        # Attention, position embeddings, and training behaviour.
        self.window = window
        self.use_abs_pos_emb = use_abs_pos_emb
        self.interval = interval
        self.test_pos_mode = test_pos_mode
        self.task_sp_list = task_sp_list
        self.neck_sp_list = neck_sp_list
        self.learnable_pos = learnable_pos
        self.rel_pos_spatial = rel_pos_spatial
        self.lms_checkpoint_train = lms_checkpoint_train
        self.prompt = prompt
        self.pad_attn_mask = pad_attn_mask
        self.freeze_iters = freeze_iters
        self.act_layer = act_layer
        self.pre_ln = pre_ln
        self.mask_input = mask_input
        self.ending_norm = ending_norm
        self.round_padding = round_padding
        self.compat = compat
        self.use_cls_token = use_cls_token
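
# A minimal usage sketch, not part of the original module: it exercises the
# standard `PretrainedConfig` save/load round trip and registers this config
# with `AutoConfig` so the custom "vit-b16" model type can be resolved from a
# saved config.json. The output directory name is hypothetical.
if __name__ == "__main__":
    from transformers import AutoConfig

    # Make the custom model type known to the Auto* machinery.
    AutoConfig.register("vit-b16", PATHViTConfig)

    # Instantiate with a couple of non-default values and round-trip via disk.
    config = PATHViTConfig(drop_path_rate=0.2, use_cls_token=True)
    config.save_pretrained("path_vit_config")  # writes config.json
    reloaded = AutoConfig.from_pretrained("path_vit_config")
    assert isinstance(reloaded, PATHViTConfig)
    assert reloaded.drop_path_rate == 0.2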
|
|
|
|