{
"_name_or_path": "/iris/u/jwxie/workspace/releases/domain-agnostic-pretraining/examples/saved_models/physics_pretrained/higgs_guided_self_rand_select_masking_recon_small_noise_mask_self_random_mix-normalized-adamw_torch-lr1e-4-wd0.01-ws10000-masking_schedule_length0.25-mr0.2",
"architectures": [
"SMAForSSL"
],
"attention_dropout_prob": 0.0,
"cross_attention_widening_factor": 1,
"cross_eval_noising_args": null,
"cross_train_noising_args": [
[
"RandomlySelectedCrossAttentionMasking",
{
"exclude_seen_reconstruction": true,
"head_aggregation": "random_mix",
"mask_self": true,
"masking_ratio": 0.2,
"num_per_query": 3,
"select_initial_ratio": 1.0,
"varying_length": true
}
]
],
"decoder_attention_channels": 128,
"decoder_heads": 1,
"decoder_latent_channels": 128,
"decoder_type": "cross_attention",
"dense_use_bias": true,
"drop_path_rate": 0.0,
"embedded_channels": 128,
"encoder_cross_attention_channels": 128,
"encoder_type": "self_attention",
"final_project": true,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.0,
"initializer_range": 0.02,
"input_channels": 1,
"input_type": "continuous",
"latent_channels": 128,
"layer_norm_eps": 1e-12,
"layernorm_eps": 1e-12,
"loss_fn": "mse",
"max_position_embeddings": 28,
"model_type": "perceiver_sma",
"num_blocks": 1,
"num_cross_attention_heads": 8,
"num_discrete_tokens": 262,
"num_latents": 128,
"num_outputs": 2048,
"num_self_attends_per_block": 4,
"num_self_attention_heads": 8,
"output_channels": 262,
"pe_initializer_range": 0.02,
"post_decoder_layers": null,
"project_after_concat": true,
"qk_channels": 128,
"self_attention_widening_factor": 1,
"share_decoder_queries": true,
"share_embedding_weights": true,
"teacher_args": {
"auxiliary_loss_fn": "mse",
"auxiliary_loss_weight": 1.0,
"ema_args": {
"ema_decay_end": 0.0,
"ema_decay_start": 0.0
},
"eval_transform_args": [
[
"RandomlySelectedCrossAttentionMasking",
{
"exclude_seen_reconstruction": true,
"head_aggregation": "random_mix",
"mask_self": true,
"masking_ratio": 0.2,
"num_per_query": 3,
"select_initial_ratio": 1.0,
"varying_length": true
}
]
],
"mask_replace": 3,
"num_layer_target_avg": null,
"reconstruction_decoder_args": {
"num_heads": 8,
"num_outputs": 28,
"output_channels": 1,
"qk_channels": 128,
"query_num_channels": 128,
"share_decoder_queries": true,
"share_embedding_weights": true,
"use_query_residual": true,
"v_channels": 128
},
"reconstruction_loss_fn": "mse",
"reconstruction_loss_weight": 1.0,
"reconstruction_weighted_loss": false,
"target_normalization_fn": "layernorm",
"train_transform_args": null
},
"teacher_name": "ReconstructionTeacher",
"torch_dtype": "float32",
"transformers_version": "4.26.0.dev0",
"use_decoder": false,
"use_position_embeddings": true,
"use_query_residual": true,
"v_channels": 128
}
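
For reference, a minimal sketch of how this config could be inspected programmatically. It assumes the JSON above has been saved locally as `config.json` (hypothetical path) and uses only the Python standard library, since `perceiver_sma` / `SMAForSSL` are custom classes that would require the accompanying modeling code before `AutoConfig`/`AutoModel` could load them directly.

```python
import json

# Load a local copy of the config shown above (hypothetical path).
with open("config.json") as f:
    cfg = json.load(f)

print(cfg["model_type"])       # perceiver_sma
print(cfg["architectures"])    # ['SMAForSSL']
print(cfg["num_latents"], cfg["latent_channels"])  # 128 128

# Cross-attention masking transform applied during self-supervised pre-training:
# stored as a list of [transform_name, kwargs] pairs.
name, kwargs = cfg["cross_train_noising_args"][0]
print(name, kwargs["masking_ratio"])  # RandomlySelectedCrossAttentionMasking 0.2
```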