{
    "force": true,
    "dump_path": "./serialization_dir/distilbert",
    "data_file": "data/binarized_text.bert.pickle",
    "student_type": "distilbert",
    "student_config": "training_configs/distilbert-base-uncased.json",
    "student_pretrained_weights": "./serialization_dir/tf_bert-base-uncased_0247911.pth",
    "teacher_type": "bert",
    "teacher_name": "bert-base-uncased",
    "temperature": 2.0,
    "alpha_ce": 5.0,
    "alpha_mlm": 2.0,
    "alpha_clm": 0.0,
    "alpha_mse": 0.0,
    "alpha_cos": 1.0,
    "alpha_act": 1.0,
    "mlm": true,
    "mlm_mask_prop": 0.15,
    "word_mask": 0.8,
    "word_keep": 0.1,
    "word_rand": 0.1,
    "mlm_smoothing": 0.7,
    "token_counts": "data/token_counts.bert.pickle",
    "restrict_ce_to_mask": false,
    "freeze_pos_embs": true,
    "freeze_token_type_embds": false,
    "n_epoch": 50,
    "batch_size": 5,
    "group_by_size": true,
    "gradient_accumulation_steps": 50,
    "warmup_prop": 0.05,
    "weight_decay": 0.0,
    "learning_rate": 0.0005,
    "adam_epsilon": 1e-06,
    "max_grad_norm": 5.0,
    "initializer_range": 0.02,
    "fp16": false,
    "fp16_opt_level": "O1",
    "n_gpu": 1,
    "local_rank": 0,
    "seed": 56,
    "log_interval": 500,
    "checkpoint_interval": 4000,
    "n_nodes": 1,
    "node_id": 0,
    "global_rank": 0,
    "world_size": 1,
    "n_gpu_per_node": 1,
    "multi_gpu": false,
    "is_master": true,
    "multi_node": false
}
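
A parameter dump like this can be read back into an attribute-style args object for inspection or to reproduce a run. The sketch below is only an illustration, not part of the training script: the file name `parameters.json` and the `load_params` helper are hypothetical, and it assumes the dump is valid JSON as shown above.

```python
import json
from types import SimpleNamespace


def load_params(path: str) -> SimpleNamespace:
    """Load a JSON parameter dump and expose its keys as attributes (args.batch_size, ...)."""
    with open(path, "r", encoding="utf-8") as f:
        params = json.load(f)
    return SimpleNamespace(**params)


# Hypothetical usage: read the dump above and compute the effective batch size per optimizer step.
args = load_params("./serialization_dir/distilbert/parameters.json")
effective_batch = args.batch_size * args.gradient_accumulation_steps * args.world_size
print(f"effective batch size per optimizer step: {effective_batch}")  # 5 * 50 * 1 = 250
```

With this config, each optimizer step accumulates gradients over 50 micro-batches of 5 sequences on a single GPU, so the effective batch size is 250 sequences per update.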