{
  "dataset_name": "all_plant_genomes.upper.n.filtered.6_mers.1024_tok_len.overlap_50.32_folds.p0",
  "k_for_kmers": 6,
  "num_warmup_updates": 64000,
  "warmup_init_lr": 5e-05,
  "warmup_end_lr": 0.0001,
  "training_set_proportion": 0.95,
  "tokens_per_effective_batch": 1500000.0,
  "masking_ratio": 0.15,
  "masking_prob": 0.8,
  "mini_batch_size": 9,
  "num_tokens_per_checkpoint": "10000000000",
  "mixed_precision": true,
  "overlap": 50,
  "use_shift_dataset": true,
  "total_num_steps": 300000000,
  "grad_clipping": null,
  "alphabet_size": 4105,
  "pad_token_id": 1,
  "mask_token_id": 2,
  "class_token_id": 3,
  "eos_token_id": -1,
  "prepend_cls_token": true,
  "append_eos_token": false,
  "max_positions": 1024,
  "emb_layer_norm_before": false,
  "learned_positional_embedding": true,
  "roberta_lm_head": true,
  "add_bias_kv": false,
  "embed_dim": 1500,
  "attention_heads": 20,
  "key_dim": null,
  "ffn_embed_dim": 5120,
  "num_layers": 40,
  "token_dropout": false,
  "embed_scale": 1.0,
  "use_gradient_checkpointing": false,
  "architecture": "Transformer classic",
  "acc_batch_size_per_host": 2,
  "num_data_parallelism_ways": 128,
  "tokens_length": 1024,
  "mixed-precision": true,
  "model_num_parameters": "991 260 405",
  "model num parameters (in B)": 0.99,
  "num shards": 1,
  "num_shards": 1
}