experiment_name: 'runs/transformer_huge'

dataset:
  src_lang: 'lo'
  src_tokenizer: 'BPE'
  src_max_seq_len: 400
  tgt_lang: 'vi'
  tgt_tokenizer: 'WordLevel'
  tgt_max_seq_len: 350
  train_dataset: 'train_clean.dat'
  validate_dataset: 'dev_clean.dat'
  tokenizer_file: "tokenizer_{0}.json"
  bleu_dataset: 'test2023'

model:  # 61604879 parameters
  d_model: 512
  num_heads: 8
  d_ff: 2048
  dropout_p: 0.15
  num_encoder_layers: 8
  num_decoder_layers: 4
  model_folder: "weights"
  model_basename: "transformer_huge"
  preload: '_50'

train:
  lr: 0.001  # 1e-3
  batch_size: 32
  num_epochs: 50
  label_smoothing: 0.1
  on_colab: False  # are you training on Colab?
  patience: 1
  warm_up_steps: 200
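
# Usage sketch (an assumption, not the repo's actual loader): a file like this
# is typically saved as e.g. config.yaml and read with PyYAML, after which the
# nested sections are available as plain dicts.
#
#   import yaml
#   with open("config.yaml") as f:
#       cfg = yaml.safe_load(f)
#   print(cfg["model"]["d_model"])        # -> 512
#   print(cfg["dataset"]["src_lang"])     # -> 'lo'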