# Training configuration object.
# BATCH_SIZE is not defined here; it must already be in scope when this runs.
training_args = TrainingArguments(
    output_dir='bart-base-newsela-biendata-with-domain-adaptation',
    num_train_epochs=20,
    warmup_steps=250,
    per_device_train_batch_size=BATCH_SIZE,
    weight_decay=0.01,
    learning_rate=2e-4,
    fp16=True,
    optim="adafactor",
)

# Console output recorded from a training run, kept as comments so this file
# stays valid Python. Presumably produced with the arguments above (the
# reported 'epoch': 20.0 matches num_train_epochs=20) -- confirm.
#
#   Step    Training Loss
#   500     599.802100
#   1000    367.138000
#   1500    8.558800
#   2000    2.705000
#
#   TrainOutput(global_step=2320, training_loss=211.1473583352977,
#               metrics={'train_runtime': 976.5416,
#                        'train_samples_per_second': 303.807,
#                        'train_steps_per_second': 2.376,
#                        'total_flos': 0.0,
#                        'train_loss': 211.1473583352977,
#                        'epoch': 20.0})
#
# NOTE(review): the losses logged at steps 500 and 1000 are orders of
# magnitude larger than those at steps 1500 and 2000. With fp16=True set,
# this may indicate numerical instability early in training -- unconfirmed;
# verify before relying on the aggregate training_loss value.