JorgeDeC commited on
Commit
16a7fc9
1 Parent(s): 2e6ee6d

Upload folder using huggingface_hub

Browse files
Files changed (5) hide show
  1. adapter_model.safetensors +1 -1
  2. optimizer.pt +1 -1
  3. rng_state.pth +1 -1
  4. scheduler.pt +1 -1
  5. trainer_state.json +1403 -3
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:812c87547653da5378c7dfc955f034d82fdc8ee88380df984e010eb35a04e70b
3
  size 83946192
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f2df44592754fe7188dd677ae0bc32281f8bcfad5f4a6c037e3ae72d516e0f92
3
  size 83946192
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b6255ec5b843f16274c3a630ceda1999d9a6ff2db1125db8a13388a7d838f220
3
  size 168150290
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d4416518671bd3a5927dad1918e6feb19b8689187dec1647b573a84566f45c4
3
  size 168150290
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:44ab34006e4ff8e3ddb1b3e0970e22b7afa1b47af9f1338b5e8a38648238a8fe
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89b8115cb129c536e89826f236d6c4894075bda82a59ceefb56b1d1ccf0de0fb
3
  size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e97643a2c9855ef59e9a06836cda3285ce9299fbc4d864a30671a903ea3632d3
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:167f19cbea1c5998444d9bc539623c9f6f524cce3680072969e6130b6d4da06a
3
  size 1064
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.34152498329497366,
5
  "eval_steps": 500,
6
- "global_step": 2300,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -3234,6 +3234,1406 @@
3234
  "learning_rate": 0.00016652690605760775,
3235
  "loss": 0.9739,
3236
  "step": 2300
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3237
  }
3238
  ],
3239
  "logging_steps": 5,
@@ -3241,7 +4641,7 @@
3241
  "num_input_tokens_seen": 0,
3242
  "num_train_epochs": 1,
3243
  "save_steps": 100,
3244
- "total_flos": 3.2343958171460567e+18,
3245
  "train_batch_size": 2,
3246
  "trial_name": null,
3247
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.4900141064667013,
5
  "eval_steps": 500,
6
+ "global_step": 3300,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
3234
  "learning_rate": 0.00016652690605760775,
3235
  "loss": 0.9739,
3236
  "step": 2300
3237
+ },
3238
+ {
3239
+ "epoch": 0.34,
3240
+ "grad_norm": 0.333984375,
3241
+ "learning_rate": 0.00016633315764368818,
3242
+ "loss": 0.9428,
3243
+ "step": 2305
3244
+ },
3245
+ {
3246
+ "epoch": 0.34,
3247
+ "grad_norm": 0.341796875,
3248
+ "learning_rate": 0.0001661389635479332,
3249
+ "loss": 0.938,
3250
+ "step": 2310
3251
+ },
3252
+ {
3253
+ "epoch": 0.34,
3254
+ "grad_norm": 0.35546875,
3255
+ "learning_rate": 0.00016594432507510175,
3256
+ "loss": 0.9647,
3257
+ "step": 2315
3258
+ },
3259
+ {
3260
+ "epoch": 0.34,
3261
+ "grad_norm": 0.333984375,
3262
+ "learning_rate": 0.00016574924353293845,
3263
+ "loss": 0.9413,
3264
+ "step": 2320
3265
+ },
3266
+ {
3267
+ "epoch": 0.35,
3268
+ "grad_norm": 0.3515625,
3269
+ "learning_rate": 0.0001655537202321649,
3270
+ "loss": 0.9443,
3271
+ "step": 2325
3272
+ },
3273
+ {
3274
+ "epoch": 0.35,
3275
+ "grad_norm": 0.341796875,
3276
+ "learning_rate": 0.00016535775648647075,
3277
+ "loss": 0.9407,
3278
+ "step": 2330
3279
+ },
3280
+ {
3281
+ "epoch": 0.35,
3282
+ "grad_norm": 0.337890625,
3283
+ "learning_rate": 0.0001651613536125049,
3284
+ "loss": 0.9803,
3285
+ "step": 2335
3286
+ },
3287
+ {
3288
+ "epoch": 0.35,
3289
+ "grad_norm": 0.34375,
3290
+ "learning_rate": 0.0001649645129298668,
3291
+ "loss": 0.9349,
3292
+ "step": 2340
3293
+ },
3294
+ {
3295
+ "epoch": 0.35,
3296
+ "grad_norm": 0.328125,
3297
+ "learning_rate": 0.0001647672357610973,
3298
+ "loss": 0.9581,
3299
+ "step": 2345
3300
+ },
3301
+ {
3302
+ "epoch": 0.35,
3303
+ "grad_norm": 0.33984375,
3304
+ "learning_rate": 0.00016456952343167007,
3305
+ "loss": 0.9243,
3306
+ "step": 2350
3307
+ },
3308
+ {
3309
+ "epoch": 0.35,
3310
+ "grad_norm": 0.34765625,
3311
+ "learning_rate": 0.00016437137726998255,
3312
+ "loss": 0.9591,
3313
+ "step": 2355
3314
+ },
3315
+ {
3316
+ "epoch": 0.35,
3317
+ "grad_norm": 0.34375,
3318
+ "learning_rate": 0.00016417279860734692,
3319
+ "loss": 0.9286,
3320
+ "step": 2360
3321
+ },
3322
+ {
3323
+ "epoch": 0.35,
3324
+ "grad_norm": 0.345703125,
3325
+ "learning_rate": 0.00016397378877798134,
3326
+ "loss": 0.9695,
3327
+ "step": 2365
3328
+ },
3329
+ {
3330
+ "epoch": 0.35,
3331
+ "grad_norm": 0.3515625,
3332
+ "learning_rate": 0.0001637743491190009,
3333
+ "loss": 0.9696,
3334
+ "step": 2370
3335
+ },
3336
+ {
3337
+ "epoch": 0.35,
3338
+ "grad_norm": 0.3359375,
3339
+ "learning_rate": 0.00016357448097040867,
3340
+ "loss": 0.9465,
3341
+ "step": 2375
3342
+ },
3343
+ {
3344
+ "epoch": 0.35,
3345
+ "grad_norm": 0.333984375,
3346
+ "learning_rate": 0.00016337418567508665,
3347
+ "loss": 0.9405,
3348
+ "step": 2380
3349
+ },
3350
+ {
3351
+ "epoch": 0.35,
3352
+ "grad_norm": 0.337890625,
3353
+ "learning_rate": 0.00016317346457878675,
3354
+ "loss": 0.9151,
3355
+ "step": 2385
3356
+ },
3357
+ {
3358
+ "epoch": 0.35,
3359
+ "grad_norm": 0.361328125,
3360
+ "learning_rate": 0.0001629723190301218,
3361
+ "loss": 0.9319,
3362
+ "step": 2390
3363
+ },
3364
+ {
3365
+ "epoch": 0.36,
3366
+ "grad_norm": 0.345703125,
3367
+ "learning_rate": 0.00016277075038055634,
3368
+ "loss": 0.956,
3369
+ "step": 2395
3370
+ },
3371
+ {
3372
+ "epoch": 0.36,
3373
+ "grad_norm": 0.3359375,
3374
+ "learning_rate": 0.00016256875998439796,
3375
+ "loss": 0.9475,
3376
+ "step": 2400
3377
+ },
3378
+ {
3379
+ "epoch": 0.36,
3380
+ "grad_norm": 0.3359375,
3381
+ "learning_rate": 0.00016236634919878756,
3382
+ "loss": 0.9747,
3383
+ "step": 2405
3384
+ },
3385
+ {
3386
+ "epoch": 0.36,
3387
+ "grad_norm": 0.3359375,
3388
+ "learning_rate": 0.00016216351938369073,
3389
+ "loss": 0.9405,
3390
+ "step": 2410
3391
+ },
3392
+ {
3393
+ "epoch": 0.36,
3394
+ "grad_norm": 0.341796875,
3395
+ "learning_rate": 0.00016196027190188848,
3396
+ "loss": 0.9888,
3397
+ "step": 2415
3398
+ },
3399
+ {
3400
+ "epoch": 0.36,
3401
+ "grad_norm": 0.34765625,
3402
+ "learning_rate": 0.00016175660811896796,
3403
+ "loss": 0.9063,
3404
+ "step": 2420
3405
+ },
3406
+ {
3407
+ "epoch": 0.36,
3408
+ "grad_norm": 0.345703125,
3409
+ "learning_rate": 0.00016155252940331342,
3410
+ "loss": 0.9454,
3411
+ "step": 2425
3412
+ },
3413
+ {
3414
+ "epoch": 0.36,
3415
+ "grad_norm": 0.341796875,
3416
+ "learning_rate": 0.000161348037126097,
3417
+ "loss": 0.9783,
3418
+ "step": 2430
3419
+ },
3420
+ {
3421
+ "epoch": 0.36,
3422
+ "grad_norm": 0.34375,
3423
+ "learning_rate": 0.0001611431326612695,
3424
+ "loss": 0.9951,
3425
+ "step": 2435
3426
+ },
3427
+ {
3428
+ "epoch": 0.36,
3429
+ "grad_norm": 0.33984375,
3430
+ "learning_rate": 0.00016093781738555106,
3431
+ "loss": 0.9194,
3432
+ "step": 2440
3433
+ },
3434
+ {
3435
+ "epoch": 0.36,
3436
+ "grad_norm": 0.345703125,
3437
+ "learning_rate": 0.0001607320926784221,
3438
+ "loss": 0.9692,
3439
+ "step": 2445
3440
+ },
3441
+ {
3442
+ "epoch": 0.36,
3443
+ "grad_norm": 0.349609375,
3444
+ "learning_rate": 0.00016052595992211387,
3445
+ "loss": 0.8929,
3446
+ "step": 2450
3447
+ },
3448
+ {
3449
+ "epoch": 0.36,
3450
+ "grad_norm": 0.345703125,
3451
+ "learning_rate": 0.0001603194205015993,
3452
+ "loss": 0.9456,
3453
+ "step": 2455
3454
+ },
3455
+ {
3456
+ "epoch": 0.37,
3457
+ "grad_norm": 0.337890625,
3458
+ "learning_rate": 0.00016011247580458355,
3459
+ "loss": 0.956,
3460
+ "step": 2460
3461
+ },
3462
+ {
3463
+ "epoch": 0.37,
3464
+ "grad_norm": 0.341796875,
3465
+ "learning_rate": 0.00015990512722149482,
3466
+ "loss": 0.9113,
3467
+ "step": 2465
3468
+ },
3469
+ {
3470
+ "epoch": 0.37,
3471
+ "grad_norm": 0.333984375,
3472
+ "learning_rate": 0.00015969737614547494,
3473
+ "loss": 0.9445,
3474
+ "step": 2470
3475
+ },
3476
+ {
3477
+ "epoch": 0.37,
3478
+ "grad_norm": 0.357421875,
3479
+ "learning_rate": 0.00015948922397237007,
3480
+ "loss": 0.957,
3481
+ "step": 2475
3482
+ },
3483
+ {
3484
+ "epoch": 0.37,
3485
+ "grad_norm": 0.349609375,
3486
+ "learning_rate": 0.00015928067210072122,
3487
+ "loss": 0.924,
3488
+ "step": 2480
3489
+ },
3490
+ {
3491
+ "epoch": 0.37,
3492
+ "grad_norm": 0.35546875,
3493
+ "learning_rate": 0.0001590717219317549,
3494
+ "loss": 0.9558,
3495
+ "step": 2485
3496
+ },
3497
+ {
3498
+ "epoch": 0.37,
3499
+ "grad_norm": 0.33984375,
3500
+ "learning_rate": 0.00015886237486937378,
3501
+ "loss": 0.9632,
3502
+ "step": 2490
3503
+ },
3504
+ {
3505
+ "epoch": 0.37,
3506
+ "grad_norm": 0.375,
3507
+ "learning_rate": 0.00015865263232014715,
3508
+ "loss": 0.9477,
3509
+ "step": 2495
3510
+ },
3511
+ {
3512
+ "epoch": 0.37,
3513
+ "grad_norm": 0.341796875,
3514
+ "learning_rate": 0.0001584424956933015,
3515
+ "loss": 0.9443,
3516
+ "step": 2500
3517
+ },
3518
+ {
3519
+ "epoch": 0.37,
3520
+ "grad_norm": 0.333984375,
3521
+ "learning_rate": 0.0001582319664007111,
3522
+ "loss": 0.9316,
3523
+ "step": 2505
3524
+ },
3525
+ {
3526
+ "epoch": 0.37,
3527
+ "grad_norm": 0.3359375,
3528
+ "learning_rate": 0.00015802104585688851,
3529
+ "loss": 0.9205,
3530
+ "step": 2510
3531
+ },
3532
+ {
3533
+ "epoch": 0.37,
3534
+ "grad_norm": 0.333984375,
3535
+ "learning_rate": 0.00015780973547897494,
3536
+ "loss": 0.924,
3537
+ "step": 2515
3538
+ },
3539
+ {
3540
+ "epoch": 0.37,
3541
+ "grad_norm": 0.333984375,
3542
+ "learning_rate": 0.0001575980366867309,
3543
+ "loss": 0.9498,
3544
+ "step": 2520
3545
+ },
3546
+ {
3547
+ "epoch": 0.37,
3548
+ "grad_norm": 0.34765625,
3549
+ "learning_rate": 0.00015738595090252657,
3550
+ "loss": 0.9677,
3551
+ "step": 2525
3552
+ },
3553
+ {
3554
+ "epoch": 0.38,
3555
+ "grad_norm": 0.353515625,
3556
+ "learning_rate": 0.00015717347955133233,
3557
+ "loss": 0.9786,
3558
+ "step": 2530
3559
+ },
3560
+ {
3561
+ "epoch": 0.38,
3562
+ "grad_norm": 0.345703125,
3563
+ "learning_rate": 0.00015696062406070902,
3564
+ "loss": 0.9523,
3565
+ "step": 2535
3566
+ },
3567
+ {
3568
+ "epoch": 0.38,
3569
+ "grad_norm": 0.3359375,
3570
+ "learning_rate": 0.00015674738586079852,
3571
+ "loss": 0.9496,
3572
+ "step": 2540
3573
+ },
3574
+ {
3575
+ "epoch": 0.38,
3576
+ "grad_norm": 0.349609375,
3577
+ "learning_rate": 0.0001565337663843141,
3578
+ "loss": 0.9439,
3579
+ "step": 2545
3580
+ },
3581
+ {
3582
+ "epoch": 0.38,
3583
+ "grad_norm": 0.33203125,
3584
+ "learning_rate": 0.00015631976706653063,
3585
+ "loss": 0.9496,
3586
+ "step": 2550
3587
+ },
3588
+ {
3589
+ "epoch": 0.38,
3590
+ "grad_norm": 0.330078125,
3591
+ "learning_rate": 0.00015610538934527526,
3592
+ "loss": 0.9315,
3593
+ "step": 2555
3594
+ },
3595
+ {
3596
+ "epoch": 0.38,
3597
+ "grad_norm": 0.345703125,
3598
+ "learning_rate": 0.00015589063466091743,
3599
+ "loss": 0.9595,
3600
+ "step": 2560
3601
+ },
3602
+ {
3603
+ "epoch": 0.38,
3604
+ "grad_norm": 0.333984375,
3605
+ "learning_rate": 0.0001556755044563594,
3606
+ "loss": 0.9524,
3607
+ "step": 2565
3608
+ },
3609
+ {
3610
+ "epoch": 0.38,
3611
+ "grad_norm": 0.341796875,
3612
+ "learning_rate": 0.00015546000017702648,
3613
+ "loss": 0.9442,
3614
+ "step": 2570
3615
+ },
3616
+ {
3617
+ "epoch": 0.38,
3618
+ "grad_norm": 0.357421875,
3619
+ "learning_rate": 0.00015524412327085725,
3620
+ "loss": 0.9807,
3621
+ "step": 2575
3622
+ },
3623
+ {
3624
+ "epoch": 0.38,
3625
+ "grad_norm": 0.318359375,
3626
+ "learning_rate": 0.00015502787518829406,
3627
+ "loss": 0.9385,
3628
+ "step": 2580
3629
+ },
3630
+ {
3631
+ "epoch": 0.38,
3632
+ "grad_norm": 0.328125,
3633
+ "learning_rate": 0.00015481125738227305,
3634
+ "loss": 0.9618,
3635
+ "step": 2585
3636
+ },
3637
+ {
3638
+ "epoch": 0.38,
3639
+ "grad_norm": 0.3515625,
3640
+ "learning_rate": 0.00015459427130821442,
3641
+ "loss": 0.9443,
3642
+ "step": 2590
3643
+ },
3644
+ {
3645
+ "epoch": 0.39,
3646
+ "grad_norm": 0.35546875,
3647
+ "learning_rate": 0.0001543769184240128,
3648
+ "loss": 0.9428,
3649
+ "step": 2595
3650
+ },
3651
+ {
3652
+ "epoch": 0.39,
3653
+ "grad_norm": 0.361328125,
3654
+ "learning_rate": 0.00015415920019002736,
3655
+ "loss": 0.9349,
3656
+ "step": 2600
3657
+ },
3658
+ {
3659
+ "epoch": 0.39,
3660
+ "grad_norm": 0.3359375,
3661
+ "learning_rate": 0.00015394111806907188,
3662
+ "loss": 0.9325,
3663
+ "step": 2605
3664
+ },
3665
+ {
3666
+ "epoch": 0.39,
3667
+ "grad_norm": 0.345703125,
3668
+ "learning_rate": 0.00015372267352640513,
3669
+ "loss": 0.9622,
3670
+ "step": 2610
3671
+ },
3672
+ {
3673
+ "epoch": 0.39,
3674
+ "grad_norm": 0.34765625,
3675
+ "learning_rate": 0.00015350386802972097,
3676
+ "loss": 0.934,
3677
+ "step": 2615
3678
+ },
3679
+ {
3680
+ "epoch": 0.39,
3681
+ "grad_norm": 0.349609375,
3682
+ "learning_rate": 0.00015328470304913833,
3683
+ "loss": 0.9729,
3684
+ "step": 2620
3685
+ },
3686
+ {
3687
+ "epoch": 0.39,
3688
+ "grad_norm": 0.330078125,
3689
+ "learning_rate": 0.00015306518005719157,
3690
+ "loss": 0.9473,
3691
+ "step": 2625
3692
+ },
3693
+ {
3694
+ "epoch": 0.39,
3695
+ "grad_norm": 0.3359375,
3696
+ "learning_rate": 0.00015284530052882045,
3697
+ "loss": 0.9268,
3698
+ "step": 2630
3699
+ },
3700
+ {
3701
+ "epoch": 0.39,
3702
+ "grad_norm": 0.34765625,
3703
+ "learning_rate": 0.00015262506594136016,
3704
+ "loss": 0.9737,
3705
+ "step": 2635
3706
+ },
3707
+ {
3708
+ "epoch": 0.39,
3709
+ "grad_norm": 0.34375,
3710
+ "learning_rate": 0.00015240447777453153,
3711
+ "loss": 0.9231,
3712
+ "step": 2640
3713
+ },
3714
+ {
3715
+ "epoch": 0.39,
3716
+ "grad_norm": 0.345703125,
3717
+ "learning_rate": 0.00015218353751043107,
3718
+ "loss": 0.9359,
3719
+ "step": 2645
3720
+ },
3721
+ {
3722
+ "epoch": 0.39,
3723
+ "grad_norm": 0.353515625,
3724
+ "learning_rate": 0.00015196224663352093,
3725
+ "loss": 0.9394,
3726
+ "step": 2650
3727
+ },
3728
+ {
3729
+ "epoch": 0.39,
3730
+ "grad_norm": 0.3671875,
3731
+ "learning_rate": 0.00015174060663061898,
3732
+ "loss": 0.9556,
3733
+ "step": 2655
3734
+ },
3735
+ {
3736
+ "epoch": 0.39,
3737
+ "grad_norm": 0.3359375,
3738
+ "learning_rate": 0.00015151861899088877,
3739
+ "loss": 0.9345,
3740
+ "step": 2660
3741
+ },
3742
+ {
3743
+ "epoch": 0.4,
3744
+ "grad_norm": 0.357421875,
3745
+ "learning_rate": 0.0001512962852058297,
3746
+ "loss": 0.9363,
3747
+ "step": 2665
3748
+ },
3749
+ {
3750
+ "epoch": 0.4,
3751
+ "grad_norm": 0.357421875,
3752
+ "learning_rate": 0.00015107360676926666,
3753
+ "loss": 0.9575,
3754
+ "step": 2670
3755
+ },
3756
+ {
3757
+ "epoch": 0.4,
3758
+ "grad_norm": 0.33984375,
3759
+ "learning_rate": 0.00015085058517734043,
3760
+ "loss": 0.9824,
3761
+ "step": 2675
3762
+ },
3763
+ {
3764
+ "epoch": 0.4,
3765
+ "grad_norm": 0.341796875,
3766
+ "learning_rate": 0.0001506272219284972,
3767
+ "loss": 0.95,
3768
+ "step": 2680
3769
+ },
3770
+ {
3771
+ "epoch": 0.4,
3772
+ "grad_norm": 0.33984375,
3773
+ "learning_rate": 0.00015040351852347878,
3774
+ "loss": 0.9348,
3775
+ "step": 2685
3776
+ },
3777
+ {
3778
+ "epoch": 0.4,
3779
+ "grad_norm": 0.34765625,
3780
+ "learning_rate": 0.0001501794764653124,
3781
+ "loss": 0.9585,
3782
+ "step": 2690
3783
+ },
3784
+ {
3785
+ "epoch": 0.4,
3786
+ "grad_norm": 0.345703125,
3787
+ "learning_rate": 0.00014995509725930078,
3788
+ "loss": 0.9342,
3789
+ "step": 2695
3790
+ },
3791
+ {
3792
+ "epoch": 0.4,
3793
+ "grad_norm": 0.34375,
3794
+ "learning_rate": 0.0001497303824130117,
3795
+ "loss": 0.9266,
3796
+ "step": 2700
3797
+ },
3798
+ {
3799
+ "epoch": 0.4,
3800
+ "grad_norm": 0.33984375,
3801
+ "learning_rate": 0.00014950533343626812,
3802
+ "loss": 0.9552,
3803
+ "step": 2705
3804
+ },
3805
+ {
3806
+ "epoch": 0.4,
3807
+ "grad_norm": 0.33984375,
3808
+ "learning_rate": 0.000149279951841138,
3809
+ "loss": 0.9514,
3810
+ "step": 2710
3811
+ },
3812
+ {
3813
+ "epoch": 0.4,
3814
+ "grad_norm": 0.341796875,
3815
+ "learning_rate": 0.00014905423914192412,
3816
+ "loss": 0.9697,
3817
+ "step": 2715
3818
+ },
3819
+ {
3820
+ "epoch": 0.4,
3821
+ "grad_norm": 0.34375,
3822
+ "learning_rate": 0.0001488281968551538,
3823
+ "loss": 0.9494,
3824
+ "step": 2720
3825
+ },
3826
+ {
3827
+ "epoch": 0.4,
3828
+ "grad_norm": 0.357421875,
3829
+ "learning_rate": 0.00014860182649956892,
3830
+ "loss": 0.9667,
3831
+ "step": 2725
3832
+ },
3833
+ {
3834
+ "epoch": 0.41,
3835
+ "grad_norm": 0.35546875,
3836
+ "learning_rate": 0.0001483751295961156,
3837
+ "loss": 0.9504,
3838
+ "step": 2730
3839
+ },
3840
+ {
3841
+ "epoch": 0.41,
3842
+ "grad_norm": 0.3515625,
3843
+ "learning_rate": 0.0001481481076679338,
3844
+ "loss": 0.953,
3845
+ "step": 2735
3846
+ },
3847
+ {
3848
+ "epoch": 0.41,
3849
+ "grad_norm": 0.34375,
3850
+ "learning_rate": 0.00014792076224034753,
3851
+ "loss": 0.9548,
3852
+ "step": 2740
3853
+ },
3854
+ {
3855
+ "epoch": 0.41,
3856
+ "grad_norm": 0.34375,
3857
+ "learning_rate": 0.00014769309484085412,
3858
+ "loss": 0.9632,
3859
+ "step": 2745
3860
+ },
3861
+ {
3862
+ "epoch": 0.41,
3863
+ "grad_norm": 0.33984375,
3864
+ "learning_rate": 0.00014746510699911432,
3865
+ "loss": 0.9619,
3866
+ "step": 2750
3867
+ },
3868
+ {
3869
+ "epoch": 0.41,
3870
+ "grad_norm": 0.34375,
3871
+ "learning_rate": 0.00014723680024694184,
3872
+ "loss": 0.9155,
3873
+ "step": 2755
3874
+ },
3875
+ {
3876
+ "epoch": 0.41,
3877
+ "grad_norm": 0.353515625,
3878
+ "learning_rate": 0.00014700817611829308,
3879
+ "loss": 0.9234,
3880
+ "step": 2760
3881
+ },
3882
+ {
3883
+ "epoch": 0.41,
3884
+ "grad_norm": 0.337890625,
3885
+ "learning_rate": 0.00014677923614925685,
3886
+ "loss": 0.934,
3887
+ "step": 2765
3888
+ },
3889
+ {
3890
+ "epoch": 0.41,
3891
+ "grad_norm": 0.3359375,
3892
+ "learning_rate": 0.000146549981878044,
3893
+ "loss": 0.9354,
3894
+ "step": 2770
3895
+ },
3896
+ {
3897
+ "epoch": 0.41,
3898
+ "grad_norm": 0.333984375,
3899
+ "learning_rate": 0.00014632041484497727,
3900
+ "loss": 0.9554,
3901
+ "step": 2775
3902
+ },
3903
+ {
3904
+ "epoch": 0.41,
3905
+ "grad_norm": 0.353515625,
3906
+ "learning_rate": 0.00014609053659248058,
3907
+ "loss": 0.9443,
3908
+ "step": 2780
3909
+ },
3910
+ {
3911
+ "epoch": 0.41,
3912
+ "grad_norm": 0.341796875,
3913
+ "learning_rate": 0.00014586034866506906,
3914
+ "loss": 0.9584,
3915
+ "step": 2785
3916
+ },
3917
+ {
3918
+ "epoch": 0.41,
3919
+ "grad_norm": 0.34375,
3920
+ "learning_rate": 0.00014562985260933845,
3921
+ "loss": 0.9035,
3922
+ "step": 2790
3923
+ },
3924
+ {
3925
+ "epoch": 0.42,
3926
+ "grad_norm": 0.34765625,
3927
+ "learning_rate": 0.00014539904997395468,
3928
+ "loss": 0.9215,
3929
+ "step": 2795
3930
+ },
3931
+ {
3932
+ "epoch": 0.42,
3933
+ "grad_norm": 0.341796875,
3934
+ "learning_rate": 0.00014516794230964365,
3935
+ "loss": 0.9279,
3936
+ "step": 2800
3937
+ },
3938
+ {
3939
+ "epoch": 0.42,
3940
+ "grad_norm": 0.349609375,
3941
+ "learning_rate": 0.00014493653116918066,
3942
+ "loss": 0.9395,
3943
+ "step": 2805
3944
+ },
3945
+ {
3946
+ "epoch": 0.42,
3947
+ "grad_norm": 0.341796875,
3948
+ "learning_rate": 0.0001447048181073799,
3949
+ "loss": 0.9314,
3950
+ "step": 2810
3951
+ },
3952
+ {
3953
+ "epoch": 0.42,
3954
+ "grad_norm": 0.3359375,
3955
+ "learning_rate": 0.00014447280468108436,
3956
+ "loss": 0.9497,
3957
+ "step": 2815
3958
+ },
3959
+ {
3960
+ "epoch": 0.42,
3961
+ "grad_norm": 0.345703125,
3962
+ "learning_rate": 0.00014424049244915493,
3963
+ "loss": 0.9324,
3964
+ "step": 2820
3965
+ },
3966
+ {
3967
+ "epoch": 0.42,
3968
+ "grad_norm": 0.3515625,
3969
+ "learning_rate": 0.00014400788297246024,
3970
+ "loss": 0.9351,
3971
+ "step": 2825
3972
+ },
3973
+ {
3974
+ "epoch": 0.42,
3975
+ "grad_norm": 0.345703125,
3976
+ "learning_rate": 0.0001437749778138659,
3977
+ "loss": 0.9482,
3978
+ "step": 2830
3979
+ },
3980
+ {
3981
+ "epoch": 0.42,
3982
+ "grad_norm": 0.34375,
3983
+ "learning_rate": 0.00014354177853822443,
3984
+ "loss": 0.938,
3985
+ "step": 2835
3986
+ },
3987
+ {
3988
+ "epoch": 0.42,
3989
+ "grad_norm": 0.326171875,
3990
+ "learning_rate": 0.00014330828671236425,
3991
+ "loss": 0.9478,
3992
+ "step": 2840
3993
+ },
3994
+ {
3995
+ "epoch": 0.42,
3996
+ "grad_norm": 0.33984375,
3997
+ "learning_rate": 0.0001430745039050794,
3998
+ "loss": 0.9596,
3999
+ "step": 2845
4000
+ },
4001
+ {
4002
+ "epoch": 0.42,
4003
+ "grad_norm": 0.34765625,
4004
+ "learning_rate": 0.00014284043168711906,
4005
+ "loss": 0.9186,
4006
+ "step": 2850
4007
+ },
4008
+ {
4009
+ "epoch": 0.42,
4010
+ "grad_norm": 0.337890625,
4011
+ "learning_rate": 0.00014260607163117694,
4012
+ "loss": 0.9631,
4013
+ "step": 2855
4014
+ },
4015
+ {
4016
+ "epoch": 0.42,
4017
+ "grad_norm": 0.341796875,
4018
+ "learning_rate": 0.00014237142531188055,
4019
+ "loss": 0.9411,
4020
+ "step": 2860
4021
+ },
4022
+ {
4023
+ "epoch": 0.43,
4024
+ "grad_norm": 0.357421875,
4025
+ "learning_rate": 0.00014213649430578083,
4026
+ "loss": 0.9558,
4027
+ "step": 2865
4028
+ },
4029
+ {
4030
+ "epoch": 0.43,
4031
+ "grad_norm": 0.341796875,
4032
+ "learning_rate": 0.00014190128019134153,
4033
+ "loss": 0.9684,
4034
+ "step": 2870
4035
+ },
4036
+ {
4037
+ "epoch": 0.43,
4038
+ "grad_norm": 0.33984375,
4039
+ "learning_rate": 0.00014166578454892853,
4040
+ "loss": 0.9226,
4041
+ "step": 2875
4042
+ },
4043
+ {
4044
+ "epoch": 0.43,
4045
+ "grad_norm": 0.341796875,
4046
+ "learning_rate": 0.00014143000896079918,
4047
+ "loss": 0.9103,
4048
+ "step": 2880
4049
+ },
4050
+ {
4051
+ "epoch": 0.43,
4052
+ "grad_norm": 0.337890625,
4053
+ "learning_rate": 0.00014119395501109182,
4054
+ "loss": 0.9262,
4055
+ "step": 2885
4056
+ },
4057
+ {
4058
+ "epoch": 0.43,
4059
+ "grad_norm": 0.349609375,
4060
+ "learning_rate": 0.00014095762428581506,
4061
+ "loss": 0.9444,
4062
+ "step": 2890
4063
+ },
4064
+ {
4065
+ "epoch": 0.43,
4066
+ "grad_norm": 0.349609375,
4067
+ "learning_rate": 0.0001407210183728371,
4068
+ "loss": 0.9438,
4069
+ "step": 2895
4070
+ },
4071
+ {
4072
+ "epoch": 0.43,
4073
+ "grad_norm": 0.357421875,
4074
+ "learning_rate": 0.00014048413886187503,
4075
+ "loss": 0.9699,
4076
+ "step": 2900
4077
+ },
4078
+ {
4079
+ "epoch": 0.43,
4080
+ "grad_norm": 0.33203125,
4081
+ "learning_rate": 0.00014024698734448431,
4082
+ "loss": 0.9255,
4083
+ "step": 2905
4084
+ },
4085
+ {
4086
+ "epoch": 0.43,
4087
+ "grad_norm": 0.361328125,
4088
+ "learning_rate": 0.00014000956541404785,
4089
+ "loss": 0.9733,
4090
+ "step": 2910
4091
+ },
4092
+ {
4093
+ "epoch": 0.43,
4094
+ "grad_norm": 0.353515625,
4095
+ "learning_rate": 0.0001397718746657655,
4096
+ "loss": 0.9554,
4097
+ "step": 2915
4098
+ },
4099
+ {
4100
+ "epoch": 0.43,
4101
+ "grad_norm": 0.34765625,
4102
+ "learning_rate": 0.0001395339166966433,
4103
+ "loss": 0.9273,
4104
+ "step": 2920
4105
+ },
4106
+ {
4107
+ "epoch": 0.43,
4108
+ "grad_norm": 0.34375,
4109
+ "learning_rate": 0.0001392956931054825,
4110
+ "loss": 0.9364,
4111
+ "step": 2925
4112
+ },
4113
+ {
4114
+ "epoch": 0.44,
4115
+ "grad_norm": 0.3359375,
4116
+ "learning_rate": 0.00013905720549286932,
4117
+ "loss": 0.9433,
4118
+ "step": 2930
4119
+ },
4120
+ {
4121
+ "epoch": 0.44,
4122
+ "grad_norm": 0.341796875,
4123
+ "learning_rate": 0.0001388184554611636,
4124
+ "loss": 0.9288,
4125
+ "step": 2935
4126
+ },
4127
+ {
4128
+ "epoch": 0.44,
4129
+ "grad_norm": 0.34375,
4130
+ "learning_rate": 0.0001385794446144885,
4131
+ "loss": 0.9338,
4132
+ "step": 2940
4133
+ },
4134
+ {
4135
+ "epoch": 0.44,
4136
+ "grad_norm": 0.34765625,
4137
+ "learning_rate": 0.0001383401745587196,
4138
+ "loss": 1.0093,
4139
+ "step": 2945
4140
+ },
4141
+ {
4142
+ "epoch": 0.44,
4143
+ "grad_norm": 0.337890625,
4144
+ "learning_rate": 0.00013810064690147387,
4145
+ "loss": 0.924,
4146
+ "step": 2950
4147
+ },
4148
+ {
4149
+ "epoch": 0.44,
4150
+ "grad_norm": 0.345703125,
4151
+ "learning_rate": 0.0001378608632520993,
4152
+ "loss": 0.9731,
4153
+ "step": 2955
4154
+ },
4155
+ {
4156
+ "epoch": 0.44,
4157
+ "grad_norm": 0.341796875,
4158
+ "learning_rate": 0.00013762082522166363,
4159
+ "loss": 0.9236,
4160
+ "step": 2960
4161
+ },
4162
+ {
4163
+ "epoch": 0.44,
4164
+ "grad_norm": 0.33984375,
4165
+ "learning_rate": 0.0001373805344229439,
4166
+ "loss": 0.9378,
4167
+ "step": 2965
4168
+ },
4169
+ {
4170
+ "epoch": 0.44,
4171
+ "grad_norm": 0.349609375,
4172
+ "learning_rate": 0.00013713999247041533,
4173
+ "loss": 0.929,
4174
+ "step": 2970
4175
+ },
4176
+ {
4177
+ "epoch": 0.44,
4178
+ "grad_norm": 0.353515625,
4179
+ "learning_rate": 0.00013689920098024078,
4180
+ "loss": 0.9338,
4181
+ "step": 2975
4182
+ },
4183
+ {
4184
+ "epoch": 0.44,
4185
+ "grad_norm": 0.34765625,
4186
+ "learning_rate": 0.0001366581615702596,
4187
+ "loss": 0.9437,
4188
+ "step": 2980
4189
+ },
4190
+ {
4191
+ "epoch": 0.44,
4192
+ "grad_norm": 0.345703125,
4193
+ "learning_rate": 0.00013641687585997677,
4194
+ "loss": 0.9331,
4195
+ "step": 2985
4196
+ },
4197
+ {
4198
+ "epoch": 0.44,
4199
+ "grad_norm": 0.34765625,
4200
+ "learning_rate": 0.00013617534547055236,
4201
+ "loss": 0.9655,
4202
+ "step": 2990
4203
+ },
4204
+ {
4205
+ "epoch": 0.44,
4206
+ "grad_norm": 0.341796875,
4207
+ "learning_rate": 0.0001359335720247902,
4208
+ "loss": 0.9384,
4209
+ "step": 2995
4210
+ },
4211
+ {
4212
+ "epoch": 0.45,
4213
+ "grad_norm": 0.3515625,
4214
+ "learning_rate": 0.0001356915571471273,
4215
+ "loss": 0.9497,
4216
+ "step": 3000
4217
+ },
4218
+ {
4219
+ "epoch": 0.45,
4220
+ "grad_norm": 0.353515625,
4221
+ "learning_rate": 0.0001354493024636227,
4222
+ "loss": 0.934,
4223
+ "step": 3005
4224
+ },
4225
+ {
4226
+ "epoch": 0.45,
4227
+ "grad_norm": 0.345703125,
4228
+ "learning_rate": 0.0001352068096019468,
4229
+ "loss": 0.9272,
4230
+ "step": 3010
4231
+ },
4232
+ {
4233
+ "epoch": 0.45,
4234
+ "grad_norm": 0.341796875,
4235
+ "learning_rate": 0.00013496408019137018,
4236
+ "loss": 0.9428,
4237
+ "step": 3015
4238
+ },
4239
+ {
4240
+ "epoch": 0.45,
4241
+ "grad_norm": 0.33203125,
4242
+ "learning_rate": 0.00013472111586275274,
4243
+ "loss": 0.9461,
4244
+ "step": 3020
4245
+ },
4246
+ {
4247
+ "epoch": 0.45,
4248
+ "grad_norm": 0.341796875,
4249
+ "learning_rate": 0.0001344779182485328,
4250
+ "loss": 0.9403,
4251
+ "step": 3025
4252
+ },
4253
+ {
4254
+ "epoch": 0.45,
4255
+ "grad_norm": 0.34375,
4256
+ "learning_rate": 0.0001342344889827161,
4257
+ "loss": 0.9324,
4258
+ "step": 3030
4259
+ },
4260
+ {
4261
+ "epoch": 0.45,
4262
+ "grad_norm": 0.345703125,
4263
+ "learning_rate": 0.0001339908297008648,
4264
+ "loss": 0.9427,
4265
+ "step": 3035
4266
+ },
4267
+ {
4268
+ "epoch": 0.45,
4269
+ "grad_norm": 0.345703125,
4270
+ "learning_rate": 0.00013374694204008647,
4271
+ "loss": 0.9369,
4272
+ "step": 3040
4273
+ },
4274
+ {
4275
+ "epoch": 0.45,
4276
+ "grad_norm": 0.33984375,
4277
+ "learning_rate": 0.00013350282763902315,
4278
+ "loss": 0.9251,
4279
+ "step": 3045
4280
+ },
4281
+ {
4282
+ "epoch": 0.45,
4283
+ "grad_norm": 0.33984375,
4284
+ "learning_rate": 0.0001332584881378403,
4285
+ "loss": 0.9365,
4286
+ "step": 3050
4287
+ },
4288
+ {
4289
+ "epoch": 0.45,
4290
+ "grad_norm": 0.345703125,
4291
+ "learning_rate": 0.00013301392517821577,
4292
+ "loss": 0.9184,
4293
+ "step": 3055
4294
+ },
4295
+ {
4296
+ "epoch": 0.45,
4297
+ "grad_norm": 0.3359375,
4298
+ "learning_rate": 0.00013276914040332889,
4299
+ "loss": 0.9231,
4300
+ "step": 3060
4301
+ },
4302
+ {
4303
+ "epoch": 0.46,
4304
+ "grad_norm": 0.349609375,
4305
+ "learning_rate": 0.0001325241354578492,
4306
+ "loss": 0.9217,
4307
+ "step": 3065
4308
+ },
4309
+ {
4310
+ "epoch": 0.46,
4311
+ "grad_norm": 0.341796875,
4312
+ "learning_rate": 0.0001322789119879256,
4313
+ "loss": 0.9712,
4314
+ "step": 3070
4315
+ },
4316
+ {
4317
+ "epoch": 0.46,
4318
+ "grad_norm": 0.3515625,
4319
+ "learning_rate": 0.00013203347164117524,
4320
+ "loss": 0.9239,
4321
+ "step": 3075
4322
+ },
4323
+ {
4324
+ "epoch": 0.46,
4325
+ "grad_norm": 0.341796875,
4326
+ "learning_rate": 0.00013178781606667234,
4327
+ "loss": 0.9567,
4328
+ "step": 3080
4329
+ },
4330
+ {
4331
+ "epoch": 0.46,
4332
+ "grad_norm": 0.34375,
4333
+ "learning_rate": 0.00013154194691493732,
4334
+ "loss": 0.9389,
4335
+ "step": 3085
4336
+ },
4337
+ {
4338
+ "epoch": 0.46,
4339
+ "grad_norm": 0.337890625,
4340
+ "learning_rate": 0.0001312958658379255,
4341
+ "loss": 0.9429,
4342
+ "step": 3090
4343
+ },
4344
+ {
4345
+ "epoch": 0.46,
4346
+ "grad_norm": 0.341796875,
4347
+ "learning_rate": 0.00013104957448901614,
4348
+ "loss": 0.9226,
4349
+ "step": 3095
4350
+ },
4351
+ {
4352
+ "epoch": 0.46,
4353
+ "grad_norm": 0.34375,
4354
+ "learning_rate": 0.00013080307452300127,
4355
+ "loss": 0.9155,
4356
+ "step": 3100
4357
+ },
4358
+ {
4359
+ "epoch": 0.46,
4360
+ "grad_norm": 0.34765625,
4361
+ "learning_rate": 0.00013055636759607458,
4362
+ "loss": 0.9516,
4363
+ "step": 3105
4364
+ },
4365
+ {
4366
+ "epoch": 0.46,
4367
+ "grad_norm": 0.337890625,
4368
+ "learning_rate": 0.00013030945536582025,
4369
+ "loss": 0.9284,
4370
+ "step": 3110
4371
+ },
4372
+ {
4373
+ "epoch": 0.46,
4374
+ "grad_norm": 0.33984375,
4375
+ "learning_rate": 0.00013006233949120199,
4376
+ "loss": 0.9113,
4377
+ "step": 3115
4378
+ },
4379
+ {
4380
+ "epoch": 0.46,
4381
+ "grad_norm": 0.341796875,
4382
+ "learning_rate": 0.00012981502163255166,
4383
+ "loss": 0.9471,
4384
+ "step": 3120
4385
+ },
4386
+ {
4387
+ "epoch": 0.46,
4388
+ "grad_norm": 0.34765625,
4389
+ "learning_rate": 0.0001295675034515582,
4390
+ "loss": 0.9416,
4391
+ "step": 3125
4392
+ },
4393
+ {
4394
+ "epoch": 0.46,
4395
+ "grad_norm": 0.359375,
4396
+ "learning_rate": 0.00012931978661125655,
4397
+ "loss": 0.9395,
4398
+ "step": 3130
4399
+ },
4400
+ {
4401
+ "epoch": 0.47,
4402
+ "grad_norm": 0.361328125,
4403
+ "learning_rate": 0.00012907187277601641,
4404
+ "loss": 0.9367,
4405
+ "step": 3135
4406
+ },
4407
+ {
4408
+ "epoch": 0.47,
4409
+ "grad_norm": 0.35546875,
4410
+ "learning_rate": 0.00012882376361153102,
4411
+ "loss": 0.923,
4412
+ "step": 3140
4413
+ },
4414
+ {
4415
+ "epoch": 0.47,
4416
+ "grad_norm": 0.3515625,
4417
+ "learning_rate": 0.00012857546078480598,
4418
+ "loss": 0.9115,
4419
+ "step": 3145
4420
+ },
4421
+ {
4422
+ "epoch": 0.47,
4423
+ "grad_norm": 0.345703125,
4424
+ "learning_rate": 0.00012832696596414817,
4425
+ "loss": 0.9182,
4426
+ "step": 3150
4427
+ },
4428
+ {
4429
+ "epoch": 0.47,
4430
+ "grad_norm": 0.3359375,
4431
+ "learning_rate": 0.00012807828081915436,
4432
+ "loss": 0.913,
4433
+ "step": 3155
4434
+ },
4435
+ {
4436
+ "epoch": 0.47,
4437
+ "grad_norm": 0.34765625,
4438
+ "learning_rate": 0.0001278294070207001,
4439
+ "loss": 0.9438,
4440
+ "step": 3160
4441
+ },
4442
+ {
4443
+ "epoch": 0.47,
4444
+ "grad_norm": 0.35546875,
4445
+ "learning_rate": 0.0001275803462409285,
4446
+ "loss": 0.9487,
4447
+ "step": 3165
4448
+ },
4449
+ {
4450
+ "epoch": 0.47,
4451
+ "grad_norm": 0.349609375,
4452
+ "learning_rate": 0.00012733110015323898,
4453
+ "loss": 0.9521,
4454
+ "step": 3170
4455
+ },
4456
+ {
4457
+ "epoch": 0.47,
4458
+ "grad_norm": 0.333984375,
4459
+ "learning_rate": 0.0001270816704322759,
4460
+ "loss": 0.9146,
4461
+ "step": 3175
4462
+ },
4463
+ {
4464
+ "epoch": 0.47,
4465
+ "grad_norm": 0.34765625,
4466
+ "learning_rate": 0.00012683205875391754,
4467
+ "loss": 0.9211,
4468
+ "step": 3180
4469
+ },
4470
+ {
4471
+ "epoch": 0.47,
4472
+ "grad_norm": 0.3515625,
4473
+ "learning_rate": 0.00012658226679526476,
4474
+ "loss": 0.9297,
4475
+ "step": 3185
4476
+ },
4477
+ {
4478
+ "epoch": 0.47,
4479
+ "grad_norm": 0.34375,
4480
+ "learning_rate": 0.00012633229623462951,
4481
+ "loss": 0.9186,
4482
+ "step": 3190
4483
+ },
4484
+ {
4485
+ "epoch": 0.47,
4486
+ "grad_norm": 0.35546875,
4487
+ "learning_rate": 0.00012608214875152392,
4488
+ "loss": 0.9443,
4489
+ "step": 3195
4490
+ },
4491
+ {
4492
+ "epoch": 0.48,
4493
+ "grad_norm": 0.341796875,
4494
+ "learning_rate": 0.00012583182602664877,
4495
+ "loss": 0.9399,
4496
+ "step": 3200
4497
+ },
4498
+ {
4499
+ "epoch": 0.48,
4500
+ "grad_norm": 0.337890625,
4501
+ "learning_rate": 0.00012558132974188223,
4502
+ "loss": 0.9342,
4503
+ "step": 3205
4504
+ },
4505
+ {
4506
+ "epoch": 0.48,
4507
+ "grad_norm": 0.330078125,
4508
+ "learning_rate": 0.00012533066158026862,
4509
+ "loss": 0.9088,
4510
+ "step": 3210
4511
+ },
4512
+ {
4513
+ "epoch": 0.48,
4514
+ "grad_norm": 0.3515625,
4515
+ "learning_rate": 0.00012507982322600703,
4516
+ "loss": 0.9777,
4517
+ "step": 3215
4518
+ },
4519
+ {
4520
+ "epoch": 0.48,
4521
+ "grad_norm": 0.33984375,
4522
+ "learning_rate": 0.00012482881636444014,
4523
+ "loss": 0.944,
4524
+ "step": 3220
4525
+ },
4526
+ {
4527
+ "epoch": 0.48,
4528
+ "grad_norm": 0.33203125,
4529
+ "learning_rate": 0.00012457764268204277,
4530
+ "loss": 0.9307,
4531
+ "step": 3225
4532
+ },
4533
+ {
4534
+ "epoch": 0.48,
4535
+ "grad_norm": 0.34375,
4536
+ "learning_rate": 0.0001243263038664105,
4537
+ "loss": 0.9396,
4538
+ "step": 3230
4539
+ },
4540
+ {
4541
+ "epoch": 0.48,
4542
+ "grad_norm": 0.34375,
4543
+ "learning_rate": 0.00012407480160624848,
4544
+ "loss": 0.9138,
4545
+ "step": 3235
4546
+ },
4547
+ {
4548
+ "epoch": 0.48,
4549
+ "grad_norm": 0.345703125,
4550
+ "learning_rate": 0.0001238231375913601,
4551
+ "loss": 0.9583,
4552
+ "step": 3240
4553
+ },
4554
+ {
4555
+ "epoch": 0.48,
4556
+ "grad_norm": 0.3515625,
4557
+ "learning_rate": 0.00012357131351263537,
4558
+ "loss": 0.9799,
4559
+ "step": 3245
4560
+ },
4561
+ {
4562
+ "epoch": 0.48,
4563
+ "grad_norm": 0.35546875,
4564
+ "learning_rate": 0.00012331933106203986,
4565
+ "loss": 0.9619,
4566
+ "step": 3250
4567
+ },
4568
+ {
4569
+ "epoch": 0.48,
4570
+ "grad_norm": 0.349609375,
4571
+ "learning_rate": 0.00012306719193260323,
4572
+ "loss": 0.9188,
4573
+ "step": 3255
4574
+ },
4575
+ {
4576
+ "epoch": 0.48,
4577
+ "grad_norm": 0.35546875,
4578
+ "learning_rate": 0.00012281489781840781,
4579
+ "loss": 0.9593,
4580
+ "step": 3260
4581
+ },
4582
+ {
4583
+ "epoch": 0.48,
4584
+ "grad_norm": 0.349609375,
4585
+ "learning_rate": 0.0001225624504145772,
4586
+ "loss": 0.938,
4587
+ "step": 3265
4588
+ },
4589
+ {
4590
+ "epoch": 0.49,
4591
+ "grad_norm": 0.349609375,
4592
+ "learning_rate": 0.00012230985141726498,
4593
+ "loss": 0.9237,
4594
+ "step": 3270
4595
+ },
4596
+ {
4597
+ "epoch": 0.49,
4598
+ "grad_norm": 0.357421875,
4599
+ "learning_rate": 0.00012205710252364329,
4600
+ "loss": 0.9314,
4601
+ "step": 3275
4602
+ },
4603
+ {
4604
+ "epoch": 0.49,
4605
+ "grad_norm": 0.34765625,
4606
+ "learning_rate": 0.00012180420543189131,
4607
+ "loss": 0.9093,
4608
+ "step": 3280
4609
+ },
4610
+ {
4611
+ "epoch": 0.49,
4612
+ "grad_norm": 0.349609375,
4613
+ "learning_rate": 0.00012155116184118402,
4614
+ "loss": 0.9335,
4615
+ "step": 3285
4616
+ },
4617
+ {
4618
+ "epoch": 0.49,
4619
+ "grad_norm": 0.341796875,
4620
+ "learning_rate": 0.00012129797345168073,
4621
+ "loss": 0.9643,
4622
+ "step": 3290
4623
+ },
4624
+ {
4625
+ "epoch": 0.49,
4626
+ "grad_norm": 0.341796875,
4627
+ "learning_rate": 0.00012104464196451353,
4628
+ "loss": 0.9507,
4629
+ "step": 3295
4630
+ },
4631
+ {
4632
+ "epoch": 0.49,
4633
+ "grad_norm": 0.34375,
4634
+ "learning_rate": 0.00012079116908177593,
4635
+ "loss": 0.9201,
4636
+ "step": 3300
4637
  }
4638
  ],
4639
  "logging_steps": 5,
 
4641
  "num_input_tokens_seen": 0,
4642
  "num_train_epochs": 1,
4643
  "save_steps": 100,
4644
+ "total_flos": 4.640654868195836e+18,
4645
  "train_batch_size": 2,
4646
  "trial_name": null,
4647
  "trial_params": null