evanfrick commited on
Commit
01f0e4f
1 Parent(s): b2821de
Files changed (1) hide show
  1. results.json +0 -148
results.json CHANGED
@@ -5785,43 +5785,6 @@
5785
  "accuracy": 0.5723270440251572
5786
  }
5787
  },
5788
- "data/ifeval_best_of_k/Qwen2.5-7B-RM-09192024.json": {
5789
- "all": {
5790
- "accuracy": 0.5726562500000001,
5791
- "area_under_curve": 0.5536947485942088,
5792
- "loss": 0.1320898115634918,
5793
- "mean_max_score": 0.58177734375,
5794
- "mean_end_score": 0.565234375
5795
- },
5796
- "gemma-2-9b-it": {
5797
- "accuracy": 0.5796875,
5798
- "area_under_curve": 0.5556161705870118,
5799
- "loss": 0.11947629928588867,
5800
- "mean_max_score": 0.62125,
5801
- "mean_end_score": 0.58953125
5802
- },
5803
- "gpt-4o-mini-2024-07-18": {
5804
- "accuracy": 0.61875,
5805
- "area_under_curve": 0.5427620686093066,
5806
- "loss": 0.1205466079711914,
5807
- "mean_max_score": 0.631328125,
5808
- "mean_end_score": 0.605390625
5809
- },
5810
- "Meta-Llama-3-8B-Instruct": {
5811
- "accuracy": 0.58125,
5812
- "area_under_curve": 0.5553819052297072,
5813
- "loss": 0.14538890838623048,
5814
- "mean_max_score": 0.60421875,
5815
- "mean_end_score": 0.55078125
5816
- },
5817
- "claude-3-haiku-20240307": {
5818
- "accuracy": 0.5109375,
5819
- "area_under_curve": 0.5500825653737205,
5820
- "loss": 0.15359460830688476,
5821
- "mean_max_score": 0.54703125,
5822
- "mean_end_score": 0.515625
5823
- }
5824
- },
5825
  "data/ifeval_best_of_k/internlm2-1_8b-reward.json": {
5826
  "all": {
5827
  "accuracy": 0.5386718749999999,
@@ -6189,43 +6152,6 @@
6189
  "mean_end_score": 0.606015625
6190
  }
6191
  },
6192
- "data/ifeval_best_of_k/Qwen2.5-72B-RM-09242024.json": {
6193
- "all": {
6194
- "accuracy": 0.591796875,
6195
- "area_under_curve": 0.5682828198209156,
6196
- "loss": 0.11081918239593506,
6197
- "mean_max_score": 0.61642578125,
6198
- "mean_end_score": 0.61126953125
6199
- },
6200
- "gemma-2-9b-it": {
6201
- "accuracy": 0.615625,
6202
- "area_under_curve": 0.5724099728839697,
6203
- "loss": 0.08845264434814454,
6204
- "mean_max_score": 0.667265625,
6205
- "mean_end_score": 0.6484375
6206
- },
6207
- "gpt-4o-mini-2024-07-18": {
6208
- "accuracy": 0.6046875,
6209
- "area_under_curve": 0.5579399685462639,
6210
- "loss": 0.1147395133972168,
6211
- "mean_max_score": 0.644296875,
6212
- "mean_end_score": 0.6171875
6213
- },
6214
- "Meta-Llama-3-8B-Instruct": {
6215
- "accuracy": 0.6140625,
6216
- "area_under_curve": 0.576997247648311,
6217
- "loss": 0.11122642517089844,
6218
- "mean_max_score": 0.64453125,
6219
- "mean_end_score": 0.636640625
6220
- },
6221
- "claude-3-haiku-20240307": {
6222
- "accuracy": 0.5328125,
6223
- "area_under_curve": 0.5592622087343524,
6224
- "loss": 0.13447074890136718,
6225
- "mean_max_score": 0.5696875,
6226
- "mean_end_score": 0.543359375
6227
- }
6228
- },
6229
  "data/ifeval_best_of_k/nemotron-4-340b-reward.json": {
6230
  "all": {
6231
  "accuracy": 0.6265624999999999,
@@ -6300,43 +6226,6 @@
6300
  "mean_end_score": 0.578125
6301
  }
6302
  },
6303
- "data/ifeval_best_of_k/Llama-3.1-8B-Instruct-RM-Test.json": {
6304
- "all": {
6305
- "accuracy": 0.5953124999999999,
6306
- "area_under_curve": 0.5659010925349728,
6307
- "loss": 0.11261327266693115,
6308
- "mean_max_score": 0.61439453125,
6309
- "mean_end_score": 0.60623046875
6310
- },
6311
- "gemma-2-9b-it": {
6312
- "accuracy": 0.6,
6313
- "area_under_curve": 0.5742011950437376,
6314
- "loss": 0.08687259674072266,
6315
- "mean_max_score": 0.67890625,
6316
- "mean_end_score": 0.6640625
6317
- },
6318
- "gpt-4o-mini-2024-07-18": {
6319
- "accuracy": 0.5984375,
6320
- "area_under_curve": 0.5628933527191842,
6321
- "loss": 0.10282745361328124,
6322
- "mean_max_score": 0.655625,
6323
- "mean_end_score": 0.644375
6324
- },
6325
- "Meta-Llama-3-8B-Instruct": {
6326
- "accuracy": 0.603125,
6327
- "area_under_curve": 0.5555893773327166,
6328
- "loss": 0.12582313537597656,
6329
- "mean_max_score": 0.618515625,
6330
- "mean_end_score": 0.578125
6331
- },
6332
- "claude-3-haiku-20240307": {
6333
- "accuracy": 0.5796874999999999,
6334
- "area_under_curve": 0.5637145211028964,
6335
- "loss": 0.13854501724243165,
6336
- "mean_max_score": 0.564296875,
6337
- "mean_end_score": 0.5390625
6338
- }
6339
- },
6340
  "data/ifeval_best_of_k/Starling-RM-7B-alpha.json": {
6341
  "all": {
6342
  "accuracy": 0.5406249999999999,
@@ -6411,43 +6300,6 @@
6411
  "mean_end_score": 0.484375
6412
  }
6413
  },
6414
- "data/ifeval_best_of_k/Llama-3.1-70B-RM-09172024.json": {
6415
- "all": {
6416
- "accuracy": 0.630078125,
6417
- "area_under_curve": 0.5902905300669057,
6418
- "loss": 0.09440629482269287,
6419
- "mean_max_score": 0.64310546875,
6420
- "mean_end_score": 0.62984375
6421
- },
6422
- "gemma-2-9b-it": {
6423
- "accuracy": 0.6375,
6424
- "area_under_curve": 0.6064561485832756,
6425
- "loss": 0.07111602783203125,
6426
- "mean_max_score": 0.709375,
6427
- "mean_end_score": 0.6953125
6428
- },
6429
- "gpt-4o-mini-2024-07-18": {
6430
- "accuracy": 0.6359374999999999,
6431
- "area_under_curve": 0.5804507982664724,
6432
- "loss": 0.08310569763183594,
6433
- "mean_max_score": 0.693203125,
6434
- "mean_end_score": 0.6759375
6435
- },
6436
- "Meta-Llama-3-8B-Instruct": {
6437
- "accuracy": 0.6468750000000001,
6438
- "area_under_curve": 0.5893750619966321,
6439
- "loss": 0.10088687896728515,
6440
- "mean_max_score": 0.653359375,
6441
- "mean_end_score": 0.6171875
6442
- },
6443
- "claude-3-haiku-20240307": {
6444
- "accuracy": 0.6000000000000001,
6445
- "area_under_curve": 0.585711467200442,
6446
- "loss": 0.12550268173217774,
6447
- "mean_max_score": 0.588984375,
6448
- "mean_end_score": 0.53125
6449
- }
6450
- },
6451
  "data/ifeval_best_of_k/Skywork-Reward-Gemma-2-27B.json": {
6452
  "all": {
6453
  "accuracy": 0.537890625,
 
5785
  "accuracy": 0.5723270440251572
5786
  }
5787
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5788
  "data/ifeval_best_of_k/internlm2-1_8b-reward.json": {
5789
  "all": {
5790
  "accuracy": 0.5386718749999999,
 
6152
  "mean_end_score": 0.606015625
6153
  }
6154
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6155
  "data/ifeval_best_of_k/nemotron-4-340b-reward.json": {
6156
  "all": {
6157
  "accuracy": 0.6265624999999999,
 
6226
  "mean_end_score": 0.578125
6227
  }
6228
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6229
  "data/ifeval_best_of_k/Starling-RM-7B-alpha.json": {
6230
  "all": {
6231
  "accuracy": 0.5406249999999999,
 
6300
  "mean_end_score": 0.484375
6301
  }
6302
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6303
  "data/ifeval_best_of_k/Skywork-Reward-Gemma-2-27B.json": {
6304
  "all": {
6305
  "accuracy": 0.537890625,