Spaces:
Runtime error
Runtime error
Track large files with Git LFS, and expand app to include a data explorer and more length-based visualizations.
707a231
{"num_words_mean":257,"num_words_std":145,"win_rate":34.4633736232,"standard_error":1.3146665263,"n_wins":225.0,"n_wins_base":475.0,"n_draws":105.0,"n_total":805.0,"discrete_win_rate":34.4720496894,"length_controlled_winrate":41.8230717152,"lc_standard_error":0.7776876699,"num_tokens_mean":354,"num_tokens_std":206,"model_name":"aligner-2b_claude-3-opus-20240229"} | |
{"num_words_mean":253,"num_words_std":153,"win_rate":33.7770952757,"standard_error":1.3776163154,"n_wins":255.0,"n_wins_base":545.0,"n_draws":5.0,"n_total":805.0,"discrete_win_rate":31.9875776398,"length_controlled_winrate":43.905552211,"lc_standard_error":0.8945807936,"num_tokens_mean":346,"num_tokens_std":211,"model_name":"Qwen1.5-110B-Chat"} | |
{"num_words_mean":216,"num_words_std":113,"win_rate":29.1052695333,"standard_error":1.3941539442,"n_wins":223.0,"n_wins_base":579.0,"n_draws":3.0,"n_total":805.0,"discrete_win_rate":27.8881987578,"length_controlled_winrate":40.5095080124,"lc_standard_error":0.8837504763,"num_tokens_mean":292,"num_tokens_std":156,"model_name":"claude-3-opus-20240229"} | |
{"num_words_mean":241,"num_words_std":166,"win_rate":21.8557725437,"standard_error":1.2682402187,"n_wins":164.0,"n_wins_base":639.0,"n_draws":2.0,"n_total":805.0,"discrete_win_rate":20.4968944099,"length_controlled_winrate":28.6143374017,"lc_standard_error":0.9075464438,"num_tokens_mean":327,"num_tokens_std":305,"model_name":"mistral-medium"} | |
{"num_words_mean":174,"num_words_std":90,"win_rate":17.1882403567,"standard_error":1.1748282562,"n_wins":131.0,"n_wins_base":673.0,"n_draws":1.0,"n_total":805.0,"discrete_win_rate":16.3354037267,"length_controlled_winrate":28.1551961416,"lc_standard_error":0.8779084794,"num_tokens_mean":227,"num_tokens_std":114,"model_name":"claude-2"} | |
{"num_words_mean":342,"num_words_std":187,"win_rate":35.9464864409,"standard_error":1.4410058098,"n_wins":285.0,"n_wins_base":517.0,"n_draws":3.0,"n_total":805.0,"discrete_win_rate":35.5900621118,"length_controlled_winrate":34.787447623,"lc_standard_error":0.7594505141,"num_tokens_mean":505,"num_tokens_std":311,"model_name":"FsfairX-Zephyr-Chat-v0.1"} | |
{"num_words_mean":79,"num_words_std":78,"win_rate":15.7478281307,"standard_error":1.1194852006,"n_wins":118.0,"n_wins_base":687.0,"n_draws":0.0,"n_total":805.0,"discrete_win_rate":14.6583850932,"length_controlled_winrate":25.5015577947,"lc_standard_error":0.7760697229,"num_tokens_mean":106,"num_tokens_std":102,"model_name":"Infinity-Instruct-3M-0613-Mistral-7B"} | |
{"num_words_mean":243,"num_words_std":144,"win_rate":26.4982833956,"standard_error":1.3042361649,"n_wins":201.0,"n_wins_base":600.0,"n_draws":4.0,"n_total":805.0,"discrete_win_rate":25.2173913043,"length_controlled_winrate":36.571754112,"lc_standard_error":0.9357421321,"num_tokens_mean":342,"num_tokens_std":346,"model_name":"Qwen1.5-72B-Chat"} | |
{"num_words_mean":229,"num_words_std":168,"win_rate":22.2101705475,"standard_error":1.2780740057,"n_wins":174.0,"n_wins_base":628.0,"n_draws":3.0,"n_total":805.0,"discrete_win_rate":21.801242236,"length_controlled_winrate":30.8788102941,"lc_standard_error":0.9518125819,"num_tokens_mean":307,"num_tokens_std":253,"model_name":"Mixtral-8x22B-Instruct-v0.1"} | |
{"num_words_mean":235,"num_words_std":143,"win_rate":19.7553327319,"standard_error":1.2063251121,"n_wins":147.0,"n_wins_base":657.0,"n_draws":1.0,"n_total":805.0,"discrete_win_rate":18.3229813665,"length_controlled_winrate":25.1853410397,"lc_standard_error":0.8999456518,"num_tokens_mean":310,"num_tokens_std":193,"model_name":"dbrx-instruct"} | |
{"num_words_mean":297,"num_words_std":161,"win_rate":56.5930456223,"standard_error":1.4464848562,"n_wins":456.0,"n_wins_base":347.0,"n_draws":2.0,"n_total":805.0,"discrete_win_rate":56.7701863354,"length_controlled_winrate":59.1415240989,"lc_standard_error":0.7580510219,"num_tokens_mean":420,"num_tokens_std":233,"model_name":"Together-MoA-Lite"} | |
{"num_words_mean":299,"num_words_std":171,"win_rate":33.1778569588,"standard_error":1.3886514096,"n_wins":266.0,"n_wins_base":537.0,"n_draws":2.0,"n_total":805.0,"discrete_win_rate":33.1677018634,"length_controlled_winrate":34.4245971745,"lc_standard_error":0.8691832384,"num_tokens_mean":416,"num_tokens_std":238,"model_name":"Meta-Llama-3-70B-Instruct"} | |
{"num_words_mean":179,"num_words_std":104,"win_rate":16.1273996216,"standard_error":1.1341036838,"n_wins":120.0,"n_wins_base":682.0,"n_draws":3.0,"n_total":805.0,"discrete_win_rate":15.0931677019,"length_controlled_winrate":25.6122590254,"lc_standard_error":0.87464248,"num_tokens_mean":233,"num_tokens_std":132,"model_name":"claude-instant-1.2"} | |
{"num_words_mean":272,"num_words_std":151,"win_rate":59.8688062333,"standard_error":1.4343056045,"n_wins":490.0,"n_wins_base":314.0,"n_draws":1.0,"n_total":805.0,"discrete_win_rate":60.9316770186,"length_controlled_winrate":65.3799697685,"lc_standard_error":0.7392392837,"num_tokens_mean":386,"num_tokens_std":217,"model_name":"Together-MoA"} | |
{"num_words_mean":218,"num_words_std":148,"win_rate":21.4387759814,"standard_error":1.2485232545,"n_wins":166.0,"n_wins_base":638.0,"n_draws":1.0,"n_total":805.0,"discrete_win_rate":20.6832298137,"length_controlled_winrate":32.6520799853,"lc_standard_error":0.9044632955,"num_tokens_mean":290,"num_tokens_std":191,"model_name":"mistral-large-2402"} | |
{"num_words_mean":336,"num_words_std":195,"win_rate":63.0409907519,"standard_error":1.4253258915,"n_wins":519.0,"n_wins_base":286.0,"n_draws":0.0,"n_total":805.0,"discrete_win_rate":64.4720496894,"length_controlled_winrate":61.637895572,"lc_standard_error":0.6799412402,"num_tokens_mean":486,"num_tokens_std":287,"model_name":"Storm-7B-best-of-64"} | |
{"num_words_mean":221,"num_words_std":129,"win_rate":25.5563252923,"standard_error":1.3419811052,"n_wins":193.0,"n_wins_base":608.0,"n_draws":4.0,"n_total":805.0,"discrete_win_rate":24.2236024845,"length_controlled_winrate":34.8724743624,"lc_standard_error":0.949844689,"num_tokens_mean":297,"num_tokens_std":178,"model_name":"claude-3-sonnet-20240229"} | |
{"num_words_mean":176,"num_words_std":97,"win_rate":16.9853436124,"standard_error":1.1687959793,"n_wins":129.0,"n_wins_base":676.0,"n_draws":0.0,"n_total":805.0,"discrete_win_rate":16.0248447205,"length_controlled_winrate":27.2895044437,"lc_standard_error":0.858614564,"num_tokens_mean":229,"num_tokens_std":123,"model_name":"claude"} | |
{"num_words_mean":415,"num_words_std":173,"win_rate":39.354502072,"standard_error":1.4524224246,"n_wins":323.0,"n_wins_base":480.0,"n_draws":2.0,"n_total":805.0,"discrete_win_rate":40.248447205,"length_controlled_winrate":25.2417704867,"lc_standard_error":0.5909370499,"num_tokens_mean":561,"num_tokens_std":239,"model_name":"Nanbeige2-8B-Chat"} | |
{"num_words_mean":341,"num_words_std":273,"win_rate":34.3064238313,"standard_error":1.3914900256,"n_wins":268.0,"n_wins_base":537.0,"n_draws":0.0,"n_total":805.0,"discrete_win_rate":33.2919254658,"length_controlled_winrate":31.4699427971,"lc_standard_error":0.8138922262,"num_tokens_mean":509,"num_tokens_std":463,"model_name":"REBEL-Llama-3-8B-Instruct"} | |
{"num_words_mean":286,"num_words_std":190,"win_rate":51.3275757825,"standard_error":1.470009459,"n_wins":429.0,"n_wins_base":369.0,"n_draws":7.0,"n_total":805.0,"discrete_win_rate":53.7267080745,"length_controlled_winrate":57.4568288333,"lc_standard_error":0.7774399385,"num_tokens_mean":406,"num_tokens_std":273,"model_name":"gpt-4o-2024-05-13"} | |
{"num_words_mean":378,"num_words_std":190,"win_rate":64.303601471,"standard_error":1.3348590089,"n_wins":525.0,"n_wins_base":268.0,"n_draws":12.0,"n_total":805.0,"discrete_win_rate":65.9627329193,"length_controlled_winrate":51.5750079797,"lc_standard_error":0.8313707608,"num_tokens_mean":505,"num_tokens_std":261,"model_name":"gpt4_1106_preview_verbose"} | |
{"num_words_mean":284,"num_words_std":151,"win_rate":37.0360860499,"standard_error":1.4340261273,"n_wins":288.0,"n_wins_base":514.0,"n_draws":3.0,"n_total":805.0,"discrete_win_rate":35.9627329193,"length_controlled_winrate":40.5912863493,"lc_standard_error":0.8504106275,"num_tokens_mean":394,"num_tokens_std":214,"model_name":"Nanbeige2-16B-Chat"} | |
{"num_words_mean":183,"num_words_std":124,"win_rate":15.7550380876,"standard_error":1.0754642482,"n_wins":117.0,"n_wins_base":684.0,"n_draws":4.0,"n_total":805.0,"discrete_win_rate":14.7826086957,"length_controlled_winrate":30.1833223167,"lc_standard_error":0.7874508454,"num_tokens_mean":245,"num_tokens_std":164,"model_name":"gpt4_0613"} | |
{"num_words_mean":313,"num_words_std":180,"win_rate":54.9665397329,"standard_error":1.4286740089,"n_wins":446.0,"n_wins_base":347.0,"n_draws":12.0,"n_total":805.0,"discrete_win_rate":56.149068323,"length_controlled_winrate":56.3562938462,"lc_standard_error":0.7731843456,"num_tokens_mean":417,"num_tokens_std":242,"model_name":"gpt-4-0125-preview"} | |
{"num_words_mean":237,"num_words_std":127,"win_rate":23.2373600435,"standard_error":1.2835395056,"n_wins":171.0,"n_wins_base":630.0,"n_draws":4.0,"n_total":805.0,"discrete_win_rate":21.4906832298,"length_controlled_winrate":33.8212668866,"lc_standard_error":0.8842151461,"num_tokens_mean":313,"num_tokens_std":166,"model_name":"gpt4_0613_verbose"} | |
{"num_words_mean":391,"num_words_std":159,"win_rate":56.7030097302,"standard_error":1.482841875,"n_wins":456.0,"n_wins_base":347.0,"n_draws":2.0,"n_total":805.0,"discrete_win_rate":56.7701863354,"length_controlled_winrate":44.4596624034,"lc_standard_error":0.7209678864,"num_tokens_mean":529,"num_tokens_std":216,"model_name":"Nanbeige-Plus-Chat-v0.1"} | |
{"num_words_mean":393,"num_words_std":185,"win_rate":34.8601328913,"standard_error":1.3599450437,"n_wins":270.0,"n_wins_base":533.0,"n_draws":2.0,"n_total":805.0,"discrete_win_rate":33.6645962733,"length_controlled_winrate":29.9743216131,"lc_standard_error":0.7464891533,"num_tokens_mean":526,"num_tokens_std":255,"model_name":"Snorkel-Mistral-PairRM-DPO-best-of-16"} | |
{"num_words_mean":228,"num_words_std":94,"win_rate":24.3540710901,"standard_error":1.29358621,"n_wins":191.0,"n_wins_base":613.0,"n_draws":1.0,"n_total":805.0,"discrete_win_rate":23.7888198758,"length_controlled_winrate":30.2911791666,"lc_standard_error":0.6612722747,"num_tokens_mean":292,"num_tokens_std":120,"model_name":"claude-2.1_verbose"} | |
{"num_words_mean":281,"num_words_std":128,"win_rate":29.8963508407,"standard_error":1.3666520485,"n_wins":234.0,"n_wins_base":571.0,"n_draws":0.0,"n_total":805.0,"discrete_win_rate":29.0683229814,"length_controlled_winrate":31.721885287,"lc_standard_error":0.8150560619,"num_tokens_mean":385,"num_tokens_std":180,"model_name":"merlinite-7B-AOT"} | |
{"num_words_mean":215,"num_words_std":160,"win_rate":22.0732589287,"standard_error":1.2466725495,"n_wins":172.0,"n_wins_base":627.0,"n_draws":6.0,"n_total":805.0,"discrete_win_rate":21.7391304348,"length_controlled_winrate":35.3070612164,"lc_standard_error":0.8997916758,"num_tokens_mean":289,"num_tokens_std":212,"model_name":"gpt4_0314"} | |
{"num_words_mean":381,"num_words_std":205,"win_rate":33.2273552,"standard_error":1.3779687478,"n_wins":260.0,"n_wins_base":544.0,"n_draws":1.0,"n_total":805.0,"discrete_win_rate":32.3602484472,"length_controlled_winrate":29.7058089397,"lc_standard_error":0.7122554396,"num_tokens_mean":506,"num_tokens_std":272,"model_name":"Contextual-KTO-Mistral-PairRM"} | |
{"num_words_mean":344,"num_words_std":144,"win_rate":35.4431306717,"standard_error":1.3981308966,"n_wins":274.0,"n_wins_base":531.0,"n_draws":0.0,"n_total":805.0,"discrete_win_rate":34.0372670807,"length_controlled_winrate":31.9003876312,"lc_standard_error":0.7655500294,"num_tokens_mean":459,"num_tokens_std":193,"model_name":"SPPO-Mistral7B-PairRM-ExPO"} | |
{"num_words_mean":177,"num_words_std":130,"win_rate":22.9201944405,"standard_error":1.2325177143,"n_wins":172.0,"n_wins_base":622.0,"n_draws":11.0,"n_total":805.0,"discrete_win_rate":22.049689441,"length_controlled_winrate":41.8966015912,"lc_standard_error":0.7406558917,"num_tokens_mean":244,"num_tokens_std":184,"model_name":"gpt4_1106_preview_concise"} | |
{"num_words_mean":261,"num_words_std":134,"win_rate":40.6328540086,"standard_error":1.4439449942,"n_wins":325.0,"n_wins_base":479.0,"n_draws":1.0,"n_total":805.0,"discrete_win_rate":40.4347826087,"length_controlled_winrate":45.807978034,"lc_standard_error":0.8703329817,"num_tokens_mean":370,"num_tokens_std":191,"model_name":"Llama-3-Instruct-8B-SimPO-ExPO"} | |
{"num_words_mean":272,"num_words_std":137,"win_rate":63.1549345123,"standard_error":1.4229800988,"n_wins":515.0,"n_wins_base":283.0,"n_draws":7.0,"n_total":805.0,"discrete_win_rate":64.4099378882,"length_controlled_winrate":68.3786625033,"lc_standard_error":0.7309418615,"num_tokens_mean":377,"num_tokens_std":193,"model_name":"openpipe-moa-gpt-4-turbo-v1"} | |
{"num_words_mean":225,"num_words_std":140,"win_rate":26.9882543183,"standard_error":1.318903,"n_wins":201.0,"n_wins_base":601.0,"n_draws":3.0,"n_total":805.0,"discrete_win_rate":25.1552795031,"length_controlled_winrate":31.5065442681,"lc_standard_error":0.7338723477,"num_tokens_mean":312,"num_tokens_std":199,"model_name":"Samba-CoE-v0.2-best-of-16"} | |
{"num_words_mean":280,"num_words_std":151,"win_rate":31.773037737,"standard_error":1.2392772646,"n_wins":180.0,"n_wins_base":473.0,"n_draws":152.0,"n_total":805.0,"discrete_win_rate":31.801242236,"length_controlled_winrate":36.7258688784,"lc_standard_error":0.6787999003,"num_tokens_mean":383,"num_tokens_std":210,"model_name":"aligner-2b_qwen1.5-72b-chat"} | |
{"num_words_mean":177,"num_words_std":92,"win_rate":15.7335067364,"standard_error":1.1203158654,"n_wins":115.0,"n_wins_base":688.0,"n_draws":2.0,"n_total":805.0,"discrete_win_rate":14.4099378882,"length_controlled_winrate":25.2519438861,"lc_standard_error":0.7515108894,"num_tokens_mean":228,"num_tokens_std":114,"model_name":"claude-2.1"} | |
{"num_words_mean":317,"num_words_std":148,"win_rate":39.6728609061,"standard_error":1.4247223562,"n_wins":310.0,"n_wins_base":494.0,"n_draws":1.0,"n_total":805.0,"discrete_win_rate":38.5714285714,"length_controlled_winrate":38.5628066368,"lc_standard_error":0.8694594533,"num_tokens_mean":443,"num_tokens_std":208,"model_name":"SPPO-Llama-3-Instruct-8B-PairRM"} | |
{"num_words_mean":272,"num_words_std":141,"win_rate":40.5297749846,"standard_error":1.4225744647,"n_wins":319.0,"n_wins_base":485.0,"n_draws":1.0,"n_total":805.0,"discrete_win_rate":39.6894409938,"length_controlled_winrate":44.6804680926,"lc_standard_error":0.8789917177,"num_tokens_mean":385,"num_tokens_std":200,"model_name":"Llama-3-Instruct-8B-SimPO"} | |
{"num_words_mean":276,"num_words_std":140,"win_rate":22.9806197059,"standard_error":1.3591734083,"n_wins":184.0,"n_wins_base":620.0,"n_draws":1.0,"n_total":805.0,"discrete_win_rate":22.9192546584,"length_controlled_winrate":25.7233081711,"lc_standard_error":0.4593179402,"num_tokens_mean":368,"num_tokens_std":203,"model_name":"tulu-2-dpo-70b-ExPO"} | |
{"num_words_mean":228,"num_words_std":142,"win_rate":40.5602140968,"standard_error":1.4679655404,"n_wins":312.0,"n_wins_base":493.0,"n_draws":0.0,"n_total":805.0,"discrete_win_rate":38.7577639752,"length_controlled_winrate":52.3667542714,"lc_standard_error":0.7976856335,"num_tokens_mean":315,"num_tokens_std":200,"model_name":"claude-3-5-sonnet-20240620"} | |
{"num_words_mean":300,"num_words_std":135,"win_rate":50.2688690553,"standard_error":1.4728176781,"n_wins":397.0,"n_wins_base":408.0,"n_draws":0.0,"n_total":805.0,"discrete_win_rate":49.3167701863,"length_controlled_winrate":50.4080792281,"lc_standard_error":0.7188927916,"num_tokens_mean":429,"num_tokens_std":188,"model_name":"Storm-7B"} | |
{"num_words_mean":414,"num_words_std":241,"win_rate":30.2200527007,"standard_error":1.3328273013,"n_wins":231.0,"n_wins_base":572.0,"n_draws":1.0,"n_total":804.0,"discrete_win_rate":28.7935323383,"length_controlled_winrate":26.3914464573,"lc_standard_error":0.6739888325,"num_tokens_mean":566,"num_tokens_std":342,"model_name":"Snorkel-Mistral-PairRM-DPO"} | |
{"num_words_mean":507,"num_words_std":206,"win_rate":46.1853674689,"standard_error":1.4638315246,"n_wins":375.0,"n_wins_base":430.0,"n_draws":0.0,"n_total":805.0,"discrete_win_rate":46.5838509317,"length_controlled_winrate":27.2257594808,"lc_standard_error":0.5877331102,"num_tokens_mean":693,"num_tokens_std":283,"model_name":"internlm2-chat-20b-ExPO"} | |
{"num_words_mean":210,"num_words_std":138,"win_rate":21.8473786693,"standard_error":1.2171089783,"n_wins":159.0,"n_wins_base":645.0,"n_draws":1.0,"n_total":805.0,"discrete_win_rate":19.8136645963,"length_controlled_winrate":27.6242673501,"lc_standard_error":0.6875926799,"num_tokens_mean":292,"num_tokens_std":196,"model_name":"Samba-CoE-v0.2"} | |
{"num_words_mean":215,"num_words_std":156,"win_rate":23.5767893148,"standard_error":1.2757042012,"n_wins":179.0,"n_wins_base":618.0,"n_draws":8.0,"n_total":805.0,"discrete_win_rate":22.7329192547,"length_controlled_winrate":38.1280897444,"lc_standard_error":0.9069675584,"num_tokens_mean":287,"num_tokens_std":203,"model_name":"gpt4"} | |
{"num_words_mean":339,"num_words_std":178,"win_rate":29.6599467188,"standard_error":1.3225712598,"n_wins":219.0,"n_wins_base":582.0,"n_draws":4.0,"n_total":805.0,"discrete_win_rate":27.4534161491,"length_controlled_winrate":27.1905478776,"lc_standard_error":0.7470363322,"num_tokens_mean":447,"num_tokens_std":235,"model_name":"Yi-34B-Chat"} | |
{"num_words_mean":336,"num_words_std":106,"win_rate":29.6008518479,"standard_error":1.3252049543,"n_wins":225.0,"n_wins_base":580.0,"n_draws":0.0,"n_total":805.0,"discrete_win_rate":27.950310559,"length_controlled_winrate":26.4869564984,"lc_standard_error":0.7549415682,"num_tokens_mean":447,"num_tokens_std":143,"model_name":"Starling-LM-7B-beta-ExPO"} | |
{"num_words_mean":349,"num_words_std":189,"win_rate":31.2412829468,"standard_error":1.3482437399,"n_wins":239.0,"n_wins_base":563.0,"n_draws":3.0,"n_total":805.0,"discrete_win_rate":29.8757763975,"length_controlled_winrate":28.8148408668,"lc_standard_error":0.8310750322,"num_tokens_mean":464,"num_tokens_std":276,"model_name":"pairrm-Yi-34B-Chat"} | |
{"num_words_mean":323,"num_words_std":181,"win_rate":50.0,"standard_error":0.0,"n_wins":0.0,"n_wins_base":0.0,"n_draws":805.0,"n_total":805.0,"discrete_win_rate":50.0,"length_controlled_winrate":50.0,"lc_standard_error":0.0,"num_tokens_mean":431,"num_tokens_std":249,"model_name":"gpt4_1106_preview"} | |
{"num_words_mean":322,"num_words_std":148,"win_rate":32.2453123638,"standard_error":1.390800011,"n_wins":249.0,"n_wins_base":556.0,"n_draws":0.0,"n_total":805.0,"discrete_win_rate":30.9316770186,"length_controlled_winrate":30.4941379652,"lc_standard_error":0.8458266977,"num_tokens_mean":431,"num_tokens_std":198,"model_name":"SPPO-Mistral7B-PairRM"} | |