eduagarcia committed
Commit 272ff3e
1 Parent(s): 66b67db

Modify app.py to read and display the new multilingual results

Files changed (1)
  1. app.py +166 -142
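The diff below reads each `elo_results_*.pkl` as a dict keyed by language (plus the `"full"` and `"non-english"` aggregates), each entry carrying the plots and the `leaderboard_table_df` that feed the tables. A minimal sketch of inspecting such a pickle, assuming that layout (the file name is hypothetical; the keys and expressions mirror the code in the diff):

```python
import pickle

# Assumed layout, mirroring build_leaderboard_tab / create_leaderboard_from_results
# in the diff below: one result dict per language, plus "full" and "non-english".
with open("elo_results_20240101.pkl", "rb") as fin:  # hypothetical file name
    elo_results = pickle.load(fin)

languages = [lang for lang in elo_results if lang not in ["non-english", "full"]]
for lang in languages:
    arena_df = elo_results[lang]["leaderboard_table_df"]
    total_votes = int(sum(arena_df["num_battles"]) // 2)
    print(f"{lang}: {len(arena_df)} models, ~{total_votes} votes")
```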
app.py CHANGED
@@ -9,30 +9,32 @@ import numpy as np
 import pandas as pd
 
 
-# notebook_url = "https://colab.research.google.com/drive/1KdwokPjirkTmpO_P1WByFNFiqxWQquwH#scrollTo=o_CpbkGEbhrK
 notebook_url = "https://colab.research.google.com/drive/11eWOT3VAAWRRrs1CSsAg84hIaJvH2ThK?usp=sharing"
-
 
 basic_component_values = [None] * 6
 leader_component_values = [None] * 5
 
 
-def make_default_md(arena_df, elo_results):
-    total_votes = sum(arena_df["num_battles"]) // 2
-    total_models = len(arena_df)
-
     leaderboard_md = f"""
-# 🏆 LMSYS Chatbot Arena Leaderboard
-| [Vote](https://chat.lmsys.org) | [Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2306.05685) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) |
 
 LMSYS [Chatbot Arena](https://lmsys.org/blog/2023-05-03-arena/) is a crowdsourced open platform for LLM evals.
-We've collected over **500,000** human preference votes to rank LLMs with the Elo ranking system.
 """
     return leaderboard_md
 
 
 def make_arena_leaderboard_md(arena_df):
-    total_votes = sum(arena_df["num_battles"]) // 2
     total_models = len(arena_df)
 
     leaderboard_md = f"""
@@ -46,8 +48,8 @@ Contribute your vote 🗳️ at [chat.lmsys.org](https://chat.lmsys.org)! Find m
 def make_full_leaderboard_md(elo_results):
     leaderboard_md = f"""
 Three benchmarks are displayed: **Arena Elo**, **MT-Bench** and **MMLU**.
-- [Chatbot Arena](https://chat.lmsys.org/?arena) - a crowdsourced, randomized battle platform. We use 500K+ user votes to compute Elo ratings.
-- [MT-Bench](https://arxiv.org/abs/2306.05685): a set of challenging multi-turn questions. We use GPT-4 to grade the model responses.
 - [MMLU](https://arxiv.org/abs/2009.03300) (5-shot): a test to measure a model's multitask accuracy on 57 tasks.
 
 💻 Code: The MT-bench scores (single-answer grading on a scale of 10) are computed by [fastchat.llm_judge](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge).
@@ -220,6 +222,8 @@ def get_arena_table(arena_df, model_table_df):
         # model display name
         row.append(model_name)
         # elo rating
         row.append(round(arena_df.iloc[i]["rating"]))
         upper_diff = round(
             arena_df.iloc[i]["rating_q975"] - arena_df.iloc[i]["rating"]
@@ -228,6 +232,8 @@ def get_arena_table(arena_df, model_table_df):
             arena_df.iloc[i]["rating"] - arena_df.iloc[i]["rating_q025"]
         )
         row.append(f"+{upper_diff}/-{lower_diff}")
         # num battles
         row.append(round(arena_df.iloc[i]["num_battles"]))
         # Organization
@@ -239,139 +245,159 @@ def get_arena_table(arena_df, model_table_df):
             model_table_df[model_table_df["key"] == model_key]["License"].values[0]
         )
 
-        cutoff_date = model_table_df[model_table_df["key"] == model_key]["Knowledge cutoff date"].values[0]
-        if cutoff_date == "-":
-            row.append("Unknown")
-        else:
-            row.append(cutoff_date)
         values.append(row)
     return values
 
-def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=False):
-    if elo_results_file is None:  # Do live update
-        default_md = "Loading ..."
-        p1 = p2 = p3 = p4 = None
-    else:
-        with open(elo_results_file, "rb") as fin:
-            elo_results = pickle.load(fin)
-        if "full" in elo_results:
-            elo_results = elo_results["full"]
-
-        p1 = elo_results["win_fraction_heatmap"]
-        p2 = elo_results["battle_count_heatmap"]
-        p3 = elo_results["bootstrap_elo_rating"]
-        p4 = elo_results["average_win_rate_bar"]
-        arena_df = elo_results["leaderboard_table_df"]
-        default_md = make_default_md(arena_df, elo_results)
-
-    md_1 = gr.Markdown(default_md, elem_id="leaderboard_markdown")
-    if leaderboard_table_file:
-        data = load_leaderboard_table_csv(leaderboard_table_file)
-        model_table_df = pd.DataFrame(data)
-
-        with gr.Tabs() as tabs:
-            # arena table
-            arena_table_vals = get_arena_table(arena_df, model_table_df)
-            with gr.Tab("Arena Elo", id=0):
-                md = make_arena_leaderboard_md(arena_df)
-                gr.Markdown(md, elem_id="leaderboard_markdown")
-                gr.Dataframe(
-                    headers=[
-                        "Rank",
-                        "🤖 Model",
-                        "⭐ Arena Elo",
-                        "📊 95% CI",
-                        "🗳️ Votes",
-                        "Organization",
-                        "License",
-                        "Knowledge Cutoff",
-                    ],
-                    datatype=[
-                        "str",
-                        "markdown",
-                        "number",
-                        "str",
-                        "number",
-                        "str",
-                        "str",
-                        "str",
-                    ],
-                    value=arena_table_vals,
-                    elem_id="arena_leaderboard_dataframe",
-                    height=700,
-                    column_widths=[50, 200, 120, 100, 100, 150, 150, 100],
-                    wrap=True,
-                )
-            with gr.Tab("Full Leaderboard", id=1):
-                md = make_full_leaderboard_md(elo_results)
-                gr.Markdown(md, elem_id="leaderboard_markdown")
-                full_table_vals = get_full_table(arena_df, model_table_df)
-                gr.Dataframe(
-                    headers=[
-                        "🤖 Model",
-                        "⭐ Arena Elo",
-                        "📈 MT-bench",
-                        "📚 MMLU",
-                        "Organization",
-                        "License",
-                    ],
-                    datatype=["markdown", "number", "number", "number", "str", "str"],
-                    value=full_table_vals,
-                    elem_id="full_leaderboard_dataframe",
-                    column_widths=[200, 100, 100, 100, 150, 150],
-                    height=700,
-                    wrap=True,
-                )
-        if not show_plot:
-            gr.Markdown(
-                """ ## Visit our [HF space](https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard) for more analysis!
-            If you want to see more models, please help us [add them](https://github.com/lm-sys/FastChat/blob/main/docs/arena.md#how-to-add-a-new-model).
-            """,
-                elem_id="leaderboard_markdown",
-            )
-    else:
-        pass
-
-    gr.Markdown(
-        f"""Note: we take the 95% confidence interval into account when determining a model's ranking.
-    A model is ranked higher only if its lower bound of model score is higher than the upper bound of the other model's score.
-    See Figure 3 below for visualization of the confidence intervals.
-    """,
-        elem_id="leaderboard_markdown"
     )
 
-    leader_component_values[:] = [default_md, p1, p2, p3, p4]
 
-    if show_plot:
         gr.Markdown(
-            f"""## More Statistics for Chatbot Arena\n
-Below are figures for more statistics. The code for generating them is also included in this [notebook]({notebook_url}).
-You can find more discussions in this blog [post](https://lmsys.org/blog/2023-12-07-leaderboard/).
-""",
             elem_id="leaderboard_markdown"
         )
         with gr.Row():
             with gr.Column():
                 gr.Markdown(
-                    "#### Figure 1: Fraction of Model A Wins for All Non-tied A vs. B Battles"
                 )
                 plot_1 = gr.Plot(p1, show_label=False)
             with gr.Column():
                 gr.Markdown(
-                    "#### Figure 2: Battle Count for Each Combination of Models (without Ties)"
                 )
                 plot_2 = gr.Plot(p2, show_label=False)
         with gr.Row():
             with gr.Column():
                 gr.Markdown(
-                    "#### Figure 3: Confidence Intervals on Model Strength (via Bootstrapping)"
                 )
                 plot_3 = gr.Plot(p3, show_label=False)
             with gr.Column():
                 gr.Markdown(
-                    "#### Figure 4: Average Win Rate Against All Other Models (Assuming Uniform Sampling and No Ties)"
                 )
                 plot_4 = gr.Plot(p4, show_label=False)
 
     with gr.Accordion(
         "📝 Citation",
@@ -379,7 +405,7 @@ You can find more discussions in this blog [post](https://lmsys.org/blog/2023-12
     ):
         citation_md = """
 ### Citation
-Please cite the following paper if you find our leaderboard or dataset helpful.
 ```
 @misc{chiang2024chatbot,
     title={Chatbot Arena: An Open Platform for Evaluating LLMs by Human Preference},
@@ -431,20 +457,12 @@ footer {
 }
 """
 
-acknowledgment_md = """
 ### Acknowledgment
-We thank [Kaggle](https://www.kaggle.com/), [MBZUAI](https://mbzuai.ac.ae/), [a16z](https://www.a16z.com/), [Together AI](https://www.together.ai/), [Anyscale](https://www.anyscale.com/), [HuggingFace](https://huggingface.co/) for their generous [sponsorship](https://lmsys.org/donations/).
-
-<div class="sponsor-image-about">
-    <img src="https://storage.googleapis.com/public-arena-asset/kaggle.png" alt="Kaggle">
-    <img src="https://storage.googleapis.com/public-arena-asset/mbzuai.jpeg" alt="MBZUAI">
-    <img src="https://storage.googleapis.com/public-arena-asset/a16z.jpeg" alt="a16z">
-    <img src="https://storage.googleapis.com/public-arena-asset/together.png" alt="Together AI">
-    <img src="https://storage.googleapis.com/public-arena-asset/anyscale.png" alt="AnyScale">
-    <img src="https://storage.googleapis.com/public-arena-asset/huggingface.png" alt="HuggingFace">
-</div>
 """
 
 def build_demo(elo_results_file, leaderboard_table_file):
     text_size = gr.themes.sizes.text_lg
 
@@ -457,20 +475,26 @@ def build_demo(elo_results_file, leaderboard_table_file):
             elo_results_file, leaderboard_table_file, show_plot=True
         )
     return demo
 
 
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--share", action="store_true")
-    args = parser.parse_args()
 
-    elo_result_files = glob.glob("elo_results_*.pkl")
-    elo_result_files.sort(key=lambda x: int(x[12:-4]))
-    elo_result_file = elo_result_files[-1]
 
-    leaderboard_table_files = glob.glob("leaderboard_table_*.csv")
-    leaderboard_table_files.sort(key=lambda x: int(x[18:-4]))
-    leaderboard_table_file = leaderboard_table_files[-1]
 
-    demo = build_demo(elo_result_file, leaderboard_table_file)
-    demo.launch(share=args.share)
 
 import pandas as pd
 
 
+original_notebook_url = "https://colab.research.google.com/drive/1KdwokPjirkTmpO_P1WByFNFiqxWQquwH#scrollTo=o_CpbkGEbhrK"
 notebook_url = "https://colab.research.google.com/drive/11eWOT3VAAWRRrs1CSsAg84hIaJvH2ThK?usp=sharing"
+data_link = "https://drive.google.com/file/d/1_72443egRzwRTmJfIyOQcf1ug7sKbqbX/view?usp=sharing"
+original_leaderboard_link = "{original_leaderboard_link}"
 
 basic_component_values = [None] * 6
 leader_component_values = [None] * 5
 
 
+def make_default_md(languages_names):
     leaderboard_md = f"""
+# 🏆 Multilingual LMSYS Chatbot Arena Leaderboard
+LMSYS Org link's: | [Vote](https://chat.lmsys.org) | [Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2306.05685) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) |
 
 LMSYS [Chatbot Arena](https://lmsys.org/blog/2023-05-03-arena/) is a crowdsourced open platform for LLM evals.
+They've collected over **500,000** human preference votes to rank LLMs with the Elo ranking system.
+
+This leaderboard is a fork derived from the [🏆LMSYS Chatbot Arena Leaderboard]({original_leaderboard_link}). The LMSYS Org provides [data]({original_notebook_url}) that contains the language inferred for each conversation using the polyglot package, we use this data for featuring additional metrics and analysis for each individual language, with a particular emphasis on non-English languages.
+
+In the "By Language" section, we offer individual metrics for the following languages: {", ".join(languages_names[:-1])}, and {languages_names[-1]}.
 """
     return leaderboard_md
 
 
 def make_arena_leaderboard_md(arena_df):
+    total_votes = int(sum(arena_df["num_battles"]) // 2)
     total_models = len(arena_df)
 
     leaderboard_md = f"""
 
 def make_full_leaderboard_md(elo_results):
     leaderboard_md = f"""
 Three benchmarks are displayed: **Arena Elo**, **MT-Bench** and **MMLU**.
+- [Chatbot Arena](https://chat.lmsys.org/?arena) - a crowdsourced, randomized battle platform. They use 500K+ user votes to compute Elo ratings.
+- [MT-Bench](https://arxiv.org/abs/2306.05685): a set of challenging multi-turn questions. They use GPT-4 to grade the model responses.
 - [MMLU](https://arxiv.org/abs/2009.03300) (5-shot): a test to measure a model's multitask accuracy on 57 tasks.
 
 💻 Code: The MT-bench scores (single-answer grading on a scale of 10) are computed by [fastchat.llm_judge](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge).
 
         # model display name
         row.append(model_name)
         # elo rating
+        if pd.isna(arena_df.iloc[i]["rating"]):
+            continue
         row.append(round(arena_df.iloc[i]["rating"]))
         upper_diff = round(
             arena_df.iloc[i]["rating_q975"] - arena_df.iloc[i]["rating"]
 
             arena_df.iloc[i]["rating"] - arena_df.iloc[i]["rating_q025"]
         )
         row.append(f"+{upper_diff}/-{lower_diff}")
+        # Avg. Win Rate
+        row.append(f'{round(arena_df.iloc[i]["avg_win_rate_no_tie"] * 100, 1):04.1f}%')
         # num battles
         row.append(round(arena_df.iloc[i]["num_battles"]))
         # Organization
 
             model_table_df[model_table_df["key"] == model_key]["License"].values[0]
         )
 
+        #cutoff_date = model_table_df[model_table_df["key"] == model_key]["Knowledge cutoff date"].values[0]
+        #if cutoff_date == "-":
+        #    row.append("Unknown")
+        #else:
+        #    row.append(cutoff_date)
+
         values.append(row)
     return values
 
+def create_leaderboard_from_results(elo_results, model_table_df, show_plot, show_language_plot=False):
+    p0 = elo_results["inferred_languages_bar"]
+    p1 = elo_results["win_fraction_heatmap"]
+    p2 = elo_results["battle_count_heatmap"]
+    p3 = elo_results["bootstrap_elo_rating"]
+    p4 = elo_results["average_win_rate_bar"]
+    arena_df = elo_results["leaderboard_table_df"]
+    arena_table_vals = get_arena_table(arena_df, model_table_df)
+
+    md = make_arena_leaderboard_md(arena_df)
+    gr.Markdown(md, elem_id="leaderboard_markdown")
+    gr.Dataframe(
+        headers=[
+            "Rank",
+            "🤖 Model",
+            "⭐ Arena Elo",
+            "📊 95% CI",
+            "🏆 Avg. Win Rate",
+            "🗳️ Votes",
+            "Organization",
+            "License",
+            #"Knowledge Cutoff",
+        ],
+        datatype=[
+            "str",
+            "markdown",
+            "number",
+            "str",
+            "str",
+            "number",
+            "str",
+            "str",
+            #"str",
+        ],
+        value=arena_table_vals,
+        elem_id="arena_leaderboard_dataframe",
+        height=700,
+        column_widths=[50, 200, 120, 100, 150, 100, 125, 125],#, 100],
+        wrap=True,
     )
 
+    gr.Markdown(
+        f"""Note¹: we take the 95% confidence interval into account when determining a model's ranking.
+    A model is ranked higher only if its lower bound of model score is higher than the upper bound of the other model's score. See Figure {3+int(show_language_plot)} below for visualization of the confidence intervals.
+    Note²: The Average Win Rate is calculated by assuming uniform sampling and no ties.
+    """,
+        elem_id="leaderboard_markdown"
+    )
 
+    if not show_plot:
         gr.Markdown(
+            f""" ## Visit our [HF space]({original_leaderboard_link}) for more analysis!
+            If you want to see more models, please help us [add them](https://github.com/lm-sys/FastChat/blob/main/docs/arena.md#how-to-add-a-new-model).
+            """,
+            elem_id="leaderboard_markdown",
+        )
+    else:
+        gr.Markdown(
+            f"""## More Statistics for Chatbot Arena\n
+Below are figures for more statistics. The code for generating them is also included in this [notebook]({notebook_url}).
+You can find more discussions in this blog [post](https://lmsys.org/blog/2023-12-07-leaderboard/).
+""",
             elem_id="leaderboard_markdown"
         )
+        fig_id = 1
+        if show_language_plot:
+            gr.Markdown(
+                f"#### Figure {fig_id}: Battle counts for the Top 15 Languages"
+            )
+            plot_0 = gr.Plot(p0, show_label=False)
+            fig_id += 1
         with gr.Row():
             with gr.Column():
                 gr.Markdown(
+                    f"#### Figure {fig_id}: Fraction of Model A Wins for All Non-tied A vs. B Battles"
                 )
                 plot_1 = gr.Plot(p1, show_label=False)
+                fig_id += 1
             with gr.Column():
                 gr.Markdown(
+                    f"#### Figure {fig_id}: Battle Count for Each Combination of Models (without Ties)"
                 )
                 plot_2 = gr.Plot(p2, show_label=False)
+                fig_id += 1
         with gr.Row():
             with gr.Column():
                 gr.Markdown(
+                    f"#### Figure {fig_id}: Confidence Intervals on Model Strength (via Bootstrapping)"
                 )
                 plot_3 = gr.Plot(p3, show_label=False)
+                fig_id += 1
             with gr.Column():
                 gr.Markdown(
+                    f"#### Figure {fig_id}: Average Win Rate Against All Other Models (Assuming Uniform Sampling and No Ties)"
                 )
                 plot_4 = gr.Plot(p4, show_label=False)
+                fig_id += 1
+
+    return p1, p2, p3, p4, plot_1, plot_2, plot_3, plot_4
+
+def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=False):
+    if elo_results_file is None:  # Do live update
+        default_md = "Loading ..."
+        p1 = p2 = p3 = p4 = None
+    else:
+        with open(elo_results_file, "rb") as fin:
+            elo_results = pickle.load(fin)
+        #if "non-english" in elo_results:
+        #    elo_results = elo_results["non-english"]
+
+        languages = [lang for lang in elo_results if lang not in ["non-english", "full"]]
+        languages = languages[::-1][:-3]
+        languages_names = [lang[0].upper() + lang[1:] for lang in languages]
+
+        default_md = make_default_md(languages_names)
+    md_1 = gr.Markdown(default_md, elem_id="leaderboard_markdown")
+
+    if leaderboard_table_file:
+        data = load_leaderboard_table_csv(leaderboard_table_file)
+        model_table_df = pd.DataFrame(data)
+
+        with gr.Tabs() as tabs:
+            # arena table
+            with gr.Tab("Multilingual (Non-English)", id=0):
+                gr.Markdown("This section includes metrics for all interactions that are not in English. See Figure 1 below for the distribution of evaluated languages.")
+                p1, p2, p3, p4, plot_1, plot_2, plot_3, plot_4 = create_leaderboard_from_results(elo_results["non-english"], model_table_df, show_plot, show_language_plot=True)
+            with gr.Tab("Multilingual (All langs)", id=1):
+                gr.Markdown(f"This section includes metrics for all interactions, should be the same as the original [🏆LMSYS Chatbot Arena Leaderboard]({original_leaderboard_link}). See Figure 1 below for the distribution of evaluated languages.")
+                create_leaderboard_from_results(elo_results['full'], model_table_df, show_plot, show_language_plot=True)
+            with gr.Tab("By Language", id=2):
+                with gr.Tabs() as tabs:
+                    for i, lang in enumerate(languages):
+                        elo_result = elo_results[lang]
+                        lang = lang[0].upper() + lang[1:]
+                        arena_df = elo_result['leaderboard_table_df']
+                        size = round((sum(arena_df['num_battles']) // 2) / 1000)
+                        with gr.Tab(lang + f" ({size}K)", id=i+3):
+                            gr.Markdown(f"This section includes metrics for all interactions that are in {lang}.")
+                            create_leaderboard_from_results(elo_result, model_table_df, show_plot)
+
+    else:
+        pass
+
+    leader_component_values[:] = [default_md, p1, p2, p3, p4]
 
     with gr.Accordion(
         "📝 Citation",
 
     ):
         citation_md = """
 ### Citation
+Please cite the following paper if you find the leaderboard or dataset helpful.
 ```
 @misc{chiang2024chatbot,
     title={Chatbot Arena: An Open Platform for Evaluating LLMs by Human Preference},
 
 }
 """
 
+acknowledgment_md = f"""
 ### Acknowledgment
+Thanks to LMSYS team for providing the open-source [data]({original_notebook_url}) and the original [🏆LMSYS Chatbot Arena Leaderboard]({original_leaderboard_link}).
 """
 
+'''
 def build_demo(elo_results_file, leaderboard_table_file):
     text_size = gr.themes.sizes.text_lg
 
             elo_results_file, leaderboard_table_file, show_plot=True
         )
     return demo
+'''
 
+elo_result_files = glob.glob("elo_results_*.pkl")
+elo_result_files.sort(key=lambda x: int(x[12:-4]))
+elo_result_file = elo_result_files[-1]
 
+leaderboard_table_files = glob.glob("leaderboard_table_*.csv")
+leaderboard_table_files.sort(key=lambda x: int(x[18:-4]))
+leaderboard_table_file = leaderboard_table_files[-1]
 
+text_size = gr.themes.sizes.text_lg
 
+with gr.Blocks(
+    title="Chatbot Arena Leaderboard",
+    theme=gr.themes.Base(text_size=text_size),
+    css=block_css,
+) as demo:
+    leader_components = build_leaderboard_tab(
+        elo_result_file, leaderboard_table_file, show_plot=True
+    )
 
+if __name__ == "__main__":
+    demo.launch()