Merge branch 'main' of hf.co:spaces/lmsys/chatbot-arena-leaderboard into main
app.py
CHANGED
@@ -8,7 +8,8 @@ import gradio as gr
 import numpy as np
 
 
-notebook_url = "https://colab.research.google.com/drive/1RAWb22-PFNI-X1gPVzc927SGUdfr6nsR?usp=sharing"
+# notebook_url = "https://colab.research.google.com/drive/1RAWb22-PFNI-X1gPVzc927SGUdfr6nsR?usp=sharing"
+notebook_url = "https://colab.research.google.com/drive/1KdwokPjirkTmpO_P1WByFNFiqxWQquwH#scrollTo=o_CpbkGEbhrK"
 
 
 basic_component_values = [None] * 6
@@ -21,7 +22,7 @@ def make_leaderboard_md(elo_results):
 | [Vote](https://chat.lmsys.org/?arena) | [Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2306.05685) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) |
 
 🏆 This leaderboard is based on the following three benchmarks.
-- [Chatbot Arena](https://chat.lmsys.org/?arena) - a crowdsourced, randomized battle platform. We use
+- [Chatbot Arena](https://chat.lmsys.org/?arena) - a crowdsourced, randomized battle platform. We use 130K+ user votes to compute Elo ratings.
 - [MT-Bench](https://arxiv.org/abs/2306.05685) - a set of challenging multi-turn questions. We use GPT-4 to grade the model responses.
 - [MMLU](https://arxiv.org/abs/2009.03300) (5-shot) - a test to measure a model's multitask accuracy on 57 tasks.
 
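For context on the new "130K+ user votes to compute Elo ratings" bullet: the ratings themselves are produced offline in the linked Colab notebook and loaded by app.py as elo_results. As a rough illustration only (a hypothetical helper, not the space's code), a minimal online Elo update over pairwise battle votes looks like this:

```python
# Illustrative sketch only: online Elo over pairwise "battles".
# The leaderboard's real ratings come from the linked Colab notebook.
from collections import defaultdict


def compute_online_elo(battles, k=4, scale=400, base=10, init=1000):
    """battles: iterable of (model_a, model_b, winner), winner in {'model_a', 'model_b', 'tie'}."""
    rating = defaultdict(lambda: init)
    for model_a, model_b, winner in battles:
        ra, rb = rating[model_a], rating[model_b]
        ea = 1 / (1 + base ** ((rb - ra) / scale))  # expected score of model_a
        sa = {"model_a": 1.0, "model_b": 0.0, "tie": 0.5}[winner]
        rating[model_a] += k * (sa - ea)
        rating[model_b] += k * (ea - sa)  # zero-sum counterpart for model_b
    return dict(rating)


# Toy usage with made-up votes:
votes = [("gpt-4", "vicuna-13b", "model_a"), ("vicuna-13b", "alpaca-13b", "model_a")]
print(compute_online_elo(votes))
```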
@@ -227,7 +228,7 @@ Please note that you may see different orders from different ranking methods. Th
     with gr.Row():
         with gr.Column():
             gr.Markdown(
-                "#### Figure 3: Bootstrap of Elo Estimates (1000 Rounds of Random Sampling)"
+                "#### Figure 3: Bootstrap of MLE Elo Estimates (1000 Rounds of Random Sampling)"
             )
             plot_3 = gr.Plot(p3, show_label=False)
         with gr.Column():
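The Figure 3 caption now says "MLE Elo" rather than "Elo": the bootstrapped estimates come from a maximum-likelihood (Bradley-Terry style) fit over all votes, which, unlike the online update above, does not depend on the order of the battles. Below is a rough sketch of "1000 rounds of random sampling" over such a fit; the column names, the gradient-style fitting loop, and the helper names are assumptions for illustration, since app.py itself only loads precomputed elo_results:

```python
# Illustrative sketch only: bootstrapping MLE (Bradley-Terry style) Elo
# estimates, as referenced by the renamed Figure 3 caption. Column names
# and the fitting procedure are assumptions, not the space's pipeline.
import numpy as np
import pandas as pd


def fit_mle_elo(df, scale=400, base=10, init=1000, iters=200, lr=0.1):
    """Crude gradient fit of a Bradley-Terry model on a battles DataFrame
    with columns model_a, model_b, winner ('model_a' or 'model_b')."""
    models = pd.unique(df[["model_a", "model_b"]].values.ravel())
    idx = {m: i for i, m in enumerate(models)}
    ratings = np.zeros(len(models))
    a = df["model_a"].map(idx).to_numpy()
    b = df["model_b"].map(idx).to_numpy()
    y = (df["winner"] == "model_a").to_numpy(dtype=float)
    for _ in range(iters):
        p = 1 / (1 + base ** ((ratings[b] - ratings[a]) / scale))
        grad = y - p  # gradient direction of the log-likelihood (constants folded into lr)
        np.add.at(ratings, a, lr * grad)
        np.add.at(ratings, b, -lr * grad)
    return pd.Series(init + ratings, index=models)


def bootstrap_mle_elo(battles, num_rounds=1000, seed=0):
    """Refit the ratings on num_rounds resamples of the battles, one row per round."""
    rng = np.random.default_rng(seed)
    rows = []
    for _ in range(num_rounds):
        sample = battles.sample(frac=1.0, replace=True, random_state=int(rng.integers(1 << 31)))
        rows.append(fit_mle_elo(sample))
    return pd.DataFrame(rows)  # Figure 3 plots the spread of these per-model columns
```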