Spaces:
Runtime error
Runtime error
Muennighoff
committed on
Commit
•
dbfa15a
1
Parent(s):
3ffdc42
Add emojis
Browse files
app.py
CHANGED
@@ -3,8 +3,6 @@ import pandas as pd
|
|
3 |
from huggingface_hub import HfApi, hf_hub_download
|
4 |
from huggingface_hub.repocard import metadata_load
|
5 |
|
6 |
-
path = f"https://huggingface.co/api/spaces"
|
7 |
-
|
8 |
TASKS = [
|
9 |
"BitextMining",
|
10 |
"Classification",
|
@@ -185,15 +183,15 @@ def get_mteb_average(get_all_avgs=False):
|
|
185 |
cast_to_str=False
|
186 |
)
|
187 |
|
188 |
-
DATA_OVERALL.insert(1, "Average", DATA_OVERALL[TASK_LIST_EN].mean(axis=1, skipna=False))
|
189 |
-
DATA_OVERALL.insert(2, "Classification Average", DATA_OVERALL[TASK_LIST_CLASSIFICATION].mean(axis=1, skipna=False))
|
190 |
-
DATA_OVERALL.insert(3, "Clustering Average", DATA_OVERALL[TASK_LIST_CLUSTERING].mean(axis=1, skipna=False))
|
191 |
-
DATA_OVERALL.insert(4, "Pair Classification Average", DATA_OVERALL[TASK_LIST_PAIR_CLASSIFICATION].mean(axis=1, skipna=False))
|
192 |
-
DATA_OVERALL.insert(5, "Reranking Average", DATA_OVERALL[TASK_LIST_RERANKING].mean(axis=1, skipna=False))
|
193 |
-
DATA_OVERALL.insert(6, "Retrieval Average", DATA_OVERALL[TASK_LIST_RETRIEVAL].mean(axis=1, skipna=False))
|
194 |
-
DATA_OVERALL.insert(7, "STS Average", DATA_OVERALL[TASK_LIST_STS].mean(axis=1, skipna=False))
|
195 |
-
DATA_OVERALL.insert(8, "Summarization Average", DATA_OVERALL[TASK_LIST_SUMMARIZATION].mean(axis=1, skipna=False))
|
196 |
-
DATA_OVERALL.sort_values("Average", ascending=False, inplace=True)
|
197 |
# Start ranking from 1
|
198 |
DATA_OVERALL.insert(0, "Rank", list(range(1, len(DATA_OVERALL) + 1)))
|
199 |
|
@@ -207,7 +205,7 @@ def get_mteb_average(get_all_avgs=False):
|
|
207 |
DATA_STS_EN = DATA_OVERALL[["Model"] + TASK_LIST_STS]
|
208 |
DATA_SUMMARIZATION = DATA_OVERALL[["Model"] + TASK_LIST_SUMMARIZATION]
|
209 |
|
210 |
-
DATA_OVERALL = DATA_OVERALL[["Rank", "Model", "Average", "Classification Average", "Clustering Average", "Pair Classification Average", "Reranking Average", "Retrieval Average", "STS Average", "Summarization Average"]]
|
211 |
|
212 |
return DATA_OVERALL
|
213 |
|
@@ -216,19 +214,27 @@ block = gr.Blocks()
|
|
216 |
|
217 |
|
218 |
with block:
|
219 |
-
gr.Markdown(
|
220 |
-
|
221 |
-
|
|
|
|
|
|
|
|
|
222 |
with gr.Tabs():
|
223 |
with gr.TabItem("Overall"):
|
224 |
with gr.Row():
|
225 |
-
gr.Markdown("""
|
|
|
|
|
|
|
|
|
|
|
226 |
with gr.Row():
|
227 |
data_overall = gr.components.Dataframe(
|
228 |
DATA_OVERALL,
|
229 |
datatype=["markdown"] * len(DATA_OVERALL.columns) * 2,
|
230 |
type="pandas",
|
231 |
-
#col_count=(len(DATA_OVERALL.columns), "fixed"),
|
232 |
wrap=True,
|
233 |
)
|
234 |
with gr.Row():
|
@@ -236,7 +242,12 @@ with block:
|
|
236 |
data_run.click(get_mteb_average, inputs=None, outputs=data_overall)
|
237 |
with gr.TabItem("BitextMining"):
|
238 |
with gr.Row():
|
239 |
-
|
|
|
|
|
|
|
|
|
|
|
240 |
with gr.Row():
|
241 |
data_bitext_mining = gr.components.Dataframe(
|
242 |
datatype=["markdown"] * 500, # hack when we don't know how many columns
|
@@ -253,7 +264,12 @@ with block:
|
|
253 |
with gr.TabItem("Classification"):
|
254 |
with gr.TabItem("English"):
|
255 |
with gr.Row():
|
256 |
-
gr.Markdown("""
|
|
|
|
|
|
|
|
|
|
|
257 |
with gr.Row():
|
258 |
data_classification_en = gr.components.Dataframe(
|
259 |
DATA_CLASSIFICATION_EN,
|
@@ -274,7 +290,12 @@ with block:
|
|
274 |
)
|
275 |
with gr.TabItem("Multilingual"):
|
276 |
with gr.Row():
|
277 |
-
gr.Markdown("""
|
|
|
|
|
|
|
|
|
|
|
278 |
with gr.Row():
|
279 |
data_classification = gr.components.Dataframe(
|
280 |
datatype=["markdown"] * 500, # hack when we don't know how many columns
|
@@ -290,7 +311,12 @@ with block:
|
|
290 |
)
|
291 |
with gr.TabItem("Clustering"):
|
292 |
with gr.Row():
|
293 |
-
gr.Markdown("""
|
|
|
|
|
|
|
|
|
|
|
294 |
with gr.Row():
|
295 |
data_clustering = gr.components.Dataframe(
|
296 |
DATA_CLUSTERING,
|
@@ -308,7 +334,12 @@ with block:
|
|
308 |
)
|
309 |
with gr.TabItem("Pair Classification"):
|
310 |
with gr.Row():
|
311 |
-
gr.Markdown("""
|
|
|
|
|
|
|
|
|
|
|
312 |
with gr.Row():
|
313 |
data_pair_classification = gr.components.Dataframe(
|
314 |
DATA_PAIR_CLASSIFICATION,
|
@@ -318,7 +349,7 @@ with block:
|
|
318 |
)
|
319 |
with gr.Row():
|
320 |
data_run = gr.Button("Refresh")
|
321 |
-
task_pair_classification = gr.Variable(value="
|
322 |
data_run.click(
|
323 |
get_mteb_data,
|
324 |
inputs=[task_pair_classification],
|
@@ -326,7 +357,12 @@ with block:
|
|
326 |
)
|
327 |
with gr.TabItem("Retrieval"):
|
328 |
with gr.Row():
|
329 |
-
gr.Markdown("""
|
|
|
|
|
|
|
|
|
|
|
330 |
with gr.Row():
|
331 |
data_retrieval = gr.components.Dataframe(
|
332 |
DATA_RETRIEVAL,
|
@@ -341,7 +377,12 @@ with block:
|
|
341 |
)
|
342 |
with gr.TabItem("Reranking"):
|
343 |
with gr.Row():
|
344 |
-
gr.Markdown("""
|
|
|
|
|
|
|
|
|
|
|
345 |
with gr.Row():
|
346 |
data_reranking = gr.components.Dataframe(
|
347 |
DATA_RERANKING,
|
@@ -359,7 +400,12 @@ with block:
|
|
359 |
with gr.TabItem("STS"):
|
360 |
with gr.TabItem("English"):
|
361 |
with gr.Row():
|
362 |
-
gr.Markdown("""
|
|
|
|
|
|
|
|
|
|
|
363 |
with gr.Row():
|
364 |
data_sts_en = gr.components.Dataframe(
|
365 |
DATA_STS_EN,
|
@@ -378,7 +424,12 @@ with block:
|
|
378 |
)
|
379 |
with gr.TabItem("Multilingual"):
|
380 |
with gr.Row():
|
381 |
-
gr.Markdown("""
|
|
|
|
|
|
|
|
|
|
|
382 |
with gr.Row():
|
383 |
data_sts = gr.components.Dataframe(
|
384 |
datatype=["markdown"] * 50, # hack when we don't know how many columns
|
@@ -390,7 +441,12 @@ with block:
|
|
390 |
data_run.click(get_mteb_data, inputs=[task_sts], outputs=data_sts)
|
391 |
with gr.TabItem("Summarization"):
|
392 |
with gr.Row():
|
393 |
-
gr.Markdown("""
|
|
|
|
|
|
|
|
|
|
|
394 |
with gr.Row():
|
395 |
data_summarization = gr.components.Dataframe(
|
396 |
DATA_SUMMARIZATION,
|
@@ -406,13 +462,15 @@ with block:
|
|
406 |
inputs=[task_summarization],
|
407 |
outputs=data_summarization,
|
408 |
)
|
409 |
-
#
|
|
|
410 |
block.load(get_mteb_data, inputs=[task_bitext_mining], outputs=data_bitext_mining)
|
411 |
block.load(get_mteb_data, inputs=[task_classification_en, lang_classification_en], outputs=data_classification_en)
|
412 |
block.load(get_mteb_data, inputs=[task_classification], outputs=data_classification)
|
413 |
block.load(get_mteb_data, inputs=[task_clustering], outputs=data_clustering)
|
414 |
block.load(get_mteb_data, inputs=[task_retrieval], outputs=data_retrieval)
|
415 |
block.load(get_mteb_data, inputs=[task_reranking], outputs=data_reranking)
|
|
|
416 |
block.load(get_mteb_data, inputs=[task_sts], outputs=data_sts)
|
417 |
block.load(get_mteb_data, inputs=[task_summarization], outputs=data_summarization)
|
418 |
|
|
|
3 |
from huggingface_hub import HfApi, hf_hub_download
|
4 |
from huggingface_hub.repocard import metadata_load
|
5 |
|
|
|
|
|
6 |
TASKS = [
|
7 |
"BitextMining",
|
8 |
"Classification",
|
|
|
183 |
cast_to_str=False
|
184 |
)
|
185 |
|
186 |
+
DATA_OVERALL.insert(1, f"Average ({len(TASK_LIST_EN)} datasets)", DATA_OVERALL[TASK_LIST_EN].mean(axis=1, skipna=False))
|
187 |
+
DATA_OVERALL.insert(2, f"Classification Average ({len(TASK_LIST_CLASSIFICATION)} datasets)", DATA_OVERALL[TASK_LIST_CLASSIFICATION].mean(axis=1, skipna=False))
|
188 |
+
DATA_OVERALL.insert(3, f"Clustering Average ({len(TASK_LIST_CLUSTERING)} datasets)", DATA_OVERALL[TASK_LIST_CLUSTERING].mean(axis=1, skipna=False))
|
189 |
+
DATA_OVERALL.insert(4, f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION)} datasets)", DATA_OVERALL[TASK_LIST_PAIR_CLASSIFICATION].mean(axis=1, skipna=False))
|
190 |
+
DATA_OVERALL.insert(5, f"Reranking Average ({len(TASK_LIST_RERANKING)} datasets)", DATA_OVERALL[TASK_LIST_RERANKING].mean(axis=1, skipna=False))
|
191 |
+
DATA_OVERALL.insert(6, f"Retrieval Average ({len(TASK_LIST_RETRIEVAL)} datasets)", DATA_OVERALL[TASK_LIST_RETRIEVAL].mean(axis=1, skipna=False))
|
192 |
+
DATA_OVERALL.insert(7, f"STS Average ({len(TASK_LIST_STS)} datasets)", DATA_OVERALL[TASK_LIST_STS].mean(axis=1, skipna=False))
|
193 |
+
DATA_OVERALL.insert(8, f"Summarization Average ({len(TASK_LIST_SUMMARIZATION)} dataset)", DATA_OVERALL[TASK_LIST_SUMMARIZATION].mean(axis=1, skipna=False))
|
194 |
+
DATA_OVERALL.sort_values(f"Average ({len(TASK_LIST_EN)} datasets)", ascending=False, inplace=True)
|
195 |
# Start ranking from 1
|
196 |
DATA_OVERALL.insert(0, "Rank", list(range(1, len(DATA_OVERALL) + 1)))
|
197 |
|
|
|
205 |
DATA_STS_EN = DATA_OVERALL[["Model"] + TASK_LIST_STS]
|
206 |
DATA_SUMMARIZATION = DATA_OVERALL[["Model"] + TASK_LIST_SUMMARIZATION]
|
207 |
|
208 |
+
DATA_OVERALL = DATA_OVERALL[["Rank", "Model", f"Average ({len(TASK_LIST_EN)} datasets)", f"Classification Average ({len(TASK_LIST_CLASSIFICATION)} datasets)", f"Clustering Average ({len(TASK_LIST_CLUSTERING)} datasets)", f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION)} datasets)", f"Reranking Average ({len(TASK_LIST_RERANKING)} datasets)", f"Retrieval Average ({len(TASK_LIST_RETRIEVAL)} datasets)", f"STS Average ({len(TASK_LIST_STS)} datasets)", f"Summarization Average ({len(TASK_LIST_SUMMARIZATION)} dataset)"]]
|
209 |
|
210 |
return DATA_OVERALL
|
211 |
|
|
|
214 |
|
215 |
|
216 |
with block:
|
217 |
+
gr.Markdown(f"""
|
218 |
+
Massive Text Embedding Benchmark (MTEB) Leaderboard. To submit, refer to the <a href="https://github.com/embeddings-benchmark/mteb#leaderboard" target="_blank" style="text-decoration: underline">MTEB GitHub repository</a> 🤗
|
219 |
+
|
220 |
+
- **Total Scores**: TODO
|
221 |
+
- **Total Models**: {len(DATA_OVERALL)}
|
222 |
+
- **Total Users**: TODO
|
223 |
+
""")
|
224 |
with gr.Tabs():
|
225 |
with gr.TabItem("Overall"):
|
226 |
with gr.Row():
|
227 |
+
gr.Markdown("""
|
228 |
+
**Overall MTEB English leaderboard 🔮**
|
229 |
+
|
230 |
+
- **Metric:** Various, refer to task tabs
|
231 |
+
- **Languages:** English, refer to task tabs for others
|
232 |
+
""")
|
233 |
with gr.Row():
|
234 |
data_overall = gr.components.Dataframe(
|
235 |
DATA_OVERALL,
|
236 |
datatype=["markdown"] * len(DATA_OVERALL.columns) * 2,
|
237 |
type="pandas",
|
|
|
238 |
wrap=True,
|
239 |
)
|
240 |
with gr.Row():
|
|
|
242 |
data_run.click(get_mteb_average, inputs=None, outputs=data_overall)
|
243 |
with gr.TabItem("BitextMining"):
|
244 |
with gr.Row():
|
245 |
+
gr.Markdown("""
|
246 |
+
**Bitext Mining Leaderboard 🎌**
|
247 |
+
|
248 |
+
- **Metric:** Accuracy (accuracy)
|
249 |
+
- **Languages:** 117
|
250 |
+
""")
|
251 |
with gr.Row():
|
252 |
data_bitext_mining = gr.components.Dataframe(
|
253 |
datatype=["markdown"] * 500, # hack when we don't know how many columns
|
|
|
264 |
with gr.TabItem("Classification"):
|
265 |
with gr.TabItem("English"):
|
266 |
with gr.Row():
|
267 |
+
gr.Markdown("""
|
268 |
+
**Classification Leaderboard ❤️**
|
269 |
+
|
270 |
+
- **Metric:** Accuracy (accuracy)
|
271 |
+
- **Languages:** English
|
272 |
+
""")
|
273 |
with gr.Row():
|
274 |
data_classification_en = gr.components.Dataframe(
|
275 |
DATA_CLASSIFICATION_EN,
|
|
|
290 |
)
|
291 |
with gr.TabItem("Multilingual"):
|
292 |
with gr.Row():
|
293 |
+
gr.Markdown("""
|
294 |
+
**Classification Multilingual Leaderboard 💜💚💙**
|
295 |
+
|
296 |
+
- **Metric:** Accuracy (accuracy)
|
297 |
+
- **Languages:** 51
|
298 |
+
""")
|
299 |
with gr.Row():
|
300 |
data_classification = gr.components.Dataframe(
|
301 |
datatype=["markdown"] * 500, # hack when we don't know how many columns
|
|
|
311 |
)
|
312 |
with gr.TabItem("Clustering"):
|
313 |
with gr.Row():
|
314 |
+
gr.Markdown("""
|
315 |
+
**Clustering Leaderboard ✨**
|
316 |
+
|
317 |
+
- **Metric:** Validity Measure (v_measure)
|
318 |
+
- **Languages:** English
|
319 |
+
""")
|
320 |
with gr.Row():
|
321 |
data_clustering = gr.components.Dataframe(
|
322 |
DATA_CLUSTERING,
|
|
|
334 |
)
|
335 |
with gr.TabItem("Pair Classification"):
|
336 |
with gr.Row():
|
337 |
+
gr.Markdown("""
|
338 |
+
**Pair Classification Leaderboard 🎭**
|
339 |
+
|
340 |
+
- **Metric:** Average Precision based on Cosine Similarities (cos_sim_ap)
|
341 |
+
- **Languages:** English
|
342 |
+
""")
|
343 |
with gr.Row():
|
344 |
data_pair_classification = gr.components.Dataframe(
|
345 |
DATA_PAIR_CLASSIFICATION,
|
|
|
349 |
)
|
350 |
with gr.Row():
|
351 |
data_run = gr.Button("Refresh")
|
352 |
+
task_pair_classification = gr.Variable(value="PairClassification")
|
353 |
data_run.click(
|
354 |
get_mteb_data,
|
355 |
inputs=[task_pair_classification],
|
|
|
357 |
)
|
358 |
with gr.TabItem("Retrieval"):
|
359 |
with gr.Row():
|
360 |
+
gr.Markdown("""
|
361 |
+
**Retrieval Leaderboard 🔎**
|
362 |
+
|
363 |
+
- **Metric:** Normalized Discounted Cumulative Gain @ k (ndcg_at_10)
|
364 |
+
- **Languages:** English
|
365 |
+
""")
|
366 |
with gr.Row():
|
367 |
data_retrieval = gr.components.Dataframe(
|
368 |
DATA_RETRIEVAL,
|
|
|
377 |
)
|
378 |
with gr.TabItem("Reranking"):
|
379 |
with gr.Row():
|
380 |
+
gr.Markdown("""
|
381 |
+
**Reranking Leaderboard 🥈**
|
382 |
+
|
383 |
+
- **Metric:** Mean Average Precision (MAP)
|
384 |
+
- **Languages:** English
|
385 |
+
""")
|
386 |
with gr.Row():
|
387 |
data_reranking = gr.components.Dataframe(
|
388 |
DATA_RERANKING,
|
|
|
400 |
with gr.TabItem("STS"):
|
401 |
with gr.TabItem("English"):
|
402 |
with gr.Row():
|
403 |
+
gr.Markdown("""
|
404 |
+
**STS Leaderboard 🤖**
|
405 |
+
|
406 |
+
- **Metric:** Spearman correlation based on cosine similarity
|
407 |
+
- **Languages:** English
|
408 |
+
""")
|
409 |
with gr.Row():
|
410 |
data_sts_en = gr.components.Dataframe(
|
411 |
DATA_STS_EN,
|
|
|
424 |
)
|
425 |
with gr.TabItem("Multilingual"):
|
426 |
with gr.Row():
|
427 |
+
gr.Markdown("""
|
428 |
+
**STS Multilingual Leaderboard 👽**
|
429 |
+
|
430 |
+
- **Metric:** Spearman correlation based on cosine similarity
|
431 |
+
- **Languages:** Arabic, Chinese, Dutch, English, French, German, Italian, Korean, Polish, Russian, Spanish
|
432 |
+
""")
|
433 |
with gr.Row():
|
434 |
data_sts = gr.components.Dataframe(
|
435 |
datatype=["markdown"] * 50, # hack when we don't know how many columns
|
|
|
441 |
data_run.click(get_mteb_data, inputs=[task_sts], outputs=data_sts)
|
442 |
with gr.TabItem("Summarization"):
|
443 |
with gr.Row():
|
444 |
+
gr.Markdown("""
|
445 |
+
**Summarization Leaderboard 📜**
|
446 |
+
|
447 |
+
- **Metric:** Spearman correlation based on cosine similarity
|
448 |
+
- **Languages:** English
|
449 |
+
""")
|
450 |
with gr.Row():
|
451 |
data_summarization = gr.components.Dataframe(
|
452 |
DATA_SUMMARIZATION,
|
|
|
462 |
inputs=[task_summarization],
|
463 |
outputs=data_summarization,
|
464 |
)
|
465 |
+
# Running the function on page load in addition to when the button is clicked
|
466 |
+
# This is optional - if deactivated, the data loaded at "Build time" is shown instead, as for the Overall tab
|
467 |
block.load(get_mteb_data, inputs=[task_bitext_mining], outputs=data_bitext_mining)
|
468 |
block.load(get_mteb_data, inputs=[task_classification_en, lang_classification_en], outputs=data_classification_en)
|
469 |
block.load(get_mteb_data, inputs=[task_classification], outputs=data_classification)
|
470 |
block.load(get_mteb_data, inputs=[task_clustering], outputs=data_clustering)
|
471 |
block.load(get_mteb_data, inputs=[task_retrieval], outputs=data_retrieval)
|
472 |
block.load(get_mteb_data, inputs=[task_reranking], outputs=data_reranking)
|
473 |
+
block.load(get_mteb_data, inputs=[task_sts_en], outputs=data_sts_en)
|
474 |
block.load(get_mteb_data, inputs=[task_sts], outputs=data_sts)
|
475 |
block.load(get_mteb_data, inputs=[task_summarization], outputs=data_summarization)
|
476 |
|