Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
eduagarcia
committed on
Commit
•
67cd6fc
1
Parent(s):
a69553b
Better about's
Browse files- app.py +2 -2
- src/display/about.py +26 -5
- tasks_config/pt_config.yaml +32 -1
app.py
CHANGED
@@ -453,7 +453,7 @@ with demo:
|
|
453 |
],
|
454 |
submission_result,
|
455 |
)
|
456 |
-
|
457 |
with gr.Row():
|
458 |
with gr.Accordion("📙 Citation", open=False):
|
459 |
citation_button = gr.Textbox(
|
@@ -463,7 +463,7 @@ with demo:
|
|
463 |
elem_id="citation-button",
|
464 |
show_copy_button=True,
|
465 |
)
|
466 |
-
|
467 |
scheduler = BackgroundScheduler()
|
468 |
scheduler.add_job(restart_space, "interval", seconds=10800) # restarted every 3h
|
469 |
scheduler.add_job(update_dynamic_files, "cron", minute=30) # launched every hour, at minute 30
|
|
|
453 |
],
|
454 |
submission_result,
|
455 |
)
|
456 |
+
""" #TODO: FIX CITATIONS
|
457 |
with gr.Row():
|
458 |
with gr.Accordion("📙 Citation", open=False):
|
459 |
citation_button = gr.Textbox(
|
|
|
463 |
elem_id="citation-button",
|
464 |
show_copy_button=True,
|
465 |
)
|
466 |
+
"""
|
467 |
scheduler = BackgroundScheduler()
|
468 |
scheduler.add_job(restart_space, "interval", seconds=10800) # restarted every 3h
|
469 |
scheduler.add_job(update_dynamic_files, "cron", minute=30) # launched every hour, at minute 30
|
src/display/about.py
CHANGED
@@ -1,18 +1,30 @@
|
|
1 |
from src.display.utils import ModelType
|
2 |
from src.display.utils import Tasks
|
3 |
-
from src.envs import REPO_ID, QUEUE_REPO, RESULTS_REPO, PATH_TO_COLLECTION, LEADERBOARD_NAME, TRUST_REMOTE_CODE
|
4 |
|
5 |
LM_EVAL_URL = "https://github.com/eduagarcia/lm-evaluation-harness-pt"
|
6 |
|
7 |
TITLE = F"""<h1 align="center" id="space-title">🚀 {LEADERBOARD_NAME}</h1>"""
|
8 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
INTRODUCTION_TEXT = f"""
|
10 |
-
|
11 |
|
12 |
This is a fork of the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard" target="_blank">🤗 Open LLM Leaderboard</a> with different benchmarks.
|
13 |
|
14 |
Submit a model for automated evaluation on our GPU cluster on the "Submit" page!
|
15 |
-
The leaderboard's backend runs the great [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness) - read more details in the "About" page!
|
|
|
|
|
16 |
"""
|
17 |
task_count = 0
|
18 |
TASKS_LIST= ""
|
@@ -26,11 +38,20 @@ for task in Tasks:
|
|
26 |
task = task.value
|
27 |
TASKS_PARAMETERS += f"- {task.col_name}: {task.few_shot}-shot, *{','.join(task.task_list)}* (`{task.metric}`)\n"
|
28 |
|
29 |
-
|
30 |
-
# Context
|
31 |
This is a fork of the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard" target="_blank">🤗 Open LLM Leaderboard</a> with different benchmarks.
|
32 |
|
33 |
With the plethora of large language models (LLMs) and chatbots being released week upon week, often with grandiose claims of their performance, it can be hard to filter out the genuine progress that is being made by the open-source community and which model is the current state of the art.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
|
35 |
## How it works
|
36 |
|
|
|
1 |
from src.display.utils import ModelType
|
2 |
from src.display.utils import Tasks
|
3 |
+
from src.envs import REPO_ID, QUEUE_REPO, RESULTS_REPO, PATH_TO_COLLECTION, LEADERBOARD_NAME, TRUST_REMOTE_CODE, TASK_CONFIG
|
4 |
|
5 |
LM_EVAL_URL = "https://github.com/eduagarcia/lm-evaluation-harness-pt"
|
6 |
|
7 |
TITLE = F"""<h1 align="center" id="space-title">🚀 {LEADERBOARD_NAME}</h1>"""
|
8 |
|
9 |
+
GENERAL_DESCRIPTION = f"📐 The 🚀 {LEADERBOARD_NAME} aims to track, rank and evaluate open LLMs and chatbots. "
|
10 |
+
|
11 |
+
SUPPORT_DESCRIPTION = ""
|
12 |
+
|
13 |
+
if 'readme' in TASK_CONFIG:
|
14 |
+
if 'general_description' in TASK_CONFIG['readme']:
|
15 |
+
GENERAL_DESCRIPTION = TASK_CONFIG['readme']['general_description']
|
16 |
+
if 'support_description' in TASK_CONFIG['readme']:
|
17 |
+
SUPPORT_DESCRIPTION = TASK_CONFIG['readme']['support_description']
|
18 |
+
|
19 |
INTRODUCTION_TEXT = f"""
|
20 |
+
{GENERAL_DESCRIPTION}
|
21 |
|
22 |
This is a fork of the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard" target="_blank">🤗 Open LLM Leaderboard</a> with different benchmarks.
|
23 |
|
24 |
Submit a model for automated evaluation on our GPU cluster on the "Submit" page!
|
25 |
+
The leaderboard's backend runs the great [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness) - read more details in the "About" page!
|
26 |
+
|
27 |
+
{SUPPORT_DESCRIPTION}
|
28 |
"""
|
29 |
task_count = 0
|
30 |
TASKS_LIST= ""
|
|
|
38 |
task = task.value
|
39 |
TASKS_PARAMETERS += f"- {task.col_name}: {task.few_shot}-shot, *{','.join(task.task_list)}* (`{task.metric}`)\n"
|
40 |
|
41 |
+
ABOUT_DESCRIPTION_CONTEXT = f"""
|
|
|
42 |
This is a fork of the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard" target="_blank">🤗 Open LLM Leaderboard</a> with different benchmarks.
|
43 |
|
44 |
With the plethora of large language models (LLMs) and chatbots being released week upon week, often with grandiose claims of their performance, it can be hard to filter out the genuine progress that is being made by the open-source community and which model is the current state of the art.
|
45 |
+
"""
|
46 |
+
|
47 |
+
if 'readme' in TASK_CONFIG:
|
48 |
+
if 'about_description' in TASK_CONFIG['readme']:
|
49 |
+
ABOUT_DESCRIPTION_CONTEXT = TASK_CONFIG['readme']['about_description']
|
50 |
+
|
51 |
+
LLM_BENCHMARKS_TEXT = f"""
|
52 |
+
# Context
|
53 |
+
|
54 |
+
{ABOUT_DESCRIPTION_CONTEXT}
|
55 |
|
56 |
## How it works
|
57 |
|
tasks_config/pt_config.yaml
CHANGED
@@ -10,6 +10,37 @@ config:
|
|
10 |
LEADERBOARD_NAME: "Open PT-LLM Leaderboard"
|
11 |
GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS: true
|
12 |
TRUST_REMOTE_CODE: true
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
tasks:
|
14 |
enem_challenge:
|
15 |
benchmark: enem_challenge
|
@@ -138,7 +169,7 @@ tasks:
|
|
138 |
- sparrow_sentiment-2016-mozetic-por
|
139 |
- sparrow_sentiment-2018-brum-por
|
140 |
metric: f1_macro
|
141 |
-
few_shot:
|
142 |
limit: 500
|
143 |
baseline: 29.5 #random baseline [3.3, 48.8, 33.1, 33.0]
|
144 |
human_baseline: null
|
|
|
10 |
LEADERBOARD_NAME: "Open PT-LLM Leaderboard"
|
11 |
GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS: true
|
12 |
TRUST_REMOTE_CODE: true
|
13 |
+
readme:
|
14 |
+
general_description: |
|
15 |
+
📐 The 🚀 Open PT LLM Leaderboard aims to provide a benchmark for the evaluation of
|
16 |
+
Large Language Models (LLMs) in the Portuguese language across a variety of tasks
|
17 |
+
and datasets.
|
18 |
+
The leaderboard is open to submissions of models from the community
|
19 |
+
and is designed to be a resource for researchers, practitioners, and enthusiasts
|
20 |
+
interested in the development and evaluation of LLMs for the Portuguese language.
|
21 |
+
If you have any questions, suggestions, or would like to contribute to the leaderboard,
|
22 |
+
please feel free to reach out at [@eduagarcia](https://linktr.ee/eduagarcia).
|
23 |
+
support_description: |
|
24 |
+
This leaderboard is made possible by the support of the
|
25 |
+
[Center of Excellence in AI (CEIA)](https://ceia.ufg.br/) at the
|
26 |
+
[Federal University of Goiás (UFG)](https://international.ufg.br/).
|
27 |
+
about_description: |
|
28 |
+
The 🚀 Open PT-LLM Leaderboard is a benchmark for the evaluation of
|
29 |
+
Large Language Models (LLMs) in the Portuguese language.
|
30 |
+
|
31 |
+
The leaderboard is open to submissions of models from the community and
|
32 |
+
is designed to be a resource for researchers, practitioners, and enthusiasts interested
|
33 |
+
in the development and evaluation of LLMs for the Portuguese language.
|
34 |
+
|
35 |
+
Supported by the [Center of Excellence in AI (CEIA)](https://ceia.ufg.br/) at the
|
36 |
+
[Federal University of Goiás (UFG)](https://international.ufg.br/), this leaderboard
|
37 |
+
operates on a backend of Nvidia A100-80G GPUs. Evaluations are subject to
|
38 |
+
resource availability, which is not exclusive. Therefore, please be patient if
|
39 |
+
your model is in the queue. If you'd like to support the leaderboard, feel free to
|
40 |
+
reach out.
|
41 |
+
|
42 |
+
This is a fork of the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard" target="_blank">🤗 Open LLM Leaderboard</a> with
|
43 |
+
Portuguese benchmarks.
|
44 |
tasks:
|
45 |
enem_challenge:
|
46 |
benchmark: enem_challenge
|
|
|
169 |
- sparrow_sentiment-2016-mozetic-por
|
170 |
- sparrow_sentiment-2018-brum-por
|
171 |
metric: f1_macro
|
172 |
+
few_shot: 25
|
173 |
limit: 500
|
174 |
baseline: 29.5 #random baseline [3.3, 48.8, 33.1, 33.0]
|
175 |
human_baseline: null
|