open_pt_llm_leaderboard

Running on CPU Upgrade

App Files Files Community

eduagarcia committed on Feb 15

Commit

67cd6fc

•

1 Parent(s): a69553b

Better about's

Browse files

Files changed (3) hide show

app.py +2 -2
src/display/about.py +26 -5
tasks_config/pt_config.yaml +32 -1

app.py CHANGED Viewed

@@ -453,7 +453,7 @@ with demo:
                 ],
                 submission_result,
             )
     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
             citation_button = gr.Textbox(
@@ -463,7 +463,7 @@ with demo:
                 elem_id="citation-button",
                 show_copy_button=True,
             )
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=10800) # restarted every 3h
 scheduler.add_job(update_dynamic_files, "cron", minute=30) # launched every hour, at minute 30 (half past the hour)

                 ],
                 submission_result,
             )
+    """ #TODO: FIX CITATIONS
     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
             citation_button = gr.Textbox(
                 elem_id="citation-button",
                 show_copy_button=True,
             )
+    """
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=10800) # restarted every 3h
 scheduler.add_job(update_dynamic_files, "cron", minute=30) # launched every hour, at minute 30 (half past the hour)

src/display/about.py CHANGED Viewed

@@ -1,18 +1,30 @@
 from src.display.utils import ModelType
 from src.display.utils import Tasks
-from src.envs import REPO_ID, QUEUE_REPO, RESULTS_REPO, PATH_TO_COLLECTION, LEADERBOARD_NAME, TRUST_REMOTE_CODE
 LM_EVAL_URL = "https://github.com/eduagarcia/lm-evaluation-harness-pt"
 TITLE = F"""<h1 align="center" id="space-title">🚀 {LEADERBOARD_NAME}</h1>"""
 INTRODUCTION_TEXT = f"""
-📐 The 🚀 {LEADERBOARD_NAME} aims to track, rank and evaluate open LLMs and chatbots.
 This is a fork of the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard" target="_blank">🤗 Open LLM Leaderboard</a> with different benchmarks.
 Submit a model for automated evaluation on our GPU cluster on the "Submit" page!
-The leaderboard's backend runs the great [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness) - read more details in the "About" page!
 """
 task_count = 0
 TASKS_LIST= ""
@@ -26,11 +38,20 @@ for task in Tasks:
       task = task.value
       TASKS_PARAMETERS += f"- {task.col_name}: {task.few_shot}-shot, *{','.join(task.task_list)}* (`{task.metric}`)\n"
-LLM_BENCHMARKS_TEXT = f"""
-# Context
 This is a fork of the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard" target="_blank">🤗 Open LLM Leaderboard</a> with different benchmarks.
 With the plethora of large language models (LLMs) and chatbots being released week upon week, often with grandiose claims of their performance, it can be hard to filter out the genuine progress that is being made by the open-source community and which model is the current state of the art.
 ## How it works

 from src.display.utils import ModelType
 from src.display.utils import Tasks
+from src.envs import REPO_ID, QUEUE_REPO, RESULTS_REPO, PATH_TO_COLLECTION, LEADERBOARD_NAME, TRUST_REMOTE_CODE, TASK_CONFIG
 LM_EVAL_URL = "https://github.com/eduagarcia/lm-evaluation-harness-pt"
 TITLE = F"""<h1 align="center" id="space-title">🚀 {LEADERBOARD_NAME}</h1>"""
+GENERAL_DESCRIPTION = f"📐 The 🚀 {LEADERBOARD_NAME} aims to track, rank and evaluate open LLMs and chatbots. "
+SUPPORT_DESCRIPTION = ""
+if 'readme' in TASK_CONFIG:
+      if 'general_description' in TASK_CONFIG['readme']:
+            GENERAL_DESCRIPTION = TASK_CONFIG['readme']['general_description']
+      if 'support_description' in TASK_CONFIG['readme']:
+            SUPPORT_DESCRIPTION = TASK_CONFIG['readme']['support_description']
 INTRODUCTION_TEXT = f"""
+{GENERAL_DESCRIPTION}
 This is a fork of the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard" target="_blank">🤗 Open LLM Leaderboard</a> with different benchmarks.
 Submit a model for automated evaluation on our GPU cluster on the "Submit" page!
+The leaderboard's backend runs the great [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness) - read more details in the "About" page!
+{SUPPORT_DESCRIPTION}
 """
 task_count = 0
 TASKS_LIST= ""
       task = task.value
       TASKS_PARAMETERS += f"- {task.col_name}: {task.few_shot}-shot, *{','.join(task.task_list)}* (`{task.metric}`)\n"
+ABOUT_DESCRIPTION_CONTEXT = f"""
 This is a fork of the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard" target="_blank">🤗 Open LLM Leaderboard</a> with different benchmarks.
 With the plethora of large language models (LLMs) and chatbots being released week upon week, often with grandiose claims of their performance, it can be hard to filter out the genuine progress that is being made by the open-source community and which model is the current state of the art.
+"""
+if 'readme' in TASK_CONFIG:
+      if 'about_description' in TASK_CONFIG['readme']:
+            ABOUT_DESCRIPTION_CONTEXT = TASK_CONFIG['readme']['about_description']
+LLM_BENCHMARKS_TEXT = f"""
+# Context
+{ABOUT_DESCRIPTION_CONTEXT}
 ## How it works

tasks_config/pt_config.yaml CHANGED Viewed

@@ -10,6 +10,37 @@ config:
   LEADERBOARD_NAME: "Open PT-LLM Leaderboard"
   GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS: true
   TRUST_REMOTE_CODE: true
 tasks:
   enem_challenge:
     benchmark: enem_challenge
@@ -138,7 +169,7 @@ tasks:
     - sparrow_sentiment-2016-mozetic-por
     - sparrow_sentiment-2018-brum-por
     metric: f1_macro
-    few_shot: 15
     limit: 500
     baseline: 29.5 #random baseline [3.3, 48.8, 33.1, 33.0]
     human_baseline: null

   LEADERBOARD_NAME: "Open PT-LLM Leaderboard"
   GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS: true
   TRUST_REMOTE_CODE: true
+readme:
+  general_description: |
+    📐 The 🚀 Open PT-LLM Leaderboard aims to provide a benchmark for the evaluation of
+    Large Language Models (LLMs) in the Portuguese language across a variety of tasks
+    and datasets.
+    The leaderboard is open to submissions of models from the community
+    and is designed to be a resource for researchers, practitioners, and enthusiasts
+    interested in the development and evaluation of LLMs for the Portuguese language.
+    If you have any questions, suggestions, or would like to contribute to the leaderboard,
+    please feel free to reach out at [@eduagarcia](https://linktr.ee/eduagarcia).
+  support_description: |
+    This leaderboard is made possible by the support of the
+    [Center of Excellence in AI (CEIA)](https://ceia.ufg.br/) at the
+    [Federal University of Goiás (UFG)](https://international.ufg.br/).
+  about_description: |
+    The 🚀 Open PT-LLM Leaderboard is a benchmark for the evaluation of
+    Large Language Models (LLMs) in the Portuguese language.
+    The leaderboard is open to submissions of models from the community and
+    is designed to be a resource for researchers, practitioners, and enthusiasts interested
+    in the development and evaluation of LLMs for the Portuguese language.
+    Supported by the [Center of Excellence in AI (CEIA)](https://ceia.ufg.br/) at the
+    [Federal University of Goiás (UFG)](https://international.ufg.br/), this leaderboard
+    operates on a backend of Nvidia A100-80G GPUs. Evaluations are subject to
+    resource availability, which is not exclusive. Therefore, please be patient if
+    your model is in the queue. If you'd like to support the leaderboard, feel free to
+    reach out.
+    This is a fork of the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard" target="_blank">🤗 Open LLM Leaderboard</a> with
+    Portuguese benchmarks.
 tasks:
   enem_challenge:
     benchmark: enem_challenge
     - sparrow_sentiment-2016-mozetic-por
     - sparrow_sentiment-2018-brum-por
     metric: f1_macro
+    few_shot: 25
     limit: 500
     baseline: 29.5 #random baseline [3.3, 48.8, 33.1, 33.0]
     human_baseline: null