Spaces: eduagarcia committed · Commit 9b95b87 · Parent(s): b787f43

Adapt code to work with the Open Portuguese LLM leaderboard

Files changed:
- app.py +9 -7
- functions.py +98 -67
- openllm.py +44 -0
app.py
CHANGED

@@ -1,6 +1,5 @@
 import os
 import time
-os.system("wget https://raw.githubusercontent.com/Weyaxi/scrape-open-llm-leaderboard/main/openllm.py")
 from huggingface_hub import HfApi, HfFileSystem
 import time
 import pandas as pd
@@ -21,17 +20,20 @@ fs = HfFileSystem()
 def refresh(how_much=3600): # default to 1 hour
     time.sleep(how_much)
     try:
-        api.restart_space(repo_id="
+        api.restart_space(repo_id="eduagarcia-temp/portuguese-leaderboard-results-to-modelcard")
     except Exception as e:
         print(f"Error while scraping leaderboard, trying again... {e}")
         refresh(600) # 10 minutes if any error happens
 
-gradio_title="🧐 Open LLM Leaderboard Results PR Opener"
-gradio_desc= """
+gradio_title="🧐 Open Portuguese LLM Leaderboard Results PR Opener"
+gradio_desc= """
+This is a fork of the [🧐 Open LLM Leaderboard Results PR Opener
+](https://huggingface.co/spaces/Weyaxi/leaderboard-results-to-modelcard) from [@Weyaxi](https://huggingface.co/Weyaxi), modified to work with the [Open Portuguese LLM Leaderboard](https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard).
+🎯 This tool's aim is to provide [Open Portuguese LLM Leaderboard](https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard) results in the model card.
 
 ## 💭 What Does This Tool Do:
 
-- This tool adds the [Open LLM Leaderboard](https://huggingface.co/spaces/
+- This tool adds the [Open Portuguese LLM Leaderboard](https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard) result of your model at the end of your model card.
 
 - This tool also adds evaluation results as your model's metadata to showcase the evaluation results as a widget.
 
@@ -41,9 +43,9 @@ The leaderboard's backend mainly runs on the [Hugging Face Hub API](https://hugg
 
 ## 🤝 Acknowledgements
 
-
-
-- Special thanks to [Lucain Pouget (Wauplin)](https://huggingface.co/Wauplin) for assisting with the [Hugging Face Hub API](https://huggingface.co/docs/huggingface_hub/v0.5.1/en/package_reference/hf_api).
+- Thanks to [Yağız Çalık (Weyaxi)](https://huggingface.co/Weyaxi) for creating the original [🧐 Open LLM Leaderboard Results PR Opener
+](https://huggingface.co/spaces/Weyaxi/leaderboard-results-to-modelcard) tool.
 """
 
 with gr.Blocks() as demo:
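The description above promises two things: a results section appended to the model card, and evaluation metadata that the Hub renders as a widget. The diff never shows the metadata payload itself, so the following is only a rough sketch of the kind of model-index update involved, assuming huggingface_hub's metadata_update; the repo id and score are hypothetical, while the task/dataset/metric identifiers mirror the ENEM entry from functions.py below.

    from huggingface_hub import metadata_update

    # Hypothetical repo id and score, for illustration only.
    metadata = {
        "model-index": [{
            "name": "my-pt-model",
            "results": [{
                "task": {"type": "text-generation", "name": "Text Generation"},
                "dataset": {
                    "type": "enem_challenge",   # identifiers taken from get_task_summary below
                    "name": "ENEM Challenge",
                    "split": "train",
                    "args": {"num_few_shot": 3},
                },
                "metrics": [{"type": "acc", "value": 63.5, "name": "accuracy"}],
                "source": {
                    "name": "Open Portuguese LLM Leaderboard",
                    "url": "https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard",
                },
            }],
        }]
    }

    # create_pr=True opens a pull request on the model repo instead of pushing directly.
    metadata_update("someuser/my-pt-model", metadata, create_pr=True)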
functions.py
CHANGED

@@ -15,11 +15,11 @@ finished_models = get_datas(data)
 df = pd.DataFrame(finished_models)
 
 desc = """
-This is an automated PR created with https://huggingface.co/spaces/
+This is an automated PR created with https://huggingface.co/spaces/eduagarcia-temp/portuguese-leaderboard-results-to-modelcard
 
-The purpose of this PR is to add evaluation results from the Open LLM Leaderboard to your model card.
+The purpose of this PR is to add evaluation results from the Open Portuguese LLM Leaderboard to your model card.
 
-If you encounter any issues, please report them to https://huggingface.co/spaces/
+If you encounter any issues, please report them to https://huggingface.co/spaces/eduagarcia-temp/portuguese-leaderboard-results-to-modelcard/discussions
 """
 
 def search(df, value):
@@ -28,84 +28,115 @@
 
 
 def get_details_url(repo):
-    author, model = repo.split("/")
-    return f"https://huggingface.co/datasets/
+    #author, model = repo.split("/")
+    return f"https://huggingface.co/datasets/eduagarcia-temp/llm_pt_leaderboard_raw_results/tree/main/{repo}"
 
 
 def get_query_url(repo):
-    return f"https://huggingface.co/spaces/
+    return f"https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard?query={repo}"
 
 
 def get_task_summary(results):
     return {
-        "
-            {"dataset_type":"
-             "dataset_name":"
-             "metric_type":"
-             "metric_value":results["
-             "dataset_config":
-             "dataset_split":"
+        "ENEM":
+            {"dataset_type":"enem_challenge",
+             "dataset_name":"ENEM Challenge",
+             "metric_type":"acc",
+             "metric_value":results["ENEM"],
+             "dataset_config": None,
+             "dataset_split":"train",
              "dataset_revision":None,
-             "dataset_args":{"num_few_shot":
-             "metric_name":"
+             "dataset_args":{"num_few_shot": 3},
+             "metric_name":"accuracy"
            },
-        "
-            {"dataset_type":"
-             "dataset_name":"
-             "metric_type":"
-             "metric_value":results["
-             "dataset_config":None,
-             "dataset_split":"
+        "BLUEX":
+            {"dataset_type":"bluex",
+             "dataset_name":"BLUEX",
+             "metric_type":"acc",
+             "metric_value":results["BLUEX"],
+             "dataset_config": None,
+             "dataset_split":"train",
              "dataset_revision":None,
-             "dataset_args":{"num_few_shot":
-             "metric_name":"
+             "dataset_args":{"num_few_shot": 3},
+             "metric_name":"accuracy"
            },
-        "
-
-            "
-             "dataset_name":"MMLU (5-Shot)",
+        "OAB Exams":
+            {"dataset_type":"oab_exams",
+             "dataset_name":"OAB Exams",
             "metric_type":"acc",
-             "metric_value":results["
-             "dataset_config":
-             "dataset_split":"
+             "metric_value":results["OAB Exams"],
+             "dataset_config": None,
+             "dataset_split":"train",
              "dataset_revision":None,
-             "dataset_args":{"num_few_shot":
+             "dataset_args":{"num_few_shot": 3},
             "metric_name":"accuracy"
-
-        "
-
-            "
-             "
-             "
-             "
-             "
-             "dataset_split":"validation",
+            },
+        "ASSIN2 RTE":
+            {"dataset_type":"assin2_rte",
+             "dataset_name":"ASSIN2 RTE",
+             "metric_type":"f1_macro",
+             "metric_value":results["ASSIN2 RTE"],
+             "dataset_config": None,
+             "dataset_split":"test",
              "dataset_revision":None,
-             "dataset_args":{"num_few_shot":
-             "metric_name":
-
-        "
-
-            "
-             "
-             "
-             "
-             "dataset_config":"winogrande_xl",
-             "dataset_split":"validation",
-             "dataset_args":{"num_few_shot": 5},
-             "metric_name":"accuracy"
-            },
-        "GSM8K":
-            {
-             "dataset_type":"gsm8k",
-             "dataset_name":"GSM8k (5-shot)",
-             "metric_type":"acc",
-             "metric_value":results["GSM8K"],
-             "dataset_config":"main",
+             "dataset_args":{"num_few_shot": 15},
+             "metric_name":"f1-macro"
+            },
+        "ASSIN2 STS":
+            {"dataset_type":"assin2_sts",
+             "dataset_name":"ASSIN2 STS",
+             "metric_type":"pearson",
+             "metric_value":results["ASSIN2 STS"],
+             "dataset_config": None,
             "dataset_split":"test",
-             "
-             "
-
+             "dataset_revision":None,
+             "dataset_args":{"num_few_shot": 15},
+             "metric_name":"pearson"
+            },
+        "FAQUAD NLI":
+            {"dataset_type":"fquad_nli",
+             "dataset_name":"FAQUAD NLI",
+             "metric_type":"f1_macro",
+             "metric_value":results["FAQUAD NLI"],
+             "dataset_config": None,
+             "dataset_split":"test",
+             "dataset_revision":None,
+             "dataset_args":{"num_few_shot": 15},
+             "metric_name":"f1-macro"
+            },
+        "HateBR":
+            {"dataset_type":"hatebr_offensive",
+             "dataset_name":"HateBR",
+             "metric_type":"f1_macro",
+             "metric_value":results["HateBR"],
+             "dataset_config": None,
+             "dataset_split":"test",
+             "dataset_revision":None,
+             "dataset_args":{"num_few_shot": 25},
+             "metric_name":"f1-macro"
+            },
+        "PT Hate Speech":
+            {"dataset_type":"portuguese_hate_speech",
+             "dataset_name":"PT Hate Speech",
+             "metric_type":"f1_macro",
+             "metric_value":results["PT Hate Speech"],
+             "dataset_config": None,
+             "dataset_split":"test",
+             "dataset_revision":None,
+             "dataset_args":{"num_few_shot": 25},
+             "metric_name":"f1-macro"
+            },
+        "tweetSentBR":
+            {"dataset_type":"tweetsentbr",
+             "dataset_name":"tweetSentBR",
+             "metric_type":"f1_macro",
+             "metric_value":results["tweetSentBR"],
+             "dataset_config": None,
+             "dataset_split":"test",
+             "dataset_revision":None,
+             "dataset_args":{"num_few_shot": 25},
+             "metric_name":"f1-macro"
+            }
     }
 
 
@@ -118,7 +149,7 @@ def get_eval_results(repo):
     md_writer.value_matrix = [["Avg.", results['Average ⬆️']]] + [[v["dataset_name"], v["metric_value"]] for v in task_summary.values()]
 
     text = f"""
-# [Open LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/
+# [Open Portuguese LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard)
 Detailed results can be found [here]({get_details_url(repo)})
 
 {md_writer.dumps()}
@@ -130,7 +161,7 @@ def get_edited_yaml_readme(repo, token: str | None):
     card = ModelCard.load(repo, token=token)
     results = search(df, repo)
 
-    common = {"task_type": 'text-generation', "task_name": 'Text Generation', "source_name": "Open LLM Leaderboard", "source_url":
+    common = {"task_type": 'text-generation', "task_name": 'Text Generation', "source_name": "Open Portuguese LLM Leaderboard", "source_url": get_query_url(repo)}
 
     tasks_results = get_task_summary(results)
 
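The hunks above only build plain dicts: common carries the task and source fields, and get_task_summary carries the per-dataset fields. The code that turns them into card metadata sits outside this diff, but the dict keys line up one-to-one with the constructor arguments of huggingface_hub's EvalResult, so a plausible sketch of how one task entry becomes a structured eval result (with a made-up score) is:

    from huggingface_hub import EvalResult

    # One entry of the shape get_task_summary returns; the metric value is made up.
    task = {"dataset_type": "enem_challenge", "dataset_name": "ENEM Challenge",
            "metric_type": "acc", "metric_value": 63.5,
            "dataset_config": None, "dataset_split": "train", "dataset_revision": None,
            "dataset_args": {"num_few_shot": 3}, "metric_name": "accuracy"}

    common = {"task_type": "text-generation", "task_name": "Text Generation",
              "source_name": "Open Portuguese LLM Leaderboard",
              "source_url": "https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard?query=someuser/my-pt-model"}

    # Both dicts splat directly into EvalResult because the key names match its kwargs.
    eval_result = EvalResult(**common, **task)
    print(eval_result.dataset_name, eval_result.metric_value)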
openllm.py
ADDED

@@ -0,0 +1,44 @@
+import requests
+from bs4 import BeautifulSoup
+import pandas as pd
+import json
+
+
+def get_json_format_data():
+    url = 'https://eduagarcia-open-pt-llm-leaderboard.hf.space/'
+    response = requests.get(url)
+    soup = BeautifulSoup(response.content, 'html.parser')
+
+    script_elements = soup.find_all('script')
+    json_format_data = json.loads(str(script_elements[1])[31:-10])
+    return json_format_data
+
+
+def get_datas(data):
+    for component_index in range(10, 50, 1): # component_index sometimes changes when they update the space, so this loop avoids having to change the component index manually
+        try:
+            result_list = []
+            i = 0
+            while True:
+                try:
+                    results = data['components'][component_index]['props']['value']['data'][i]
+                    columns = data['components'][component_index]['props']['headers']
+                    try:
+                        results_json = {"T": results[0], "Model": results[-1]}
+
+                        if len(columns) < 15: # If there are fewer than 15 columns (this number can definitely change), we are trying the wrong component index, so break the loop to try the next one.
+                            break
+
+                        for col_index, col_name in enumerate(columns[2:-1], start=2):
+                            results_json[col_name] = results[col_index]
+
+                    except IndexError: # Wrong component index, so break the loop to try the next one. (NOTE: More than one component index can yield some results, but we must find the right one to get all the results we want.)
+                        break
+                    result_list.append(results_json)
+                    i += 1
+                except IndexError: # No rows left to extract, so return the list. (We know it is the right component index because we didn't break out of the loop on the other exception.)
+                    return result_list
+        except (KeyError, TypeError):
+            continue
+
+    return result_list
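functions.py drives this module at import time: its first hunk header shows finished_models = get_datas(data) feeding the module-level DataFrame. A standalone smoke test of the scraper, assuming the Space above is reachable, would look roughly like:

    import pandas as pd

    from openllm import get_datas, get_json_format_data

    data = get_json_format_data()      # Gradio config JSON embedded in the Space's HTML
    finished_models = get_datas(data)  # probe component indices until the results table is found
    df = pd.DataFrame(finished_models)
    print(df.columns.tolist())         # leaderboard columns, one row per evaluated model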