eduagarcia committed
Commit
9b95b87
1 Parent(s): b787f43

Adapt code to work with the Open Portuguese LLM leaderboard

Files changed (3):
  1. app.py +9 -7
  2. functions.py +98 -67
  3. openllm.py +44 -0
app.py CHANGED
@@ -1,6 +1,5 @@
 import os
 import time
-os.system("wget https://raw.githubusercontent.com/Weyaxi/scrape-open-llm-leaderboard/main/openllm.py")
 from huggingface_hub import HfApi, HfFileSystem
 import time
 import pandas as pd
@@ -21,17 +20,20 @@ fs = HfFileSystem()
 def refresh(how_much=3600): # default to 1 hour
     time.sleep(how_much)
     try:
-        api.restart_space(repo_id="Weyaxi/leaderboard-results-to-modelcard")
+        api.restart_space(repo_id="eduagarcia-temp/portuguese-leaderboard-results-to-modelcard")
     except Exception as e:
         print(f"Error while scraping leaderboard, trying again... {e}")
         refresh(600) # 10 minutes if any error happens
 
-gradio_title="🧐 Open LLM Leaderboard Results PR Opener"
-gradio_desc= """🎯 This tool's aim is to provide [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) results in the model card.
+gradio_title="🧐 Open Portuguese LLM Leaderboard Results PR Opener"
+gradio_desc= """
+This is a fork of the [🧐 Open LLM Leaderboard Results PR Opener
+](https://huggingface.co/spaces/Weyaxi/leaderboard-results-to-modelcard) from [@Weyaxi](https://huggingface.co/Weyaxi), modified to work with the [Open Portuguese LLM Leaderboard](https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard).
+🎯 This tool's aim is to provide [Open Portuguese LLM Leaderboard](https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard) results in the model card.
 
 ## 💭 What Does This Tool Do:
 
-- This tool adds the [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) result of your model at the end of your model card.
+- This tool adds the [Open Portuguese LLM Leaderboard](https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard) result of your model at the end of your model card.
 
 - This tool also adds evaluation results as your model's metadata to showcase the evaluation results as a widget.
 
@@ -41,9 +43,9 @@ The leaderboard's backend mainly runs on the [Hugging Face Hub API](https://hugg
 
 ## 🤝 Acknowledgements
 
-- Special thanks to [Clémentine Fourrier (clefourrier)](https://huggingface.co/clefourrier) for her help and contributions to the code.
-
-- Special thanks to [Lucain Pouget (Wauplin)](https://huggingface.co/Wauplin) for assisting with the [Hugging Face Hub API](https://huggingface.co/docs/huggingface_hub/v0.5.1/en/package_reference/hf_api).
+- Thanks to [Yağız Çalık (Weyaxi)](https://huggingface.co/Weyaxi) for creating the original [🧐 Open LLM Leaderboard Results PR Opener
+](https://huggingface.co/spaces/Weyaxi/leaderboard-results-to-modelcard) tool.
+
 """
 
 with gr.Blocks() as demo:
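The description above says the tool "adds evaluation results as your model's metadata to showcase the evaluation results as a widget"; that metadata is the card's `model-index` section. Below is a minimal sketch of what such an entry looks like when built with `huggingface_hub`'s `EvalResult`. The model name and score are made-up illustrations; the commit assembles the real entries inside `functions.py` (see `get_edited_yaml_readme` below), not with this exact code.

```python
# Sketch only: shows the shape of the model-index metadata the tool writes.
# The model name and metric value are hypothetical, not taken from the commit.
from huggingface_hub import EvalResult, ModelCardData

eval_results = [
    EvalResult(
        task_type="text-generation",
        task_name="Text Generation",
        dataset_type="enem_challenge",
        dataset_name="ENEM Challenge",
        dataset_split="train",
        dataset_args={"num_few_shot": 3},
        metric_type="acc",
        metric_name="accuracy",
        metric_value=65.0,  # hypothetical score
        source_name="Open Portuguese LLM Leaderboard",
        source_url="https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard",
    )
]

card_data = ModelCardData(model_name="your-model", eval_results=eval_results)
print(card_data.to_yaml())  # YAML block the Hub renders as an evaluation widget
```

The task, dataset, and metric fields mirror the keys returned by `get_task_summary` in `functions.py` below.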
functions.py CHANGED
@@ -15,11 +15,11 @@ finished_models = get_datas(data)
 df = pd.DataFrame(finished_models)
 
 desc = """
-This is an automated PR created with https://huggingface.co/spaces/Weyaxi/open-llm-leaderboard-results-pr
+This is an automated PR created with https://huggingface.co/spaces/eduagarcia-temp/portuguese-leaderboard-results-to-modelcard
 
-The purpose of this PR is to add evaluation results from the Open LLM Leaderboard to your model card.
+The purpose of this PR is to add evaluation results from the Open Portuguese LLM Leaderboard to your model card.
 
-If you encounter any issues, please report them to https://huggingface.co/spaces/Weyaxi/open-llm-leaderboard-results-pr/discussions
+If you encounter any issues, please report them to https://huggingface.co/spaces/eduagarcia-temp/portuguese-leaderboard-results-to-modelcard/discussions
 """
 
 def search(df, value):
@@ -28,84 +28,115 @@ def search(df, value):
 
 
 def get_details_url(repo):
-    author, model = repo.split("/")
-    return f"https://huggingface.co/datasets/open-llm-leaderboard/details_{author}__{model}"
+    #author, model = repo.split("/")
+    return f"https://huggingface.co/datasets/eduagarcia-temp/llm_pt_leaderboard_raw_results/tree/main/{repo}"
 
 
 def get_query_url(repo):
-    return f"https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query={repo}"
+    return f"https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard?query={repo}"
 
 
 def get_task_summary(results):
     return {
-        "ARC":
-            {"dataset_type":"ai2_arc",
-             "dataset_name":"AI2 Reasoning Challenge (25-Shot)",
-             "metric_type":"acc_norm",
-             "metric_value":results["ARC"],
-             "dataset_config":"ARC-Challenge",
-             "dataset_split":"test",
+        "ENEM":
+            {"dataset_type":"enem_challenge",
+             "dataset_name":"ENEM Challenge",
+             "metric_type":"acc",
+             "metric_value":results["ENEM"],
+             "dataset_config": None,
+             "dataset_split":"train",
              "dataset_revision":None,
-             "dataset_args":{"num_few_shot": 25},
-             "metric_name":"normalized accuracy"
+             "dataset_args":{"num_few_shot": 3},
+             "metric_name":"accuracy"
             },
-        "HellaSwag":
-            {"dataset_type":"hellaswag",
-             "dataset_name":"HellaSwag (10-Shot)",
-             "metric_type":"acc_norm",
-             "metric_value":results["HellaSwag"],
-             "dataset_config":None,
-             "dataset_split":"validation",
+        "BLUEX":
+            {"dataset_type":"bluex",
+             "dataset_name":"BLUEX",
+             "metric_type":"acc",
+             "metric_value":results["BLUEX"],
+             "dataset_config": None,
+             "dataset_split":"train",
             "dataset_revision":None,
-             "dataset_args":{"num_few_shot": 10},
-             "metric_name":"normalized accuracy"
+             "dataset_args":{"num_few_shot": 3},
+             "metric_name":"accuracy"
            },
-        "MMLU":
-            {
-             "dataset_type":"cais/mmlu",
-             "dataset_name":"MMLU (5-Shot)",
+        "OAB Exams":
+            {"dataset_type":"oab_exams",
+             "dataset_name":"OAB Exams",
             "metric_type":"acc",
-             "metric_value":results["MMLU"],
-             "dataset_config":"all",
-             "dataset_split":"test",
+             "metric_value":results["OAB Exams"],
+             "dataset_config": None,
+             "dataset_split":"train",
             "dataset_revision":None,
-             "dataset_args":{"num_few_shot": 5},
+             "dataset_args":{"num_few_shot": 3},
             "metric_name":"accuracy"
-            },
-        "TruthfulQA":
-            {
-             "dataset_type":"truthful_qa",
-             "dataset_name":"TruthfulQA (0-shot)",
-             "metric_type":"mc2",
-             "metric_value":results["TruthfulQA"],
-             "dataset_config":"multiple_choice",
-             "dataset_split":"validation",
+            },
+        "ASSIN2 RTE":
+            {"dataset_type":"assin2_rte",
+             "dataset_name":"ASSIN2 RTE",
+             "metric_type":"f1_macro",
+             "metric_value":results["ASSIN2 RTE"],
+             "dataset_config": None,
+             "dataset_split":"test",
             "dataset_revision":None,
-             "dataset_args":{"num_few_shot": 0},
-             "metric_name":None
-            },
-        "Winogrande":
-            {
-             "dataset_type":"winogrande",
-             "dataset_name":"Winogrande (5-shot)",
-             "metric_type":"acc",
-             "metric_value":results["Winogrande"],
-             "dataset_config":"winogrande_xl",
-             "dataset_split":"validation",
-             "dataset_args":{"num_few_shot": 5},
-             "metric_name":"accuracy"
-            },
-        "GSM8K":
-            {
-             "dataset_type":"gsm8k",
-             "dataset_name":"GSM8k (5-shot)",
-             "metric_type":"acc",
-             "metric_value":results["GSM8K"],
-             "dataset_config":"main",
+             "dataset_args":{"num_few_shot": 15},
+             "metric_name":"f1-macro"
+            },
+        "ASSIN2 STS":
+            {"dataset_type":"assin2_sts",
+             "dataset_name":"ASSIN2 STS",
+             "metric_type":"pearson",
+             "metric_value":results["ASSIN2 STS"],
+             "dataset_config": None,
             "dataset_split":"test",
+             "dataset_revision":None,
+             "dataset_args":{"num_few_shot": 15},
+             "metric_name":"pearson"
+            },
+        "FAQUAD NLI":
+            {"dataset_type":"fquad_nli",
+             "dataset_name":"FAQUAD NLI",
+             "metric_type":"f1_macro",
+             "metric_value":results["FAQUAD NLI"],
+             "dataset_config": None,
+             "dataset_split":"test",
+             "dataset_revision":None,
+             "dataset_args":{"num_few_shot": 15},
+             "metric_name":"f1-macro"
+            },
+        "HateBR":
+            {"dataset_type":"hatebr_offensive",
+             "dataset_name":"HateBR",
+             "metric_type":"f1_macro",
+             "metric_value":results["HateBR"],
+             "dataset_config": None,
+             "dataset_split":"test",
+             "dataset_revision":None,
+             "dataset_args":{"num_few_shot": 25},
+             "metric_name":"f1-macro"
+            },
+        "PT Hate Speech":
+            {"dataset_type":"portuguese_hate_speech",
+             "dataset_name":"PT Hate Speech",
+             "metric_type":"f1_macro",
+             "metric_value":results["PT Hate Speech"],
+             "dataset_config": None,
+             "dataset_split":"test",
+             "dataset_revision":None,
+             "dataset_args":{"num_few_shot": 25},
+             "metric_name":"f1-macro"
+            },
+        "tweetSentBR":
+            {"dataset_type":"tweetsentbr",
+             "dataset_name":"tweetSentBR",
+             "metric_type":"f1_macro",
+             "metric_value":results["tweetSentBR"],
+             "dataset_config": None,
+             "dataset_split":"test",
+             "dataset_revision":None,
+             "dataset_args":{"num_few_shot": 25},
+             "metric_name":"f1-macro"
+            }
     }
 
 
@@ -118,7 +149,7 @@ def get_eval_results(repo):
     md_writer.value_matrix = [["Avg.", results['Average ⬆️']]] + [[v["dataset_name"], v["metric_value"]] for v in task_summary.values()]
 
     text = f"""
-# [Open LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)
+# [Open Portuguese LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard)
 Detailed results can be found [here]({get_details_url(repo)})
 
 {md_writer.dumps()}
@@ -130,7 +161,7 @@ def get_edited_yaml_readme(repo, token: str | None):
     card = ModelCard.load(repo, token=token)
     results = search(df, repo)
 
-    common = {"task_type": 'text-generation', "task_name": 'Text Generation', "source_name": "Open LLM Leaderboard", "source_url": f"https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query={repo}"}
+    common = {"task_type": 'text-generation', "task_name": 'Text Generation', "source_name": "Open Portuguese LLM Leaderboard", "source_url": get_query_url(repo)}
 
     tasks_results = get_task_summary(results)
 
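The `md_writer.value_matrix` context line in the `get_eval_results` hunk shows how the task summary becomes the markdown table appended to the model card. Here is a hedged sketch of that step, assuming `md_writer` is a `pytablewriter.MarkdownTableWriter` (its construction and headers sit outside the hunks shown, and the scores below are invented):

```python
# Assumption: md_writer is a pytablewriter.MarkdownTableWriter; the headers and
# scores are placeholders. Only the value_matrix line mirrors the commit.
from pytablewriter import MarkdownTableWriter

results = {"Average ⬆️": 55.0, "ENEM": 61.2, "BLUEX": 48.7}  # hypothetical leaderboard row
task_summary = {
    "ENEM": {"dataset_name": "ENEM Challenge", "metric_value": results["ENEM"]},
    "BLUEX": {"dataset_name": "BLUEX", "metric_value": results["BLUEX"]},
}

md_writer = MarkdownTableWriter()
md_writer.headers = ["Metric", "Value"]
md_writer.value_matrix = [["Avg.", results["Average ⬆️"]]] + [
    [v["dataset_name"], v["metric_value"]] for v in task_summary.values()
]
print(md_writer.dumps())  # markdown table embedded in the text appended to the card
```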
openllm.py ADDED
@@ -0,0 +1,44 @@
+import requests
+from bs4 import BeautifulSoup
+import pandas as pd
+import json
+
+
+def get_json_format_data():
+    url = 'https://eduagarcia-open-pt-llm-leaderboard.hf.space/'
+    response = requests.get(url)
+    soup = BeautifulSoup(response.content, 'html.parser')
+
+    script_elements = soup.find_all('script')
+    json_format_data = json.loads(str(script_elements[1])[31:-10])
+    return json_format_data
+
+
+def get_datas(data):
+    for component_index in range(10, 50, 1):  # component_index sometimes changes when the space is updated; this loop avoids changing the component index manually
+        try:
+            result_list = []
+            i = 0
+            while True:
+                try:
+                    results = data['components'][component_index]['props']['value']['data'][i]
+                    columns = data['components'][component_index]['props']['headers']
+                    try:
+                        results_json = {"T": results[0], "Model": results[-1]}
+
+                        if len(columns) < 15:  # If there are fewer than 15 columns (this number can definitely change), we are trying the wrong component index, so break to try the next one.
+                            break
+
+                        for col_index, col_name in enumerate(columns[2:-1], start=2):
+                            results_json[col_name] = results[col_index]
+
+                    except IndexError:  # Wrong component index, so break to try the next one. (NOTE: more than one component index can return some results, but we must find the right one to get all the results we want.)
+                        break
+                    result_list.append(results_json)
+                    i += 1
+                except IndexError:  # No more rows to extract, so return the list (we know this is the right component index because we didn't break out of the loop on the other exception).
+                    return result_list
+        except (KeyError, TypeError):
+            continue
+
+    return result_list
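This new module replaces the scraper that `app.py` previously downloaded with `wget` from the Weyaxi repo, pointing it at the Portuguese leaderboard Space instead. `functions.py` consumes it as the earlier hunk context suggests (`finished_models = get_datas(data)`, `df = pd.DataFrame(finished_models)`). A short usage sketch, where the import form and the final lookup line are assumptions for illustration:

```python
# Usage sketch: the first three calls mirror what functions.py does with this
# module; the import form and the "Model" column lookup are assumptions.
import pandas as pd

from openllm import get_datas, get_json_format_data

data = get_json_format_data()       # embedded Gradio config scraped from the Space
finished_models = get_datas(data)   # list of row dicts from the leaderboard table
df = pd.DataFrame(finished_models)

# Hypothetical: find a model's row via the "Model" key set in results_json above.
print(df[df["Model"] == "your-org/your-model"])
```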