Clémentine committed
Commit 3d87820
1 Parent(s): 0d5b177

Updated system to connect the different repos

Files changed (3):
  1. app.py +91 -66
  2. content.py +7 -1
  3. scorer.py +81 -0
app.py CHANGED
@@ -1,40 +1,57 @@
 import os
+import json
+import datetime
 from email.utils import parseaddr

 import gradio as gr
 import pandas as pd
+import numpy as np

 from datasets import load_dataset
 from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import HfApi

 # InfoStrings
-from content import *
+from scorer import question_scorer
+from content import format_warning, format_log, TITLE, INTRODUCTION_TEXT, CHANGELOG_TEXT, CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT

 BALM_TOKEN = os.environ.get("BALM_TOKEN", None)
-owner="balm" # change to balm once possible

+OWNER = "balm"
+SUBMISSION_DATASET = f"{OWNER}/submissions"
+
+SPLIT = "validation"  # Change to "test" once we are ready to go
 api = HfApi()

+os.makedirs("scored", exist_ok=True)
+
+# Display the results
 eval_results = {}
 for level in range(1, 4):
-    eval_results[level] = load_dataset(f"{owner}/BALM_ResultsLevel{level}", use_auth_token=BALM_TOKEN, split="dev")
+    eval_results[level] = load_dataset(f"{OWNER}/BALM_ResultsLevel{level}", token=BALM_TOKEN, split=SPLIT)

 eval_dataframe_1 = pd.DataFrame(eval_results[1].remove_columns("mail"))
 eval_dataframe_2 = pd.DataFrame(eval_results[2].remove_columns("mail"))
 eval_dataframe_3 = pd.DataFrame(eval_results[3].remove_columns("mail"))

+# Gold answers
+gold_results = {}
+for level in range(1, 4):
+    level_dataset = load_dataset(f"{OWNER}/BALM", f"2023_level{level}", split=SPLIT, token=BALM_TOKEN)
+    gold_results[level] = {row["task_id"]: row["ground_truth"] for row in level_dataset}
+
+
 def restart_space():
-    api.restart_space(repo_id=f"{owner}/BALM_Leaderboard", token=BALM_TOKEN)
+    api.restart_space(repo_id=f"{OWNER}/BALM_Leaderboard", token=BALM_TOKEN)


-COLS = ["Model", "Organisation", "Reported accuracy ⬆️"]
-TYPES = ["str", "str", "number",]
+COLS = ["Model", "Score ⬆️", "Organisation"]
+TYPES = ["str", "number", "str",]

 def add_new_eval(
     level_of_dev: str,
     model: str,
-    score: float,
+    path_to_file,
     organisation: str,
     mail: str,
 ):
@@ -43,68 +60,86 @@ def add_new_eval(
     # Very basic email parsing
     _, parsed_mail = parseaddr(mail)
     if not "@" in parsed_mail:
-        valid_mail = "Please provide a valid email adress."
-        return f"<p style='color: orange; font-size: 20px; text-align: center;'>{valid_mail}</p>"
+        return format_warning("Please provide a valid email address.")

     print("Adding new eval")

     # Check if the combination model/org already exists and prints a warning message if yes
     if model.lower() in set(eval_results[level]["model"]) and organisation.lower() in set(eval_results[level]["organisation"]):
-        duplicate_request_message = "This model has been already submitted."
-        return f"<p style='color: orange; font-size: 20px; text-align: center;'>{duplicate_request_message}</p>"
+        return format_warning("This model has already been submitted.")
+
+    # Save submitted file
+    api.upload_file(
+        repo_id=SUBMISSION_DATASET,
+        path_or_fileobj=path_to_file.name,
+        path_in_repo=f"{organisation}/{model}/level{level}_raw_{datetime.datetime.today()}.jsonl",
+        repo_type="dataset",
+        token=BALM_TOKEN
+    )
+
+    # Compute score
+    file_path = path_to_file.name
+    total_score = 0
+    with open(f"scored/{organisation}_{model}.jsonl", "w") as scored_file:
+        with open(file_path, 'r') as f:
+            for line in f:
+                task = json.loads(line)
+
+                if "model_answer" not in task:
+                    raise Exception("No model_answer key in the file provided")
+                answer = task["model_answer"]
+                task_id = task["task_id"]
+
+                score = question_scorer(task['model_answer'], gold_results[level][task_id])
+
+                scored_file.write(
+                    json.dumps({
+                        "id": task_id,
+                        "model_answer": answer,
+                        "score": score
+                    }) + "\n"
+                )
+
+                total_score += score
+
+    # Save scored file
+    api.upload_file(
+        repo_id=SUBMISSION_DATASET,
+        path_or_fileobj=f"scored/{organisation}_{model}.jsonl",
+        path_in_repo=f"{organisation}/{model}/level{level}_scored_{datetime.datetime.today()}.jsonl",
+        repo_type="dataset",
+        token=BALM_TOKEN
+    )

     # Actual submission
     eval_entry = {
         "model": model,
-        "score": score,
+        "score": total_score,
         "organisation": organisation,
         "mail": mail,
     }
     eval_results[level] = eval_results[level].add_item(eval_entry)
-    eval_results[level].push_to_hub(f"{owner}/BALM_ResultsLevel{level}", token=BALM_TOKEN, split="dev")
+    # TODO: change split to "test" once we have the actual results
+    eval_results[level].push_to_hub(f"{OWNER}/BALM_ResultsLevel{level}", token=BALM_TOKEN, split=SPLIT)

-    success_message = f"Model {model} submitted by {organisation}."
-    return f"<p style='color: green; font-size: 20px; text-align: center;'>{success_message}</p>"
+    return format_log(f"Model {model} submitted by {organisation} successfully. \nPlease refresh the leaderboard, and wait for up to an hour to see the score displayed.")


 def refresh():
     eval_results = {}
     for level in range(1, 4):
-        eval_results[level] = load_dataset(f"{owner}/BALM_ResultsLevel{level}", use_auth_token=BALM_TOKEN, split="dev")
+        eval_results[level] = load_dataset(f"{OWNER}/BALM_ResultsLevel{level}", use_auth_token=BALM_TOKEN, split=SPLIT)
     eval_dataframe_1 = pd.DataFrame(eval_results[1].remove_columns("mail"))
     eval_dataframe_2 = pd.DataFrame(eval_results[2].remove_columns("mail"))
     eval_dataframe_3 = pd.DataFrame(eval_results[3].remove_columns("mail"))
     return eval_dataframe_1, eval_dataframe_2, eval_dataframe_3

+def upload_file(files):
+    file_paths = [file.name for file in files]
+    return file_paths


-custom_css = """
-#changelog-text {
-    font-size: 16px !important;
-}
-
-#changelog-text h2 {
-    font-size: 18px !important;
-}
-
-.markdown-text {
-    font-size: 16px !important;
-}
-
-#citation-button span {
-    font-size: 16px !important;
-}
-
-#citation-button textarea {
-    font-size: 16px !important;
-}
-
-#citation-button > label > button {
-    margin: 6px;
-    transform: scale(1.3);
-}
-"""
-
-demo = gr.Blocks(css=custom_css)
+demo = gr.Blocks()
 with demo:
     gr.HTML(TITLE)
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
@@ -122,26 +157,17 @@ with demo:
     changelog = gr.Markdown(CHANGELOG_TEXT, elem_id="changelog-text")

     with gr.Tab("Results: Level 1"):
-        with gr.Tab("Results on Dev Set"):
-            leaderboard_table_1 = gr.components.Dataframe(
-                value=eval_dataframe_1, headers=COLS, datatype=TYPES, max_rows=20
-            )
-        with gr.Tab("Results on Test Set"):
-            gr.Textbox(label="Info", value="The test set is currently private! Come back when performances on the dev set increased!")
+        leaderboard_table_1 = gr.components.Dataframe(
+            value=eval_dataframe_1, headers=COLS, datatype=TYPES, interactive=False,
+        )
     with gr.Tab("Results: Level 2"):
-        with gr.Tab("Results on Dev Set"):
-            leaderboard_table_2 = gr.components.Dataframe(
-                value=eval_dataframe_2, headers=COLS, datatype=TYPES, max_rows=20
-            )
-        with gr.Tab("Results on Test Set"):
-            gr.Textbox(label="Info", value="The test set is currently private! Come back when performances on the dev set increased!")
+        leaderboard_table_2 = gr.components.Dataframe(
+            value=eval_dataframe_2, headers=COLS, datatype=TYPES, interactive=False,
+        )
     with gr.Tab("Results: Level 3"):
-        with gr.Tab("Results on Dev Set"):
-            leaderboard_table_3 = gr.components.Dataframe(
-                value=eval_dataframe_3, headers=COLS, datatype=TYPES, max_rows=20
-            )
-        with gr.Tab("Results on Test Set"):
-            gr.Textbox(label="Info", value="The test set is currently private! Come back when performances on the dev set increased!")
+        leaderboard_table_3 = gr.components.Dataframe(
+            value=eval_dataframe_3, headers=COLS, datatype=TYPES, interactive=False,
+        )

     refresh_button = gr.Button("Refresh")
     refresh_button.click(
@@ -153,13 +179,12 @@ with demo:
             leaderboard_table_3,
         ],
     )
-
     with gr.Accordion("Submit a new model for evaluation"):
        with gr.Row():
            with gr.Column():
-                level_of_dev = gr.Radio(["Level 1", "Level 2", "Level 3"], value="Level 1", label="Dev set")
+                level_of_test = gr.Radio(["Level 1", "Level 2", "Level 3"], value="Level 1", label=f"{SPLIT} set level")
                 model_name_textbox = gr.Textbox(label="Model name")
-                score = gr.Textbox(label="Score")
+                file_output = gr.File()
             with gr.Column():
                 organisation = gr.Textbox(label="Organisation")
                 mail = gr.Textbox(label="Contact email")
@@ -169,9 +194,9 @@ with demo:
     submit_button.click(
         add_new_eval,
         [
-            level_of_dev,
+            level_of_test,
             model_name_textbox,
-            score,
+            file_output,
             organisation,
             mail
         ],
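
Note: the scoring loop in add_new_eval reads the uploaded file as JSON Lines, one object per line with "task_id" and "model_answer" keys. A minimal sketch of a valid submission file (the task IDs and file name here are hypothetical placeholders):

    import json

    # Hypothetical predictions; real task_ids must match those in the BALM gold dataset.
    predictions = [
        {"task_id": "task_001", "model_answer": "42"},
        {"task_id": "task_002", "model_answer": "Paris"},
    ]

    # One JSON object per line, as expected by the scoring loop above.
    with open("submission_level1.jsonl", "w") as f:
        for row in predictions:
            f.write(json.dumps(row) + "\n")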
 
content.py CHANGED
@@ -27,6 +27,12 @@ CITATION_BUTTON_TEXT = r"""@misc{balm, # TODO
   title = {Benchmark for Augmented Language Models},
   year = {2023},
   #publisher = {Hugging Face},
-  #howpublished = "\url{https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard}"
+  #howpublished = "\url{https://huggingface.co/spaces/balm/}"
 }"""

+
+def format_warning(msg):
+    return f"<p style='color: orange; font-size: 20px; text-align: center;'>{msg}</p>"
+
+def format_log(msg):
+    return f"<p style='color: green; font-size: 20px; text-align: center;'>{msg}</p>"
scorer.py ADDED
@@ -0,0 +1,81 @@
+import json
+import re
+import string
+
+import numpy as np
+
+def normalize_text(text: str) -> str:
+    "From QuAC"
+    def remove_articles(text: str) -> str:
+        return re.sub(r"\b(a|an|the)\b", " ", text)
+
+    def white_space_fix(text: str) -> str:
+        return " ".join(text.split())
+
+    def homogeneize_numbers(text: str) -> str:
+        try:
+            return str(float(text))
+        except ValueError:
+            return text
+
+    def remove_punc(text: str) -> str:
+        exclude = set(string.punctuation)
+        return "".join(ch for ch in text if ch not in exclude)
+
+    def remove_punc2(text: str) -> str:
+        "From Grégoire's code, removes all punctuation, nicer than remove_punc"
+        translator = str.maketrans('', '', string.punctuation)
+        return text.translate(translator)
+
+    def lower(text: str) -> str:
+        return text.lower()
+
+    def _tokenize(text):
+        return re.split(" ", text)
+
+    tokens = [white_space_fix(remove_articles(homogeneize_numbers(remove_punc2(lower(t))))) for t in _tokenize(text)]
+    return " ".join([t for t in tokens if t != ""]).strip()
+
+def extract_answer(input_str: str, prompt_sep: str = 'FINAL ANSWER: ') -> str:
+    answer = input_str.split(prompt_sep)[-1].strip()
+    return answer
+
+def extract_bow(input_str: str) -> list[str]:
+    return input_str.split(" ")
+
+def numbers_equals_in_bow(gold_list: list, pred_list: list) -> bool:
+    # Numbers in prediction bag of words
+    pred_numbers = []
+    for text in pred_list:
+        try:
+            pred_numbers.append(str(float(text)))
+        except ValueError:
+            continue
+
+    for text in gold_list:
+        try:
+            number = str(float(text))
+            if number not in pred_numbers:
+                return False
+        except ValueError:
+            continue
+
+    return True
+
+def affix_quasi_exact_match(gold: str, pred: str) -> float:
+    if not pred:
+        return 0
+
+    normalized_pred = normalize_text(pred)
+    normalized_gold = normalize_text(gold)
+    bow_pred = extract_bow(pred)
+    bow_gold = extract_bow(gold)
+
+    if normalized_pred.startswith(normalized_gold) or normalized_pred.endswith(normalized_gold):
+        if numbers_equals_in_bow(bow_gold, bow_pred):
+            return 1
+
+    return 0
+
+def question_scorer(pred: str, gold: str) -> float:
+    return affix_quasi_exact_match(gold, pred)
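
Note: as written above, question_scorer takes the model's prediction first and the gold answer second, matching the call order in add_new_eval. A small sanity check of the normalization and matching behavior:

    from scorer import normalize_text, question_scorer

    # Articles are dropped and standalone numbers are canonicalized to their float form.
    assert normalize_text("The answer is 42!") == "answer is 42.0"

    # Affix quasi-exact match: the normalized gold must be a prefix or suffix of the
    # normalized prediction, and every number in the gold must appear in the prediction.
    assert question_scorer("The answer is 42", "42") == 1
    assert question_scorer("43", "42") == 0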