Terry Zhuo
committed on
Commit
•
ae7a86d
1
Parent(s):
1e748fb
fix
Browse files
- app.py +27 -8
- src/utils.py +1 -2
app.py
CHANGED
@@ -109,7 +109,7 @@ def select_columns(df, columns):
|
|
109 |
return filtered_df
|
110 |
|
111 |
|
112 |
-
def
|
113 |
if query == "all":
|
114 |
return df[leaderboard_table.columns]
|
115 |
else:
|
@@ -118,6 +118,16 @@ def filter_items(df, leaderboard_table, query):
|
|
118 |
return filtered_df[leaderboard_table.columns]
|
119 |
|
120 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
121 |
def search_table(df, leaderboard_table, query):
|
122 |
filtered_df = df[(df["model"].str.contains(query, case=False))]
|
123 |
return filtered_df[leaderboard_table.columns]
|
@@ -174,13 +184,18 @@ with demo:
|
|
174 |
show_label=False,
|
175 |
elem_id="search-bar",
|
176 |
)
|
177 |
-
|
178 |
label="⏚ Filter model types",
|
179 |
-
choices=["all", "🟢 base", "🔶 instruction-tuned", "EXT external-evaluation"],
|
180 |
value="all",
|
181 |
elem_id="filter-columns",
|
182 |
)
|
183 |
-
|
|
|
|
|
|
|
|
|
|
|
184 |
leaderboard_df = gr.components.Dataframe(
|
185 |
value=df[
|
186 |
[
|
@@ -210,9 +225,14 @@ with demo:
|
|
210 |
[hidden_leaderboard_df, leaderboard_df, search_bar],
|
211 |
leaderboard_df,
|
212 |
)
|
213 |
-
|
214 |
-
|
215 |
-
[hidden_leaderboard_df, leaderboard_df,
|
|
|
|
|
|
|
|
|
|
|
216 |
leaderboard_df,
|
217 |
)
|
218 |
shown_columns.change(
|
@@ -229,7 +249,6 @@ with demo:
|
|
229 |
- `complete` and `instruct` represent the calibrated Pass@1 score on the BigCodeBench benchmark variants.
|
230 |
- `elo_mle` represents the task-level Bootstrap of Maximum Likelihood Elo rating on `BigCodeBench-Complete`, which starts from 1000 and is boostrapped 500 times.
|
231 |
- `size` is the amount of activated model weight during inference.
|
232 |
-
- Some instruction-tuned models are marked with 🟢 symbol, as they miss the chat templates in their tokenizer configurations.
|
233 |
- Model providers have the responsibility to avoid data contamination. Models trained on close data can be affected by contamination.
|
234 |
- For more details check the 📝 About section.
|
235 |
- Models with a 🔴 symbol represent external evaluation submission, this means that we didn't verify the results, you can find the author's submission under `Submission PR` field from `See All Columns` tab.
|
|
|
109 |
return filtered_df
|
110 |
|
111 |
|
112 |
+
def filter_types(df, leaderboard_table, query):
|
113 |
if query == "all":
|
114 |
return df[leaderboard_table.columns]
|
115 |
else:
|
|
|
118 |
return filtered_df[leaderboard_table.columns]
|
119 |
|
120 |
|
121 |
+
def filter_direct_complete(df, leaderboard_table, query):
|
122 |
+
if query == "all":
|
123 |
+
return df[leaderboard_table.columns]
|
124 |
+
|
125 |
+
if query == "chat template":
|
126 |
+
return df[~df["direct_complete"]][leaderboard_table.columns]
|
127 |
+
else:
|
128 |
+
return df[df["direct_complete"]][leaderboard_table.columns]
|
129 |
+
|
130 |
+
|
131 |
def search_table(df, leaderboard_table, query):
|
132 |
filtered_df = df[(df["model"].str.contains(query, case=False))]
|
133 |
return filtered_df[leaderboard_table.columns]
|
|
|
184 |
show_label=False,
|
185 |
elem_id="search-bar",
|
186 |
)
|
187 |
+
filter_types_columns = gr.Radio(
|
188 |
label="⏚ Filter model types",
|
189 |
+
choices=["all", "🟢 base", "🔶 instruction-tuned"], #, "EXT external-evaluation"],
|
190 |
value="all",
|
191 |
elem_id="filter-columns",
|
192 |
)
|
193 |
+
filter_prompting_columns = gr.Radio(
|
194 |
+
label="⏚ Filter prompting",
|
195 |
+
choices=["all", "chat template", "direct complete"],
|
196 |
+
value="all",
|
197 |
+
elem_id="filter-direct-complete",
|
198 |
+
)
|
199 |
leaderboard_df = gr.components.Dataframe(
|
200 |
value=df[
|
201 |
[
|
|
|
225 |
[hidden_leaderboard_df, leaderboard_df, search_bar],
|
226 |
leaderboard_df,
|
227 |
)
|
228 |
+
filter_types_columns.change(
|
229 |
+
filter_types,
|
230 |
+
[hidden_leaderboard_df, leaderboard_df, filter_types_columns],
|
231 |
+
leaderboard_df,
|
232 |
+
)
|
233 |
+
filter_prompting_columns.change(
|
234 |
+
filter_direct_complete,
|
235 |
+
[hidden_leaderboard_df, leaderboard_df, filter_prompting_columns],
|
236 |
leaderboard_df,
|
237 |
)
|
238 |
shown_columns.change(
|
|
|
249 |
- `complete` and `instruct` represent the calibrated Pass@1 score on the BigCodeBench benchmark variants.
|
250 |
- `elo_mle` represents the task-level Bootstrap of Maximum Likelihood Elo rating on `BigCodeBench-Complete`, which starts from 1000 and is boostrapped 500 times.
|
251 |
- `size` is the amount of activated model weight during inference.
|
|
|
252 |
- Model providers have the responsibility to avoid data contamination. Models trained on close data can be affected by contamination.
|
253 |
- For more details check the 📝 About section.
|
254 |
- Models with a 🔴 symbol represent external evaluation submission, this means that we didn't verify the results, you can find the author's submission under `Submission PR` field from `See All Columns` tab.
|
src/utils.py
CHANGED
@@ -24,12 +24,11 @@ def fields(raw_class):
|
|
24 |
class AutoEvalColumn: # Auto evals column
|
25 |
model_type_symbol = ColumnContent("type", "str", True)
|
26 |
model = ColumnContent("model", "markdown", True)
|
27 |
-
size = ColumnContent("size", "number", False)
|
28 |
complete_score = ColumnContent("complete", "number", True)
|
29 |
instruct_score = ColumnContent("instruct", "number", True)
|
30 |
elo_mle = ColumnContent("elo_mle", "number", True)
|
31 |
dummy = ColumnContent("model", "str", True)
|
32 |
-
|
33 |
|
34 |
|
35 |
def model_hyperlink(link, model_name):
|
|
|
24 |
class AutoEvalColumn: # Auto evals column
|
25 |
model_type_symbol = ColumnContent("type", "str", True)
|
26 |
model = ColumnContent("model", "markdown", True)
|
|
|
27 |
complete_score = ColumnContent("complete", "number", True)
|
28 |
instruct_score = ColumnContent("instruct", "number", True)
|
29 |
elo_mle = ColumnContent("elo_mle", "number", True)
|
30 |
dummy = ColumnContent("model", "str", True)
|
31 |
+
size = ColumnContent("size", "number", False)
|
32 |
|
33 |
|
34 |
def model_hyperlink(link, model_name):
|