pavol-bielik committed
Commit b615923
1 Parent(s): d799cb2

add principles and technical requirements mapping

README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 🥇
  colorFrom: green
  colorTo: indigo
  sdk: gradio
- sdk_version: 4.37.1
+ sdk_version: 5.4.0
  app_file: app.py
  pinned: true
  license: apache-2.0
app.py CHANGED
@@ -1,14 +1,14 @@
1
  import gradio as gr
2
  import pandas as pd
3
- from apscheduler.schedulers.background import BackgroundScheduler
4
  from huggingface_hub import snapshot_download
5
 
6
  from src.display.about import (
7
  CITATION_BUTTON_LABEL,
8
  CITATION_BUTTON_TEXT,
9
  EVALUATION_QUEUE_TEXT,
10
- INTRODUCTION_TEXT,
11
- LLM_BENCHMARKS_TEXT,
12
  TITLE,
13
  )
14
  from src.display.css_html_js import custom_css
@@ -25,23 +25,11 @@ from src.display.utils import (
25
  WeightType,
26
  Precision
27
  )
28
- from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, TOKEN, QUEUE_REPO, REPO_ID, RESULTS_REPO
29
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
30
  from src.submission.submit import add_new_eval
31
- import time
32
- import requests
33
-
34
 
35
- def restart_space():
36
- restart = False
37
- while not restart:
38
- try:
39
- API.restart_space(repo_id=REPO_ID, token=TOKEN)
40
- except requests.exceptions.ConnectionError as e:
41
- print("Restart failed. Re-trying...")
42
- time.sleep(30)
43
- continue
44
- restart = True
45
 
46
 
47
  try:
@@ -50,14 +38,8 @@ try:
50
  repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
51
  )
52
  except Exception:
53
- restart_space()
54
- try:
55
- print(EVAL_RESULTS_PATH)
56
- snapshot_download(
57
- repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
58
- )
59
- except Exception:
60
- restart_space()
61
 
62
  raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
63
  leaderboard_df = original_df.copy()
@@ -83,6 +65,23 @@ def update_table(
83
  return df
84
 
85
 
86
  def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
87
  return df[(df[AutoEvalColumn.dummy.name].str.contains(query, case=False))]
88
 
@@ -139,112 +138,307 @@ def filter_models(
139
  return filtered_df
140
 
141
 
142
- demo = gr.Blocks(css=custom_css)
143
- with demo:
144
- gr.HTML(TITLE)
145
- gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
146
 
147
- with gr.Tabs(elem_classes="tab-buttons") as tabs:
148
- with gr.TabItem("🏅 Results", elem_id="llm-benchmark-tab-table", id=0):
149
- with gr.Row():
150
- with gr.Column():
151
- with gr.Row():
152
- search_bar = gr.Textbox(
153
- placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
154
- show_label=False,
155
- elem_id="search-bar",
156
- )
157
- with gr.Row():
158
- shown_columns = gr.CheckboxGroup(
159
- choices=[
160
- c.name
161
- for c in fields(AutoEvalColumn)
162
- if not c.hidden and not c.never_hidden and not c.dummy
163
- ],
164
- value=[
165
- c.name
166
- for c in fields(AutoEvalColumn)
167
- if c.displayed_by_default and not c.hidden and not c.never_hidden
168
- ],
169
- label="Select columns to show",
170
- elem_id="column-select",
171
- interactive=True,
172
- )
173
- with gr.Row():
174
- with gr.Column(min_width=250):
175
- # with gr.Box(elem_id="box-filter"):
176
- filter_columns_type = gr.CheckboxGroup(
177
- label="Model types",
178
- choices=[t.to_str() for t in ModelType],
179
- value=[t.to_str() for t in ModelType],
180
- interactive=True,
181
- elem_id="filter-columns-type",
182
- )
183
- # filter_columns_precision = gr.CheckboxGroup(
184
- # label="Precision",
185
- # choices=[i.value.name for i in Precision],
186
- # value=[i.value.name for i in Precision],
187
- # interactive=True,
188
- # elem_id="filter-columns-precision",
189
- # )
190
- # filter_columns_size = gr.CheckboxGroup(
191
- # label="Model sizes (in billions of parameters)",
192
- # choices=list(NUMERIC_INTERVALS.keys()),
193
- # value=list(NUMERIC_INTERVALS.keys()),
194
- # interactive=True,
195
- # elem_id="filter-columns-size",
196
- # )
197
 
198
- leaderboard_table = gr.components.Dataframe(
199
- value=leaderboard_df[
200
- [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
201
- + shown_columns.value
202
- ],
203
- headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
204
- datatype=TYPES,
205
- elem_id="leaderboard-table",
206
- interactive=False,
207
- visible=True,
208
- column_widths=["2%", "20%", "10%", "10%", "12%"]
209
  )
210
 
211
- # Dummy leaderboard for handling the case when the user uses backspace key
212
- hidden_leaderboard_table_for_search = gr.components.Dataframe(
213
- value=original_df[COLS],
214
- headers=COLS,
215
- datatype=TYPES,
216
- visible=False,
217
  )
218
- search_bar.submit(
219
- update_table,
220
- [
221
- hidden_leaderboard_table_for_search,
222
- shown_columns,
223
- filter_columns_type,
224
- # filter_columns_precision,
225
- # filter_columns_size,
226
- search_bar,
227
- ],
228
- leaderboard_table,
229
  )
230
- for selector in [shown_columns, filter_columns_type,
231
- ]:
232
- selector.change(
233
- update_table,
234
- [
235
- hidden_leaderboard_table_for_search,
236
- shown_columns,
237
- filter_columns_type,
238
- # filter_columns_precision,
239
- # filter_columns_size,
240
- # deleted_models_visibility,
241
- search_bar,
242
- ],
243
- leaderboard_table,
244
- queue=True,
245
- )
246
 
247
- with gr.TabItem("🚀 Request evaluation ", elem_id="llm-benchmark-tab-table", id=3):
248
  with gr.Column():
249
  with gr.Row():
250
  gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
@@ -309,22 +503,24 @@ with demo:
309
  ],
310
  submission_result,
311
  )
312
- with gr.Row():
313
- with gr.Accordion("📖 FAQ", open=False):
314
- with gr.Column(min_width=250):
315
- gr.Markdown("""
316
- #### What does N/A score mean?
317
-
318
- An N/A score means that it was not possible to evaluate the benchmark for a given model.
319
 
320
- This can happen for multiple reasons, such as:
321
-
322
- - The benchmark requires access to model logits, but the model API doesn't provide them (or only provides them for specific strings),
323
- - The model API refuses to provide any answer,
324
- - We do not have access to the training data.
325
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
326
 
327
- """)
328
  with gr.Row():
329
  with gr.Accordion("📙 Citation", open=False):
330
  citation_button = gr.Textbox(
@@ -335,7 +531,7 @@ with demo:
335
  show_copy_button=True,
336
  )
337
 
338
- scheduler = BackgroundScheduler()
339
- scheduler.add_job(restart_space, "interval", seconds=1800)
340
- scheduler.start()
341
  demo.queue(default_concurrency_limit=40).launch()
 
+ import functools
+ from pathlib import Path
+
  import gradio as gr
  import pandas as pd
  from huggingface_hub import snapshot_download
 
  from src.display.about import (
      CITATION_BUTTON_LABEL,
      CITATION_BUTTON_TEXT,
      EVALUATION_QUEUE_TEXT,
      TITLE,
  )
  from src.display.css_html_js import custom_css
 
      WeightType,
      Precision
  )
+ from src.envs import EVAL_REQUESTS_PATH, QUEUE_REPO
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
  from src.submission.submit import add_new_eval
 
+ EVAL_RESULTS_PATH = str(Path(__file__).resolve().parent / "results")
 
 
  try:
 
          repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
      )
  except Exception:
+     # restart_space()
+     pass
 
  raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
  leaderboard_df = original_df.copy()
 
      return df
 
 
+ def update_principles_table(
+     df,
+     *args: list,
+ ) -> pd.DataFrame:
+     columns = [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
+     for shown_column in args:
+         if isinstance(shown_column, gr.components.CheckboxGroup):
+             columns.extend(shown_column.value)
+         else:
+             columns.extend(shown_column)
+
+     # dummy column for querying (not shown)
+     columns.append("model_name_for_query")
+     return df[columns]
+
+
+
  def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
      return df[(df[AutoEvalColumn.dummy.name].str.contains(query, case=False))]
 
 
138
  return filtered_df
139
 
140
 
141
+ BENCHMARKS_PER_CATEGORY = {
142
+ "Robustness and Predictability": [
143
+ "MMLU: Robustness",
144
+ "BoolQ Contrast Set",
145
+ "IMDB Contrast Set",
146
+ "Monotonicity Checks",
147
+ "Self-Check Consistency",
148
+ ],
149
+ "Cyberattack Resilience": [
150
+ "Goal Hijacking and Prompt Leakage",
151
+ "Rule Following"
152
+ ],
153
+ "Training Data Suitability": [
154
+ "Toxicity of the Dataset",
155
+ "Bias of the Dataset"
156
+ ],
157
+ "No Copyright Infringement": [
158
+ "Copyrighted Material Memorization"
159
+ ],
160
+ "User Privacy Protection": [
161
+ "PII Extraction by Association"
162
+ ],
163
+ "Capabilities, Performance, and Limitations": [
164
+ "General Knowledge: MMLU",
165
+ "Reasoning: AI2 Reasoning Challenge",
166
+ "Common Sense Reasoning: HellaSwag",
167
+ "Truthfulness: TruthfulQA MC2",
168
+ "Coding: HumanEval"
169
+ ],
170
+ "Interpretability": ["Logit Calibration: BIG-Bench", "Self-Assessment: TriviaQA"],
171
+ "Disclosure of AI": ["Denying Human Presence"],
172
+ "Traceability": ["Watermark Reliability & Robustness"],
173
+ "Representation — Absence of Bias": ["Representation Bias: RedditBias", "Prejudiced Answers: BBQ", "Biased Completions: BOLD"],
174
+ "Fairness — Absence of Discrimination":["Income Fairness: DecodingTrust", "Recommendation Consistency: FaiRLLM"],
175
+ "Harmful Content and Toxicity": ["Toxic Completions of Benign Text: RealToxicityPrompts", "Following Harmful Instructions: AdvBench"]
176
+ }
177
+
178
+ def _wrap_link(value: str, url: str) -> str:
179
+ return f"<a href={url} target='_blank'>{value}</a>"
180
+
181
+ TEXT_PER_CATEGORY = {
182
+ "Robustness and Predictability": f"We evaluate the model on state-of-the-art benchmarks that measure its robustness under various input alterations [{_wrap_link('1', 'https://aclanthology.org/2020.findings-emnlp.117/')}], and the level of consistency in its answers [{_wrap_link('2', 'https://arxiv.org/abs/2306.09983')}, {_wrap_link('3', 'https://arxiv.org/abs/2305.15852')}].",
183
+ "Cyberattack Resilience": f"We consider the concrete threats concerning just the LLM in isolation, focusing on its resilience to jailbreaks and prompt injection attacks [{_wrap_link('1', 'https://arxiv.org/abs/2311.01011')}, {_wrap_link('2', 'https://arxiv.org/abs/2311.04235')}, {_wrap_link('3', 'https://arxiv.org/abs/2312.02119')}].",
184
+ "Training Data Suitability": "We evaluate the adequacy of the dataset [1], aiming to assess the potential of an LLM trained on this data to exhibit toxic or discriminatory behavior.",
185
+ "No Copyright Infringement": "We check if the model can be made to directly regurgitate content that is subject to the copyright of a third person.",
186
+ "User Privacy Protection": "We focus on cases of user privacy violation by the LLM itself, evaluating the model’s ability to recover personal identifiable information that may have been included in the training data.",
187
+ "Capabilities, Performance, and Limitations": "To provide an overarching view, we assess the capabilities and limitations of the AI system by evaluating its performance on a wide range of tasks. We evaluate the model on widespread research benchmarks covering general knowledge [1], reasoning [2,3], truthfulness [4], and coding ability [5].",
188
+ "Interpretability": "The large body of machine learning interpretability research is often not easily applicable to large language models. While more work in this direction is needed, we use the existing easily-applicable methods to evaluate the model’s ability to reason about its own correctness [1], and the degree to which the probabilities it outputs can be interpreted [3,4].",
189
+ "Disclosure of AI": "We require the language model to consistently deny that it is a human.",
190
+ "Traceability": "We require the presence of language model watermarking [1,2], and evaluate its viability, combining several important requirements that such schemes must satisfy to be practical.",
191
+ "Representation — Absence of Bias": "We evaluate the tendency of the LLM to produce biased outputs, on three popular bias benchmarks [1,2,3].",
192
+ "Fairness — Absence of Discrimination": "We evaluate the model’s tendency to behave in a discriminatory way by comparing its behavior on different protected groups, using prominent fairness benchmarks [1,2].",
193
+ "Harmful Content and Toxicity": "We evaluate the models’ tendency to produce harmful or toxic content, leveraging two recent evaluation tools, RealToxicityPrompts and AdvBench [1,2]."
194
+ }
195
+
196
+ CATEGORIES_PER_PRINCIPLE = {
197
+ "Technical Robustness and Safety": ["Robustness and Predictability", "Cyberattack Resilience"],
198
+ "Privacy & Data Governance": ["Training Data Suitability", "No Copyright Infringement", "User Privacy Protection"],
199
+ "Transparency": ["Capabilities, Performance, and Limitations", "Interpretability", "Disclosure of AI", "Traceability"],
200
+ "Diversity, Non-discrimination & Fairness": ["Representation — Absence of Bias", "Fairness — Absence of Discrimination"],
201
+ "Social & Environmental Well-being": ["Harmful Content and Toxicity"]
202
+ }
203
+
204
+ ICON_PER_PRINCIPLE = {
205
+ "Technical Robustness and Safety": "https://compl-ai.org/icon_technical_robustness_and_safety.svg",
206
+ "Privacy & Data Governance": "https://compl-ai.org/icon_privacy_and_data_governance.svg",
207
+ "Transparency": "https://compl-ai.org/icon_transparency.svg",
208
+ "Diversity, Non-discrimination & Fairness": "https://compl-ai.org/icon_diversity_fairness.svg",
209
+ "Social & Environmental Well-being": "https://compl-ai.org/icon_social_environmental.svg",
210
+ }
211
+
212
+ def generate_benchmarks(principle: str):
213
+ with gr.Row():
214
+ gr.HTML(f"""
215
+ <h3 class="image_header principle_header"><img src="{ICON_PER_PRINCIPLE[principle]}" class="principle_icon"/>EU AI Act Principle: {principle}</h3>
216
+ """)
217
 
218
+ categories = CATEGORIES_PER_PRINCIPLE[principle]
219
 
220
+ with gr.Row(elem_classes=["technical_requirements", "border_mid"]):
221
+ for category in categories:
222
+ with gr.Column():
223
+ gr.HTML(
224
+ f"""
225
+ <div style="padding: 10px 20px;">
226
+ <h3 class="image_header"><img src="https://compl-ai.org/hex.svg" style="max-height:24px;" />{category}</h3>
227
+ <p>{TEXT_PER_CATEGORY[category]}</p>
228
+ </div>
229
+ """
230
+ )
231
+
232
+ shown_columns = []
233
+ with gr.Row(elem_classes=["technical_requirements", "border_bot"]):
234
+ for category in categories:
235
+ with gr.Column():
236
+ shown_column = gr.CheckboxGroup(
237
+ show_label=False,
238
+ choices=BENCHMARKS_PER_CATEGORY[category],
239
+ value=BENCHMARKS_PER_CATEGORY[category],
240
+ interactive=True,
241
+ # elem_id="filter-columns-type",
242
+ )
243
+ shown_columns.append(shown_column)
244
+
245
+
246
+ with gr.Row():
247
+ df = update_principles_table(leaderboard_df, *shown_columns)
248
+ type_per_column = {c.name: c.type for c in fields(AutoEvalColumn)}
249
+ datatypes = [type_per_column[name] for name in df.columns]
250
+ leaderboard_table = gr.components.Dataframe(
251
+ value=df,
252
+ headers=df.columns.tolist(),
253
+ datatype=datatypes,
254
+ elem_id="leaderboard-table",
255
+ interactive=False,
256
+ visible=True,
257
+ )
258
+
259
+ for shown_column in shown_columns:
260
+ shown_column.change(
261
+ fn=functools.partial(update_principles_table, leaderboard_df),
262
+ inputs=shown_columns,
263
+ outputs=leaderboard_table,
264
+ # queue=True,
265
  )
266
 
267
+ # Allows clicking on the full table column to trigger sorting
268
+ custom_js = """
269
+ function clickableTableHeaders() {
270
+ document.querySelectorAll(".table > thead > tr > th").forEach(th => {
271
+ th.addEventListener("click", () => {
272
+ const sortButton = th.querySelector(".sort-button"); // Selects the first child with class "sort-button"
273
+ if (sortButton) {
274
+ sortButton.click(); // Triggers the click event on the "sort-button" element
275
+ }
276
+ });
277
+ });
278
+
279
+ // Select all elements with the .table class
280
+ const tableElements = document.querySelectorAll('.table');
281
+
282
+ // Callback function to execute when mutations are observed
283
+ const mutationCallback = (mutationsList) => {
284
+ mutationsList.forEach((mutation) => {
285
+ if (mutation.target.nodeName == "TH" && mutation.addedNodes.length > 0) {
286
+ mutation.target.addEventListener("click", () => {
287
+ const sortButton = mutation.target.querySelector(".sort-button"); // Selects the first child with class "sort-button"
288
+ if (sortButton) {
289
+ sortButton.click(); // Triggers the click event on the "sort-button" element
290
+ }
291
+ });
292
+ }
293
+ });
294
+ };
295
+
296
+ // Options for the observer (which mutations to observe)
297
+ const observerOptions = {
298
+ childList: true, // Watch for additions/removals of child nodes
299
+ subtree: true // Watch for changes in descendants as well
300
+ };
301
+
302
+ // Create an instance of MutationObserver and pass in the callback function
303
+ const observer = new MutationObserver(mutationCallback);
304
+
305
+ // Observe each .table element
306
+ tableElements.forEach((tableElement) => {
307
+ observer.observe(tableElement, observerOptions);
308
+ });
309
+ }
310
+ """
311
+
312
+ demo = gr.Blocks(
313
+ css=custom_css,
314
+ theme=gr.themes.Default(
315
+ font=gr.themes.GoogleFont("Open Sans", weights=(400, 500, 600))
316
+ ),
317
+ js=custom_js,
318
+ )
319
+
320
+ with demo:
321
+ gr.HTML(TITLE)
322
+
323
+ with gr.Row(elem_id="intro"):
324
+ with gr.Column(scale=1, min_width=20, elem_classes="empty"):
325
+ pass
326
+ with gr.Column(scale=5):
327
+ gr.HTML(
328
+ """
329
+ <h3 class="image_header"><img src="https://compl-ai.org/hex.svg" style="max-height:24px;" />Technical Interpretation of the EU AI Act</h3>
330
+ <p>We have interpreted the high-level regulatory requirements of the EU AI Act as concrete technical requirements. We further group requirements within six EU AI Act principles and label them as GPAI, GPAI+SR (Systemic Risk), and HR (High-Risk).</p>
331
+ <br/>
332
+ <a href="https://compl-ai.org/interpretation" class="button" target="_blank">Explore the Interpretation</a>
333
+ """
334
  )
335
+ with gr.Column(scale=5):
336
+ gr.HTML(
337
+ """
338
+ <h3 class="image_header"><img src="https://compl-ai.org/checkmark.png" style="max-height:24px;" />Open-Source Benchmarking Suite</h3>
339
+ <p>The framework includes the ability to evaluate the technical requirements on a benchmarking suite containing 27 SOTA LLM benchmarks. The benchmark suite and technical interpretations are both open to community contributions.</p>
340
+ <br/>
341
+ <a href="https://github.com/compl-ai/compl-ai" class="button" target="_blank"><img src="https://compl-ai.org/icons/github-mark.svg" class="github_icon">GitHub Repo</a>
342
+ """
 
 
 
343
  )
344
+ with gr.Column(scale=1, min_width=20, elem_classes="empty"):
345
+ pass
346
+
347
+ with gr.Tabs(elem_classes="tab-buttons") as tabs:
348
+ with gr.TabItem("🏅 Results", elem_id="llm-benchmark-tab-table", id=0):
349
 
350
+ for principle in CATEGORIES_PER_PRINCIPLE.keys():
351
+ generate_benchmarks(principle)
352
+
353
+ ###
354
+
355
+ # with gr.Row():
356
+ # shown_columns = gr.CheckboxGroup(
357
+ # choices=[
358
+ # c.name
359
+ # for c in fields(AutoEvalColumn)
360
+ # if not c.hidden and not c.never_hidden and not c.dummy
361
+ # ],
362
+ # value=[
363
+ # c.name
364
+ # for c in fields(AutoEvalColumn)
365
+ # if c.displayed_by_default and not c.hidden and not c.never_hidden
366
+ # ],
367
+ # label="Select columns to show",
368
+ # elem_id="column-select",
369
+ # interactive=True,
370
+ # )
371
+ #
372
+ # with gr.Row():
373
+ # # with gr.Box(elem_id="box-filter"):
374
+ # filter_columns_type = gr.CheckboxGroup(
375
+ # label="Model types",
376
+ # choices=[t.to_str() for t in ModelType],
377
+ # value=[t.to_str() for t in ModelType],
378
+ # interactive=True,
379
+ # elem_id="filter-columns-type",
380
+ # )
381
+ #
382
+ # with gr.Row():
383
+ # search_bar = gr.Textbox(
384
+ # placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
385
+ # show_label=False,
386
+ # elem_id="search-bar",
387
+ # )
388
+ # # x = gr.Checkbox(show_label=False, label="foo")
389
+ #
390
+ # with gr.Row():
391
+ # # print(shown_columns.value)
392
+ # leaderboard_table = gr.components.Dataframe(
393
+ # value=leaderboard_df[
394
+ # [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
395
+ # + shown_columns.value
396
+ # ],
397
+ # headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
398
+ # datatype=TYPES,
399
+ # elem_id="leaderboard-table",
400
+ # interactive=False,
401
+ # visible=True,
402
+ # # column_widths=["2%", "30%", "10%", "10%", "12%"]
403
+ # )
404
+ #
405
+ # # Dummy leaderboard for handling the case when the user uses backspace key
406
+ # hidden_leaderboard_table_for_search = gr.components.Dataframe(
407
+ # value=original_df[COLS],
408
+ # headers=COLS,
409
+ # datatype=TYPES,
410
+ # visible=False,
411
+ # )
412
+ # search_bar.submit(
413
+ # update_table,
414
+ # [
415
+ # hidden_leaderboard_table_for_search,
416
+ # shown_columns,
417
+ # filter_columns_type,
418
+ # # filter_columns_precision,
419
+ # # filter_columns_size,
420
+ # search_bar,
421
+ # ],
422
+ # leaderboard_table,
423
+ # )
424
+ # for selector in [shown_columns, filter_columns_type,
425
+ # ]:
426
+ # selector.change(
427
+ # update_table,
428
+ # [
429
+ # hidden_leaderboard_table_for_search,
430
+ # shown_columns,
431
+ # filter_columns_type,
432
+ # # filter_columns_precision,
433
+ # # filter_columns_size,
434
+ # # deleted_models_visibility,
435
+ # search_bar,
436
+ # ],
437
+ # leaderboard_table,
438
+ # queue=True,
439
+ # )
440
+
441
+ with gr.TabItem("🚀 Request Evaluation ", elem_id="llm-benchmark-tab-table", id=3):
442
  with gr.Column():
443
  with gr.Row():
444
  gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
 
503
  ],
504
  submission_result,
505
  )
506
 
507
+ with gr.TabItem("📖 FAQ ", elem_id="llm-benchmark-tab-table", id=4):
508
 
509
+ with gr.Row():
510
+ # with gr.Accordion("📖 FAQ", open=True):
511
+ # with gr.Column(min_width=250):
512
+ gr.Markdown("""
513
+ #### What does N/A score mean?
514
+
515
+ An N/A score means that it was not possible to evaluate the benchmark for a given model.
516
+
517
+ This can happen for multiple reasons, such as:
518
+
519
+ - The benchmark requires access to model logits, but the model API doesn't provide them (or only provides them for specific strings),
520
+ - The model API refuses to provide any answer,
521
+ - We do not have access to the training data. """
522
+ )
523
 
 
524
  with gr.Row():
525
  with gr.Accordion("📙 Citation", open=False):
526
  citation_button = gr.Textbox(
 
531
  show_copy_button=True,
532
  )
533
 
534
+ # scheduler = BackgroundScheduler()
535
+ # scheduler.add_job(restart_space, "interval", seconds=1800)
536
+ # scheduler.start()
537
  demo.queue(default_concurrency_limit=40).launch()
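
The per-principle tables above are driven by the pattern of binding the leaderboard frame with `functools.partial(update_principles_table, leaderboard_df)` and passing each category's `CheckboxGroup` state as inputs. The snippet below is a minimal, self-contained sketch of that wiring with made-up data; the column names and the `demo_filter` helper are illustrative only and are not part of this commit.

```python
import functools

import gradio as gr
import pandas as pd

# Hypothetical stand-in for leaderboard_df; column names are illustrative only.
scores = pd.DataFrame({
    "Model": ["model-a", "model-b"],
    "MMLU: Robustness": [0.72, 0.65],
    "Self-Check Consistency": [0.80, 0.74],
})

def demo_filter(df: pd.DataFrame, selected: list) -> pd.DataFrame:
    # Keep the model column, then append whichever benchmark columns are ticked.
    return df[["Model"] + list(selected)]

with gr.Blocks() as sketch:
    benchmarks = gr.CheckboxGroup(
        choices=["MMLU: Robustness", "Self-Check Consistency"],
        value=["MMLU: Robustness", "Self-Check Consistency"],
        label="Benchmarks",
    )
    table = gr.Dataframe(value=scores)
    # Same wiring as above: pre-bind the source frame, pass the checkbox state as input.
    benchmarks.change(functools.partial(demo_filter, scores), inputs=benchmarks, outputs=table)

if __name__ == "__main__":
    sketch.launch()
```

In the actual app, `update_principles_table` additionally prepends the never-hidden columns and appends the hidden `model_name_for_query` column before returning the frame.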
requirements.txt CHANGED
@@ -2,8 +2,8 @@ APScheduler==3.10.1
  black==23.11.0
  click==8.1.3
  datasets==2.14.5
- gradio==4.4.0
- gradio_client==0.7.0
+ gradio==5.4.0
+ gradio_client==1.4.2
  huggingface-hub>=0.18.0
  matplotlib==3.7.1
  numpy==1.24.2
results/01-ai/Yi-34B-Chat.json ADDED
@@ -0,0 +1,87 @@
1
+ {
2
+ "config": {
3
+ "model_name": "01-ai/Yi-34B-Chat",
4
+ "model_sha": "main",
5
+ "model_report": "https://compl-ai.org/evaluations/Yi-34B-Chat"
6
+ },
7
+ "results": {
8
+ "bbq": {
9
+ "aggregate_score": 0.9307063195147172
10
+ },
11
+ "bold": {
12
+ "aggregate_score": 0.6834721511354611
13
+ },
14
+ "toxicity": {
15
+ "aggregate_score": 0.921922202494338
16
+ },
17
+ "toxicity_advbench": {
18
+ "aggregate_score": 0.9927596254738791
19
+ },
20
+ "forecasting_consistency": {
21
+ "aggregate_score": 0.6688888888888889
22
+ },
23
+ "self_check_consistency": {
24
+ "aggregate_score": 0.8031496062992126
25
+ },
26
+ "boolq_contrast_robustness": {
27
+ "aggregate_score": 0.5666666666666667
28
+ },
29
+ "imdb_contrast_robustness": {
30
+ "aggregate_score": 0.84
31
+ },
32
+ "calibration_big_bench": {
33
+ "aggregate_score": 0.89125
34
+ },
35
+ "calibration_big_bench_i_know": {
36
+ "aggregate_score": 0.8039534592768672
37
+ },
38
+ "decoding_trust": {
39
+ "aggregate_score": 1.0
40
+ },
41
+ "hellaswag": {
42
+ "aggregate_score": 0.8403704441346346
43
+ },
44
+ "human_eval": {
45
+ "aggregate_score": 0.32298136645962733
46
+ },
47
+ "instruction_goal_hijacking": {
48
+ "aggregate_score": 0.538659793814433
49
+ },
50
+ "multiturn_goal_hijacking": {
51
+ "aggregate_score": 0.5828559926386013
52
+ },
53
+ "reddit_bias": {
54
+ "aggregate_score": 0.6196885060946251
55
+ },
56
+ "truthful_qa_mc2": {
57
+ "aggregate_score": 0.5538416743593192
58
+ },
59
+ "mmlu": {
60
+ "aggregate_score": 0.7496795328300812
61
+ },
62
+ "ai2_reasoning": {
63
+ "aggregate_score": 0.6544368600682594
64
+ },
65
+ "human_deception": {
66
+ "aggregate_score": 0.3561643835616438
67
+ },
68
+ "memorization": {
69
+ "aggregate_score": 0.989
70
+ },
71
+ "privacy": {
72
+ "aggregate_score": 1.0
73
+ },
74
+ "fairllm": {
75
+ "aggregate_score": 0.23216608444613182
76
+ },
77
+ "mmlu_robustness": {
78
+ "aggregate_score": 0.7225454545454546
79
+ },
80
+ "training_data_suitability": {
81
+ "aggregate_score": null
82
+ },
83
+ "watermarking": {
84
+ "aggregate_score": null
85
+ }
86
+ }
87
+ }
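
Each of the added results files follows the same shape: a `config` block identifying the model and a flat `results` map from benchmark key to an `aggregate_score` (with `null` where a benchmark could not be evaluated). A minimal sketch of reading one of these files into a score series; the path and variable names are illustrative, not code from the repo.

```python
import json
from pathlib import Path

import pandas as pd

# Illustrative path; any of the files added in this commit has the same layout.
raw = json.loads(Path("results/01-ai/Yi-34B-Chat.json").read_text())

model_name = raw["config"]["model_name"]
# One entry per benchmark; missing (null) scores become NaN.
scores = pd.Series(
    {benchmark: entry["aggregate_score"] for benchmark, entry in raw["results"].items()},
    name=model_name,
    dtype="float64",
)
print(scores.sort_values(ascending=False).head())
```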
results/Claude3Opus/result_Claude3Opus.json ADDED
@@ -0,0 +1,87 @@
1
+ {
2
+ "config": {
3
+ "model_name": "Claude3Opus",
4
+ "model_sha": "main",
5
+ "model_report": "https://compl-ai.org/evaluations/claude-3-opus-20240229"
6
+ },
7
+ "results": {
8
+ "bbq": {
9
+ "aggregate_score": 0.9660153175746249
10
+ },
11
+ "bold": {
12
+ "aggregate_score": 0.757401370038459
13
+ },
14
+ "toxicity": {
15
+ "aggregate_score": null
16
+ },
17
+ "toxicity_advbench": {
18
+ "aggregate_score": 0.9932693207159621
19
+ },
20
+ "forecasting_consistency": {
21
+ "aggregate_score": 0.7785714285714285
22
+ },
23
+ "self_check_consistency": {
24
+ "aggregate_score": 0.8484107579462102
25
+ },
26
+ "boolq_contrast_robustness": {
27
+ "aggregate_score": null
28
+ },
29
+ "imdb_contrast_robustness": {
30
+ "aggregate_score": null
31
+ },
32
+ "calibration_big_bench": {
33
+ "aggregate_score": null
34
+ },
35
+ "calibration_big_bench_i_know": {
36
+ "aggregate_score": null
37
+ },
38
+ "decoding_trust": {
39
+ "aggregate_score": 0.825
40
+ },
41
+ "hellaswag": {
42
+ "aggregate_score": 0.954
43
+ },
44
+ "human_eval": {
45
+ "aggregate_score": 0.848
46
+ },
47
+ "instruction_goal_hijacking": {
48
+ "aggregate_score": 0.8402061855670103
49
+ },
50
+ "multiturn_goal_hijacking": {
51
+ "aggregate_score": 0.7557223372440763
52
+ },
53
+ "reddit_bias": {
54
+ "aggregate_score": null
55
+ },
56
+ "truthful_qa_mc2": {
57
+ "aggregate_score": null
58
+ },
59
+ "mmlu": {
60
+ "aggregate_score": 0.868
61
+ },
62
+ "ai2_reasoning": {
63
+ "aggregate_score": 0.964
64
+ },
65
+ "human_deception": {
66
+ "aggregate_score": 1.0
67
+ },
68
+ "memorization": {
69
+ "aggregate_score": 1.0
70
+ },
71
+ "privacy": {
72
+ "aggregate_score": 1.0
73
+ },
74
+ "fairllm": {
75
+ "aggregate_score": 0.18613375956573336
76
+ },
77
+ "mmlu_robustness": {
78
+ "aggregate_score": null
79
+ },
80
+ "training_data_suitability": {
81
+ "aggregate_score": null
82
+ },
83
+ "watermarking": {
84
+ "aggregate_score": null
85
+ }
86
+ }
87
+ }
results/Qwen/Qwen1.5-72B-Chat.json ADDED
@@ -0,0 +1,87 @@
1
+ {
2
+ "config": {
3
+ "model_name": "Qwen/Qwen1.5-72B-Chat",
4
+ "model_sha": "main",
5
+ "model_report": "https://compl-ai.org/evaluations/Qwen1.5-72B-Chat"
6
+ },
7
+ "results": {
8
+ "bbq": {
9
+ "aggregate_score": 0.9599664585022551
10
+ },
11
+ "bold": {
12
+ "aggregate_score": 0.7200614975210675
13
+ },
14
+ "toxicity": {
15
+ "aggregate_score": 0.9695524579515152
16
+ },
17
+ "toxicity_advbench": {
18
+ "aggregate_score": 0.9976900071559403
19
+ },
20
+ "forecasting_consistency": {
21
+ "aggregate_score": 0.6695121951219511
22
+ },
23
+ "self_check_consistency": {
24
+ "aggregate_score": 0.8449367088607596
25
+ },
26
+ "boolq_contrast_robustness": {
27
+ "aggregate_score": 0.8
28
+ },
29
+ "imdb_contrast_robustness": {
30
+ "aggregate_score": 0.48
31
+ },
32
+ "calibration_big_bench": {
33
+ "aggregate_score": 0.78625
34
+ },
35
+ "calibration_big_bench_i_know": {
36
+ "aggregate_score": 0.42761166020267727
37
+ },
38
+ "decoding_trust": {
39
+ "aggregate_score": 0.6800000000000002
40
+ },
41
+ "hellaswag": {
42
+ "aggregate_score": 0.8662617008564031
43
+ },
44
+ "human_eval": {
45
+ "aggregate_score": 0.5726708074534161
46
+ },
47
+ "instruction_goal_hijacking": {
48
+ "aggregate_score": 0.45360824742268047
49
+ },
50
+ "multiturn_goal_hijacking": {
51
+ "aggregate_score": 0.48560501495284103
52
+ },
53
+ "reddit_bias": {
54
+ "aggregate_score": 0.8261042236875162
55
+ },
56
+ "truthful_qa_mc2": {
57
+ "aggregate_score": 0.6389413688457826
58
+ },
59
+ "mmlu": {
60
+ "aggregate_score": 0.7768836348098561
61
+ },
62
+ "ai2_reasoning": {
63
+ "aggregate_score": 0.6825938566552902
64
+ },
65
+ "human_deception": {
66
+ "aggregate_score": 0.726027397260274
67
+ },
68
+ "memorization": {
69
+ "aggregate_score": 0.988
70
+ },
71
+ "privacy": {
72
+ "aggregate_score": 1.0
73
+ },
74
+ "fairllm": {
75
+ "aggregate_score": 0.06008018267012608
76
+ },
77
+ "mmlu_robustness": {
78
+ "aggregate_score": 0.7525454545454544
79
+ },
80
+ "training_data_suitability": {
81
+ "aggregate_score": null
82
+ },
83
+ "watermarking": {
84
+ "aggregate_score": null
85
+ }
86
+ }
87
+ }
results/google/gemma-2-9b.json ADDED
@@ -0,0 +1,81 @@
1
+ {
2
+ "config": {
3
+ "model_name": "google/gemma-2-9b",
4
+ "model_sha": "main",
5
+ "model_report": "https://compl-ai.org/evaluations/Google-gemma-2-9b"
6
+ },
7
+ "results": {
8
+ "bbq": {
9
+ "aggregate_score": 0.9839603777871013
10
+ },
11
+ "bold": {
12
+ "aggregate_score": 0.7370530984096585
13
+ },
14
+ "toxicity": {
15
+ "aggregate_score": 0.8414627564046196
16
+ },
17
+ "toxicity_advbench": {
18
+ "aggregate_score": 0.9254440618284906
19
+ },
20
+ "forecasting_consistency": {
21
+ "aggregate_score": 0.46973684210526323
22
+ },
23
+ "boolq_contrast_robustness": {
24
+ "aggregate_score": 0.5833333333333334
25
+ },
26
+ "imdb_contrast_robustness": {
27
+ "aggregate_score": 0.51
28
+ },
29
+ "calibration_big_bench": {
30
+ "aggregate_score": 0.80375
31
+ },
32
+ "calibration_big_bench_i_know": {
33
+ "aggregate_score": 0.013649443262854999
34
+ },
35
+ "decoding_trust": {
36
+ "aggregate_score": 0.94
37
+ },
38
+ "hellaswag": {
39
+ "aggregate_score": 0.35590519816769567
40
+ },
41
+ "human_eval": {
42
+ "aggregate_score": 0.0
43
+ },
44
+ "instruction_goal_hijacking": {
45
+ "aggregate_score": 0.4858247422680413
46
+ },
47
+ "multiturn_goal_hijacking": {
48
+ "aggregate_score": 0.499626178974005
49
+ },
50
+ "reddit_bias": {
51
+ "aggregate_score": 0.9823398039376345
52
+ },
53
+ "truthful_qa_mc2": {
54
+ "aggregate_score": 0.4643375169376196
55
+ },
56
+ "mmlu": {
57
+ "aggregate_score": 0.26356644352656317
58
+ },
59
+ "ai2_reasoning": {
60
+ "aggregate_score": 0.2295221843003413
61
+ },
62
+ "human_deception": {
63
+ "aggregate_score": 0.9863013698630136
64
+ },
65
+ "memorization": {
66
+ "aggregate_score": 0.996
67
+ },
68
+ "privacy": {
69
+ "aggregate_score": 1.0
70
+ },
71
+ "fairllm": {
72
+ "aggregate_score": 0.0
73
+ },
74
+ "mmlu_robustness": {
75
+ "aggregate_score": 0.26527272727272727
76
+ },
77
+ "training_data_suitability": {
78
+ "aggregate_score": null
79
+ }
80
+ }
81
+ }
results/gpt-3.5-turbo-0125/result_gpt-3.5-turbo-0125.json ADDED
@@ -0,0 +1,87 @@
1
+ {
2
+ "config": {
3
+ "model_name": "gpt-3.5-turbo-0125",
4
+ "model_sha": "main",
5
+ "model_report": "https://compl-ai.org/evaluations/gpt-3.5-turbo-0125"
6
+ },
7
+ "results": {
8
+ "bbq": {
9
+ "aggregate_score": 0.8798721047562755
10
+ },
11
+ "bold": {
12
+ "aggregate_score": 0.7320262917581274
13
+ },
14
+ "toxicity": {
15
+ "aggregate_score": 0.9388454625027057
16
+ },
17
+ "toxicity_advbench": {
18
+ "aggregate_score": 0.9905250918854458
19
+ },
20
+ "forecasting_consistency": {
21
+ "aggregate_score": 0.6739999999999999
22
+ },
23
+ "self_check_consistency": {
24
+ "aggregate_score": 0.8208955223880597
25
+ },
26
+ "boolq_contrast_robustness": {
27
+ "aggregate_score": 0.65
28
+ },
29
+ "imdb_contrast_robustness": {
30
+ "aggregate_score": 0.545
31
+ },
32
+ "calibration_big_bench": {
33
+ "aggregate_score": 0.95625
34
+ },
35
+ "calibration_big_bench_i_know": {
36
+ "aggregate_score": 0.9075691229826097
37
+ },
38
+ "decoding_trust": {
39
+ "aggregate_score": 0.835
40
+ },
41
+ "hellaswag": {
42
+ "aggregate_score": 0.855
43
+ },
44
+ "human_eval": {
45
+ "aggregate_score": 0.762
46
+ },
47
+ "instruction_goal_hijacking": {
48
+ "aggregate_score": null
49
+ },
50
+ "multiturn_goal_hijacking": {
51
+ "aggregate_score": 0.6549516908212559
52
+ },
53
+ "reddit_bias": {
54
+ "aggregate_score": null
55
+ },
56
+ "truthful_qa_mc2": {
57
+ "aggregate_score": null
58
+ },
59
+ "mmlu": {
60
+ "aggregate_score": 0.6811707733941034
61
+ },
62
+ "ai2_reasoning": {
63
+ "aggregate_score": 0.930976430976431
64
+ },
65
+ "human_deception": {
66
+ "aggregate_score": 0.589041095890411
67
+ },
68
+ "memorization": {
69
+ "aggregate_score": 0.992
70
+ },
71
+ "privacy": {
72
+ "aggregate_score": 1.0
73
+ },
74
+ "fairllm": {
75
+ "aggregate_score": 0.07798395569162107
76
+ },
77
+ "mmlu_robustness": {
78
+ "aggregate_score": 0.6952727272727273
79
+ },
80
+ "training_data_suitability": {
81
+ "aggregate_score": null
82
+ },
83
+ "watermarking": {
84
+ "aggregate_score": null
85
+ }
86
+ }
87
+ }
results/gpt-4-1106-preview/result_gpt-4-1106-preview.json ADDED
@@ -0,0 +1,87 @@
1
+ {
2
+ "config": {
3
+ "model_name": "gpt-4-1106-preview",
4
+ "model_sha": "main",
5
+ "model_report": "https://compl-ai.org/evaluations/gpt-4-1106-preview"
6
+ },
7
+ "results": {
8
+ "bbq": {
9
+ "aggregate_score": 0.9825415205108432
10
+ },
11
+ "bold": {
12
+ "aggregate_score": 0.7385998936629464
13
+ },
14
+ "toxicity": {
15
+ "aggregate_score": 0.969292480968859
16
+ },
17
+ "toxicity_advbench": {
18
+ "aggregate_score": 0.996174184883085
19
+ },
20
+ "forecasting_consistency": {
21
+ "aggregate_score": 0.7755813953488372
22
+ },
23
+ "self_check_consistency": {
24
+ "aggregate_score": 0.8656716417910448
25
+ },
26
+ "boolq_contrast_robustness": {
27
+ "aggregate_score": 0.8666666666666667
28
+ },
29
+ "imdb_contrast_robustness": {
30
+ "aggregate_score": 0.985
31
+ },
32
+ "calibration_big_bench": {
33
+ "aggregate_score": 1.0
34
+ },
35
+ "calibration_big_bench_i_know": {
36
+ "aggregate_score": 0.9538095833854623
37
+ },
38
+ "decoding_trust": {
39
+ "aggregate_score": 0.8799999999999999
40
+ },
41
+ "hellaswag": {
42
+ "aggregate_score": 0.953
43
+ },
44
+ "human_eval": {
45
+ "aggregate_score": 0.8370000000000001
46
+ },
47
+ "instruction_goal_hijacking": {
48
+ "aggregate_score": 0.6572164948453608
49
+ },
50
+ "multiturn_goal_hijacking": {
51
+ "aggregate_score": 0.882740970784449
52
+ },
53
+ "reddit_bias": {
54
+ "aggregate_score": null
55
+ },
56
+ "truthful_qa_mc2": {
57
+ "aggregate_score": null
58
+ },
59
+ "mmlu": {
60
+ "aggregate_score": 0.805
61
+ },
62
+ "ai2_reasoning": {
63
+ "aggregate_score": 0.964
64
+ },
65
+ "human_deception": {
66
+ "aggregate_score": 0.9726027397260274
67
+ },
68
+ "memorization": {
69
+ "aggregate_score": 0.998
70
+ },
71
+ "privacy": {
72
+ "aggregate_score": 1.0
73
+ },
74
+ "fairllm": {
75
+ "aggregate_score": 0.12940239570245562
76
+ },
77
+ "mmlu_robustness": {
78
+ "aggregate_score": 0.8049090909090908
79
+ },
80
+ "training_data_suitability": {
81
+ "aggregate_score": null
82
+ },
83
+ "watermarking": {
84
+ "aggregate_score": null
85
+ }
86
+ }
87
+ }
results/meta-llama/Llama-2-13b-chat-hf.json ADDED
@@ -0,0 +1,87 @@
1
+ {
2
+ "config": {
3
+ "model_name": "meta-llama/Llama-2-13b-chat-hf",
4
+ "model_sha": "main",
5
+ "model_report": "https://compl-ai.org/evaluations/Llama-2-13b-chat-hf"
6
+ },
7
+ "results": {
8
+ "bbq": {
9
+ "aggregate_score": 0.9296858841579962
10
+ },
11
+ "bold": {
12
+ "aggregate_score": 0.7190075716540155
13
+ },
14
+ "toxicity": {
15
+ "aggregate_score": 0.963705740028952
16
+ },
17
+ "toxicity_advbench": {
18
+ "aggregate_score": 0.9990298427566393
19
+ },
20
+ "forecasting_consistency": {
21
+ "aggregate_score": 0.5722222222222222
22
+ },
23
+ "self_check_consistency": {
24
+ "aggregate_score": 0.7451923076923077
25
+ },
26
+ "boolq_contrast_robustness": {
27
+ "aggregate_score": 0.25
28
+ },
29
+ "imdb_contrast_robustness": {
30
+ "aggregate_score": 0.4
31
+ },
32
+ "calibration_big_bench": {
33
+ "aggregate_score": 0.84625
34
+ },
35
+ "calibration_big_bench_i_know": {
36
+ "aggregate_score": 0.7748780182659827
37
+ },
38
+ "decoding_trust": {
39
+ "aggregate_score": 0.91
40
+ },
41
+ "hellaswag": {
42
+ "aggregate_score": 0.8240390360485959
43
+ },
44
+ "human_eval": {
45
+ "aggregate_score": 0.20621118012422357
46
+ },
47
+ "instruction_goal_hijacking": {
48
+ "aggregate_score": 0.41752577319587625
49
+ },
50
+ "multiturn_goal_hijacking": {
51
+ "aggregate_score": 0.3651598803772717
52
+ },
53
+ "reddit_bias": {
54
+ "aggregate_score": 0.7413109757904534
55
+ },
56
+ "truthful_qa_mc2": {
57
+ "aggregate_score": 0.43962155328662317
58
+ },
59
+ "mmlu": {
60
+ "aggregate_score": 0.5351089588377724
61
+ },
62
+ "ai2_reasoning": {
63
+ "aggregate_score": 0.5938566552901023
64
+ },
65
+ "human_deception": {
66
+ "aggregate_score": 1.0
67
+ },
68
+ "memorization": {
69
+ "aggregate_score": 0.989
70
+ },
71
+ "privacy": {
72
+ "aggregate_score": 1.0
73
+ },
74
+ "fairllm": {
75
+ "aggregate_score": 0.14127747715247715
76
+ },
77
+ "mmlu_robustness": {
78
+ "aggregate_score": 0.5076363636363636
79
+ },
80
+ "training_data_suitability": {
81
+ "aggregate_score": null
82
+ },
83
+ "watermarking": {
84
+ "aggregate_score": null
85
+ }
86
+ }
87
+ }
results/meta-llama/Llama-2-70b-chat-hf.json ADDED
@@ -0,0 +1,87 @@
1
+ {
2
+ "config": {
3
+ "model_name": "meta-llama/Llama-2-70b-chat-hf",
4
+ "model_sha": "main",
5
+ "model_report": "https://compl-ai.org/evaluations/Llama-2-70b-chat-hf"
6
+ },
7
+ "results": {
8
+ "bbq": {
9
+ "aggregate_score": 0.9498141943726044
10
+ },
11
+ "bold": {
12
+ "aggregate_score": 0.7252454731129678
13
+ },
14
+ "toxicity": {
15
+ "aggregate_score": 0.9409038541427155
16
+ },
17
+ "toxicity_advbench": {
18
+ "aggregate_score": 0.9988042732774585
19
+ },
20
+ "forecasting_consistency": {
21
+ "aggregate_score": 0.7271428571428571
22
+ },
23
+ "self_check_consistency": {
24
+ "aggregate_score": 0.7466666666666666
25
+ },
26
+ "boolq_contrast_robustness": {
27
+ "aggregate_score": 0.7166666666666667
28
+ },
29
+ "imdb_contrast_robustness": {
30
+ "aggregate_score": 0.42
31
+ },
32
+ "calibration_big_bench": {
33
+ "aggregate_score": 0.8825
34
+ },
35
+ "calibration_big_bench_i_know": {
36
+ "aggregate_score": 0.8323999999999999
37
+ },
38
+ "decoding_trust": {
39
+ "aggregate_score": 0.85
40
+ },
41
+ "hellaswag": {
42
+ "aggregate_score": 0.8578968333001394
43
+ },
44
+ "human_eval": {
45
+ "aggregate_score": 0.31242236024844716
46
+ },
47
+ "instruction_goal_hijacking": {
48
+ "aggregate_score": 0.4278350515463918
49
+ },
50
+ "multiturn_goal_hijacking": {
51
+ "aggregate_score": 0.38218886588451806
52
+ },
53
+ "reddit_bias": {
54
+ "aggregate_score": 0.352451665752182
55
+ },
56
+ "truthful_qa_mc2": {
57
+ "aggregate_score": 0.5276263233531145
58
+ },
59
+ "mmlu": {
60
+ "aggregate_score": 0.6320324740065518
61
+ },
62
+ "ai2_reasoning": {
63
+ "aggregate_score": 0.6476109215017065
64
+ },
65
+ "human_deception": {
66
+ "aggregate_score": 0.8904109589041096
67
+ },
68
+ "memorization": {
69
+ "aggregate_score": 0.988
70
+ },
71
+ "privacy": {
72
+ "aggregate_score": 1.0
73
+ },
74
+ "fairllm": {
75
+ "aggregate_score": 0.41367733201077794
76
+ },
77
+ "mmlu_robustness": {
78
+ "aggregate_score": 0.6018181818181818
79
+ },
80
+ "training_data_suitability": {
81
+ "aggregate_score": null
82
+ },
83
+ "watermarking": {
84
+ "aggregate_score": null
85
+ }
86
+ }
87
+ }
results/meta-llama/Llama-2-7b-chat-hf.json ADDED
@@ -0,0 +1,87 @@
1
+ {
2
+ "config": {
3
+ "model_name": "meta-llama/Llama-2-7b-chat-hf",
4
+ "model_sha": "main",
5
+ "model_report": "https://compl-ai.org/evaluations/Llama-2-7b-chat-hf"
6
+ },
7
+ "results": {
8
+ "bbq": {
9
+ "aggregate_score": 0.9548491021218211
10
+ },
11
+ "bold": {
12
+ "aggregate_score": 0.6798469370104644
13
+ },
14
+ "toxicity": {
15
+ "aggregate_score": 0.9646512368479587
16
+ },
17
+ "toxicity_advbench": {
18
+ "aggregate_score": 0.9990993229758505
19
+ },
20
+ "forecasting_consistency": {
21
+ "aggregate_score": 0.6043478260869565
22
+ },
23
+ "self_check_consistency": {
24
+ "aggregate_score": 0.6666666666666667
25
+ },
26
+ "boolq_contrast_robustness": {
27
+ "aggregate_score": 0.2833333333333333
28
+ },
29
+ "imdb_contrast_robustness": {
30
+ "aggregate_score": 0.48
31
+ },
32
+ "calibration_big_bench": {
33
+ "aggregate_score": 0.865
34
+ },
35
+ "calibration_big_bench_i_know": {
36
+ "aggregate_score": 0.7370949580883273
37
+ },
38
+ "decoding_trust": {
39
+ "aggregate_score": 0.9299999999999999
40
+ },
41
+ "hellaswag": {
42
+ "aggregate_score": 0.7868950408285202
43
+ },
44
+ "human_eval": {
45
+ "aggregate_score": 0.14596273291925466
46
+ },
47
+ "instruction_goal_hijacking": {
48
+ "aggregate_score": 0.5141752577319587
49
+ },
50
+ "multiturn_goal_hijacking": {
51
+ "aggregate_score": 0.26991028295376124
52
+ },
53
+ "reddit_bias": {
54
+ "aggregate_score": 0.32916275270076745
55
+ },
56
+ "truthful_qa_mc2": {
57
+ "aggregate_score": 0.4532195083848858
58
+ },
59
+ "mmlu": {
60
+ "aggregate_score": 0.4721549636803874
61
+ },
62
+ "ai2_reasoning": {
63
+ "aggregate_score": 0.5452218430034129
64
+ },
65
+ "human_deception": {
66
+ "aggregate_score": 0.9315068493150684
67
+ },
68
+ "memorization": {
69
+ "aggregate_score": 0.992
70
+ },
71
+ "privacy": {
72
+ "aggregate_score": 1.0
73
+ },
74
+ "fairllm": {
75
+ "aggregate_score": 0.08752953475948229
76
+ },
77
+ "mmlu_robustness": {
78
+ "aggregate_score": 0.44909090909090915
79
+ },
80
+ "training_data_suitability": {
81
+ "aggregate_score": null
82
+ },
83
+ "watermarking": {
84
+ "aggregate_score": null
85
+ }
86
+ }
87
+ }
results/mistralai/Mistral-7B-Instruct-v0.2.json ADDED
@@ -0,0 +1,87 @@
1
+ {
2
+ "config": {
3
+ "model_name": "mistralai/Mistral-7B-Instruct-v0.2",
4
+ "model_sha": "main",
5
+ "model_report": "https://compl-ai.org/evaluations/Mistral-7B-Instruct-v0.2"
6
+ },
7
+ "results": {
8
+ "bbq": {
9
+ "aggregate_score": 0.9040460470649646
10
+ },
11
+ "bold": {
12
+ "aggregate_score": 0.747184860969805
13
+ },
14
+ "toxicity": {
15
+ "aggregate_score": 0.9607862783311598
16
+ },
17
+ "toxicity_advbench": {
18
+ "aggregate_score": 0.9940165055485872
19
+ },
20
+ "forecasting_consistency": {
21
+ "aggregate_score": 0.5797872340425532
22
+ },
23
+ "self_check_consistency": {
24
+ "aggregate_score": 0.6974789915966386
25
+ },
26
+ "boolq_contrast_robustness": {
27
+ "aggregate_score": 0.2833333333333333
28
+ },
29
+ "imdb_contrast_robustness": {
30
+ "aggregate_score": 0.12
31
+ },
32
+ "calibration_big_bench": {
33
+ "aggregate_score": 0.93375
34
+ },
35
+ "calibration_big_bench_i_know": {
36
+ "aggregate_score": 0.6862379582134368
37
+ },
38
+ "decoding_trust": {
39
+ "aggregate_score": 1.0
40
+ },
41
+ "hellaswag": {
42
+ "aggregate_score": 0.8477394941246763
43
+ },
44
+ "human_eval": {
45
+ "aggregate_score": 0.4018633540372671
46
+ },
47
+ "instruction_goal_hijacking": {
48
+ "aggregate_score": 0.3118556701030928
49
+ },
50
+ "multiturn_goal_hijacking": {
51
+ "aggregate_score": 0.23205659075224297
52
+ },
53
+ "reddit_bias": {
54
+ "aggregate_score": 0.659153609956636
55
+ },
56
+ "truthful_qa_mc2": {
57
+ "aggregate_score": 0.6681153383230657
58
+ },
59
+ "mmlu": {
60
+ "aggregate_score": 0.5910838911835921
61
+ },
62
+ "ai2_reasoning": {
63
+ "aggregate_score": 0.6407849829351536
64
+ },
65
+ "human_deception": {
66
+ "aggregate_score": 0.9863013698630136
67
+ },
68
+ "memorization": {
69
+ "aggregate_score": 0.988
70
+ },
71
+ "privacy": {
72
+ "aggregate_score": 1.0
73
+ },
74
+ "fairllm": {
75
+ "aggregate_score": 0.019112659939765738
76
+ },
77
+ "mmlu_robustness": {
78
+ "aggregate_score": 0.5776363636363636
79
+ },
80
+ "training_data_suitability": {
81
+ "aggregate_score": null
82
+ },
83
+ "watermarking": {
84
+ "aggregate_score": null
85
+ }
86
+ }
87
+ }
results/mistralai/Mixtral-8x7B-Instruct-v0.1.json ADDED
@@ -0,0 +1,87 @@
1
+ {
2
+ "config": {
3
+ "model_name": "mistralai/Mixtral-8x7B-Instruct-v0.1",
4
+ "model_sha": "main",
5
+ "model_report": "https://compl-ai.org/evaluations/Mixtral-8x7B-Instruct-v0.1"
6
+ },
7
+ "results": {
8
+ "bbq": {
9
+ "aggregate_score": 0.949723812859033
10
+ },
11
+ "bold": {
12
+ "aggregate_score": 0.734901669263283
13
+ },
14
+ "toxicity": {
15
+ "aggregate_score": 0.9581340294360416
16
+ },
17
+ "toxicity_advbench": {
18
+ "aggregate_score": 0.9932040768299395
19
+ },
20
+ "forecasting_consistency": {
21
+ "aggregate_score": 0.6377777777777777
22
+ },
23
+ "self_check_consistency": {
24
+ "aggregate_score": 0.7877094972067039
25
+ },
26
+ "boolq_contrast_robustness": {
27
+ "aggregate_score": 0.35
28
+ },
29
+ "imdb_contrast_robustness": {
30
+ "aggregate_score": 0.47
31
+ },
32
+ "calibration_big_bench": {
33
+ "aggregate_score": 0.9037499999999999
34
+ },
35
+ "calibration_big_bench_i_know": {
36
+ "aggregate_score": 0.8543725760040035
37
+ },
38
+ "decoding_trust": {
39
+ "aggregate_score": 0.9299999999999999
40
+ },
41
+ "hellaswag": {
42
+ "aggregate_score": 0.8755228042222665
43
+ },
44
+ "human_eval": {
45
+ "aggregate_score": 0.475776397515528
46
+ },
47
+ "instruction_goal_hijacking": {
48
+ "aggregate_score": 0.375
49
+ },
50
+ "multiturn_goal_hijacking": {
51
+ "aggregate_score": 0.2561249137336094
52
+ },
53
+ "reddit_bias": {
54
+ "aggregate_score": 0.5429049297532
55
+ },
56
+ "truthful_qa_mc2": {
57
+ "aggregate_score": 0.6458557121081614
58
+ },
59
+ "mmlu": {
60
+ "aggregate_score": 0.7031761857285287
61
+ },
62
+ "ai2_reasoning": {
63
+ "aggregate_score": 0.7090443686006825
64
+ },
65
+ "human_deception": {
66
+ "aggregate_score": 0.8904109589041096
67
+ },
68
+ "memorization": {
69
+ "aggregate_score": 0.983
70
+ },
71
+ "privacy": {
72
+ "aggregate_score": 1.0
73
+ },
74
+ "fairllm": {
75
+ "aggregate_score": 0.05770831155558887
76
+ },
77
+ "mmlu_robustness": {
78
+ "aggregate_score": 0.6941818181818181
79
+ },
80
+ "training_data_suitability": {
81
+ "aggregate_score": null
82
+ },
83
+ "watermarking": {
84
+ "aggregate_score": null
85
+ }
86
+ }
87
+ }
src/display/about.py CHANGED
@@ -15,7 +15,7 @@ class Tasks(Enum):
      task1 = Task("bold", "aggregate_score", "Biased Completions: BOLD")
      task2 = Task("toxicity", "aggregate_score", "Toxic Completions of Benign Text: RealToxicityPrompts")
      task3 = Task("toxicity_advbench", "aggregate_score", "Following Harmful Instructions: AdvBench")
-     task4 = Task("forecasting_consistency", "aggregate_score", "Monotonicity Checks: Forecasting Consistency")
+     task4 = Task("forecasting_consistency", "aggregate_score", "Monotonicity Checks")
      task5 = Task("self_check_consistency", "aggregate_score", "Self-Check Consistency")
      task6 = Task("boolq_contrast_robustness", "aggregate_score", "BoolQ Contrast Set")
      task7 = Task("imdb_contrast_robustness", "aggregate_score", "IMDB Contrast Set")
@@ -24,8 +24,8 @@ class Tasks(Enum):
      task10 = Task("decoding_trust", "aggregate_score", "Income Fairness: DecodingTrust")
      task11 = Task("hellaswag", "aggregate_score", "Common Sense Reasoning: HellaSwag")
      task12 = Task("human_eval", "aggregate_score", "Coding: HumanEval")
-     task13 = Task("instruction_goal_hijacking", "aggregate_score", "Goal Hijacking and Prompt Leakage: TensorTrust")
-     task14 = Task("multiturn_goal_hijacking", "aggregate_score", "Rule Following: LLM RuLES")
+     task13 = Task("instruction_goal_hijacking", "aggregate_score", "Goal Hijacking and Prompt Leakage")
+     task14 = Task("multiturn_goal_hijacking", "aggregate_score", "Rule Following")
      task15 = Task("reddit_bias", "aggregate_score", "Representation Bias: RedditBias")
      task16 = Task("truthful_qa_mc2", "aggregate_score", "Truthfulness: TruthfulQA MC2")
      task17 = Task("mmlu", "aggregate_score", "General Knowledge: MMLU")
@@ -35,8 +35,10 @@ class Tasks(Enum):
      task21 = Task("privacy", "aggregate_score", "PII Extraction by Association")
      task22 = Task("fairllm", "aggregate_score", "Recommendation Consistency: FaiRLLM")
      task23 = Task("mmlu_robustness", "aggregate_score", "MMLU: Robustness")
-     task24 = Task("training_data_suitability", "aggregate_score", "Training Data Suitability")
-     task25 = Task("watermarking", "aggregate_score", "Watermark Reliability & Robustness")
+     # task24 = Task("training_data_suitability", "aggregate_score", "Training Data Suitability")
+     task24 = Task("watermarking", "aggregate_score", "Watermark Reliability & Robustness")
+     task25 = Task("dataset_bias", "aggregate_score", "Bias of the Dataset")
+     task26 = Task("dataset_toxicity", "aggregate_score", "Toxicity of the Dataset")
 
 
 
@@ -44,9 +46,6 @@ class Tasks(Enum):
  # Your leaderboard name
  TITLE = """<h1 align="center" id="space-title">EU AI Act Compliance Leaderboard</h1>"""
 
- # What does your leaderboard evaluate?
- INTRODUCTION_TEXT = """<p style="font-size: 16px;">COMPL-AI is an open-source compliance-centered evaluation framework for Generative AI models. It includes the ability to evaluate the regulatory technical requirements on a benchmarking suite containing 27 SOTA LLM benchmarks. The benchmark suite and technical interpretations are both open-source and open to community contributions. For more information, please visit <a href="https://compl-ai.org" target="_blank">compl-ai.org</a>.</p>"""
-
  # Which evaluations are you running? how can people reproduce what you have?
  LLM_BENCHMARKS_TEXT = f"""
  """
src/display/css_html_js.py CHANGED
@@ -1,4 +1,11 @@
1
  custom_css = """
2
  /* Hides the final AutoEvalColumn */
3
  #llm-benchmark-tab-table table td:last-child,
4
  #llm-benchmark-tab-table table th:last-child {
@@ -21,6 +28,8 @@ table {
21
  /* Full width space */
22
  .gradio-container {
23
  max-width: 95%!important;
24
  }
25
 
26
  /* Text style and margins */
@@ -51,6 +60,14 @@ table {
51
  .tab-buttons button {
52
  font-size: 20px;
53
  }
 
55
  /* Filters style */
56
  #filter_type{
@@ -86,6 +103,153 @@ table {
86
  border: 0
87
  }
88
 
89
  """
90
 
91
  get_window_url_params = """
 
1
  custom_css = """
2
+
3
+ :root {
4
+ --block-radius: 0px !important;
5
+ --table-radius: 0px !important;
6
+ --input-radius: 0px !important;
7
+ }
8
+
9
  /* Hides the final AutoEvalColumn */
10
  #llm-benchmark-tab-table table td:last-child,
11
  #llm-benchmark-tab-table table th:last-child {
 
28
  /* Full width space */
29
  .gradio-container {
30
  max-width: 95%!important;
31
+ font-family: Open Sans,sans-serif;
32
+ line-height: 1.75em !important;
33
  }
34
 
35
  /* Text style and margins */
 
60
  .tab-buttons button {
61
  font-size: 20px;
62
  }
63
+ .tab-buttons {
64
+ padding-top: 40px;
65
+ }
66
+
67
+ /* Center Tabs */
68
+ .tab-buttons > div > div:nth-child(2) {
69
+ justify-content: center;
70
+ }
71
 
72
  /* Filters style */
73
  #filter_type{
 
103
  border: 0
104
  }
105
 
106
+ #intro {
107
+ padding: 40px 0;
108
+ border: 1px solid var(--border-color-primary);
109
+ }
110
+
111
+ #intro > div {
112
+ padding-left: 2em;
113
+ padding-right: 2em;
114
+ min-width: 0px !important;
115
+ }
116
+
117
+ .image_header {
118
+ display: flex;
119
+ gap: 15px;
120
+ align-items: center;
121
+ }
122
+
123
+
124
+ p {
125
+ font-weight: 400;
126
+ font-style: normal;
127
+ font-size: 14px;
128
+ line-height: 1.75em !important;
129
+ }
130
+
131
+ .button {
132
+ border: 1px solid #174DA3;
133
+ font-family: IBM Plex Mono,monospace;
134
+ background: none;
135
+ padding: 5px 15px;
136
+ color: #174DA3 !important;
137
+ position: relative;
138
+ font-size: 14px;
139
+ font-weight: 500;
140
+ transition: background-color .15s ease;
141
+ display: inline-flex;
142
+ align-items: center;
143
+ text-decoration: none !important;
144
+ line-height: 1.75em !important;
145
+ }
146
+
147
+ .button:hover {
148
+ cursor: pointer;
149
+ background: #EBEEF4;
150
+ }
151
+
152
+ #llm-benchmark-tab-table-button {
153
+ border-top-right-radius: unset !important;
154
+ border-top-left-radius: unset !important;
155
+ font-size: 18px !important;
156
+ font-weight: 500 !important;
157
+ }
158
+
159
+ label {
160
+ background: unset !important;
161
+ border-radius: 0 !important;
162
+ box-shadow: unset !important;
163
+ }
164
+
165
+ label > input {
166
+ border-radius: 0 !important;
167
+ }
168
+
169
+ form {
170
+ border-radius: 0 !important;
171
+ }
172
+
173
+ .principle_header {
174
+ padding: 10px 20px;
175
+ background-color: #EBEEF4;
176
+ border: 1px solid var(--border-color-primary);
177
+ }
178
+
179
+ .technical_requirements {
180
+ margin-top: -17px;
181
+ gap: 0px;
182
+ align-items: stretch;
183
+ }
184
+
185
+ .technical_requirements > div {
186
+ gap: 0px;
187
+
188
+ }
189
+
190
+ .technical_requirements > div > div.form {
191
+ border: unset !important;
192
+ }
193
+
194
+ .border_mid > div {
195
+ border-left: 1px solid var(--border-color-primary);
196
+ border-right: 1px solid var(--border-color-primary);
197
+ }
198
+
199
+ .border_bot > div {
200
+ border-left: 1px solid var(--border-color-primary);
201
+ border-right: 1px solid var(--border-color-primary);
202
+ border-bottom: 1px solid var(--border-color-primary);
203
+ }
204
+
205
+ @media only screen and (max-width: 1200px) {
206
+ .empty {
207
+ visibility: hidden;
208
+ display: none;
209
+ }
210
+
211
+ }
212
+
213
+ @media only screen and (max-width: 800px) {
214
+ .empty {
215
+ visibility: hidden;
216
+ display: none;
217
+ }
218
+
219
+ #intro {
220
+ flex-direction: column;
221
+ gap: 48px;
222
+ }
223
+ }
224
+
225
+ .principle_icon {
226
+ max-height:24px;
227
+ }
228
+
229
+ .github_icon {
230
+ max-height:24px;
231
+ padding-right: 1em;
232
+ }
233
+
234
+ @media (prefers-color-scheme: dark) {
235
+ .principle_header {
236
+ background-color: var(--block-background-fill);
237
+ }
238
+
239
+ .button {
240
+ border: 1px solid var(--color-accent);
241
+ color: var(--color-accent) !important;
242
+ }
243
+
244
+ .principle_icon {
245
+ filter: brightness(2);
246
+ }
247
+
248
+ .github_icon {
249
+ filter: brightness(2);
250
+ }
251
+ }
252
+
253
  """
254
 
255
  get_window_url_params = """
src/display/utils.py CHANGED
@@ -26,7 +26,7 @@ class ColumnContent:
26
  ## Leaderboard columns
27
  auto_eval_column_dict = [["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)],
28
  ["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)],
29
- ["model_report", ColumnContent, ColumnContent("Full Report", "markdown", True, never_hidden=False)]
30
  ]
31
  # Init
32
  # Scores
 
26
  ## Leaderboard columns
27
  auto_eval_column_dict = [["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)],
28
  ["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)],
29
+ ["model_report", ColumnContent, ColumnContent("Report", "markdown", True, never_hidden=True)]
30
  ]
31
  # Init
32
  # Scores
src/envs.py CHANGED
@@ -6,14 +6,12 @@ from huggingface_hub import HfApi
  TOKEN = os.environ.get("TOKEN", None)

  OWNER = "latticeflow"
- REPO_ID = f"{OWNER}/compl-ai-board"
  QUEUE_REPO = f"{OWNER}/requests"
- RESULTS_REPO = f"{OWNER}/results"

  CACHE_PATH = os.getenv("HF_HOME", ".")

  # Local caches
  EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "requests")
- EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "results")

  API = HfApi(token=TOKEN)

  TOKEN = os.environ.get("TOKEN", None)

  OWNER = "latticeflow"
+ # REPO_ID = f"{OWNER}/compl-ai-leaderboard"
  QUEUE_REPO = f"{OWNER}/requests"

  CACHE_PATH = os.getenv("HF_HOME", ".")

  # Local caches
  EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "requests")

  API = HfApi(token=TOKEN)
src/leaderboard/read_evals.py CHANGED
@@ -12,7 +12,7 @@ from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, Weigh
  from src.submission.check_validity import is_model_on_hub

  def report_hyperlink(link):
- return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">🔗 Full Report</a>' if link else "N/A"

  @dataclass
  class EvalResult:
@@ -40,7 +40,7 @@ class EvalResult:
  data = json.load(fp)

  config = data.get("config")
- print(json_filepath)
  # Precision
  # precision = Precision.from_str(config.get("model_dtype"))

@@ -76,12 +76,12 @@ class EvalResult:

  accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
  if accs.size == 0 or any([acc is None for acc in accs]):
- print('skip', full_model)
  results[task.benchmark] = None
  continue

- print(task)
- print(accs)
  mean_acc = np.mean(accs) # * 100.0
  results[task.benchmark] = round(mean_acc, 2)

@@ -108,8 +108,8 @@ class EvalResult:
  try:
  with open(request_file, "r") as f:
  request = json.load(f)
- print(f"Read Request from {request_file}")
- print(request)
  # self.model_type = ModelType.from_str("open" if "/" in self.full_model and "openai" not in self.full_model else "closed")
  # self.model_type = ModelType.from_str("open" if self.still_on_hub else "closed")
  self.model_type = ModelType.from_str("open" if "/" in self.full_model and "openai" not in self.full_model else "closed")
@@ -119,7 +119,7 @@ class EvalResult:
  self.num_params = request.get("params", None)
  self.date = request.get("submitted_time", "")
  except Exception as e:
- print(e)
  self.model_type = ModelType.from_str("open" if "/" in self.full_model and "openai" not in self.full_model else "closed")
  print(f"Could not find request file ({requests_path}) for {self.org}/{self.model}")

@@ -158,9 +158,9 @@ def get_request_file_for_model(requests_path, model_name, revision=""):
  requests_path,
  f"**/request_{model_name}*_eval_request*.json"
  )
- print(f"Looking up request file(s) with pattern {request_files}")
  request_files = glob.glob(request_files, recursive=True)
- print(f"Found request file(s) {request_files}")

  # Select correct request file (precision)
  request_file = ""
@@ -174,7 +174,7 @@ def get_request_file_for_model(requests_path, model_name, revision=""):
  # and req_content["precision"] == precision.split(".")[-1]
  ):
  request_file = tmp_request_file
- print(f"Selected {request_file} for model metadata")
  return request_file


@@ -200,10 +200,10 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
  for model_result_filepath in model_result_filepaths:
  # Creation of result
  eval_result = EvalResult.init_from_json_file(model_result_filepath)
- print()
- print('eval result')
- print(eval_result)
- print()
  eval_result.update_with_request_file(requests_path)

  # Store results of same eval together
@@ -217,9 +217,9 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu

  for v in eval_results.values():
  try:
- print()
- print(v)
- print()
  v.to_dict() # we test if the dict version is complete
  results.append(v)
  except KeyError: # not all eval values present

  from src.submission.check_validity import is_model_on_hub

  def report_hyperlink(link):
+ return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">🔗 Report</a>' if link else "N/A"

  @dataclass
  class EvalResult:

  data = json.load(fp)

  config = data.get("config")
+ # print(json_filepath)
  # Precision
  # precision = Precision.from_str(config.get("model_dtype"))


  accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
  if accs.size == 0 or any([acc is None for acc in accs]):
+ # print('skip', full_model)
  results[task.benchmark] = None
  continue

+ # print(task)
+ # print(accs)
  mean_acc = np.mean(accs) # * 100.0
  results[task.benchmark] = round(mean_acc, 2)


  try:
  with open(request_file, "r") as f:
  request = json.load(f)
+ # print(f"Read Request from {request_file}")
+ # print(request)
  # self.model_type = ModelType.from_str("open" if "/" in self.full_model and "openai" not in self.full_model else "closed")
  # self.model_type = ModelType.from_str("open" if self.still_on_hub else "closed")
  self.model_type = ModelType.from_str("open" if "/" in self.full_model and "openai" not in self.full_model else "closed")

  self.num_params = request.get("params", None)
  self.date = request.get("submitted_time", "")
  except Exception as e:
+ # print(e)
  self.model_type = ModelType.from_str("open" if "/" in self.full_model and "openai" not in self.full_model else "closed")
  print(f"Could not find request file ({requests_path}) for {self.org}/{self.model}")


  requests_path,
  f"**/request_{model_name}*_eval_request*.json"
  )
+ # print(f"Looking up request file(s) with pattern {request_files}")
  request_files = glob.glob(request_files, recursive=True)
+ # print(f"Found request file(s) {request_files}")

  # Select correct request file (precision)
  request_file = ""

  # and req_content["precision"] == precision.split(".")[-1]
  ):
  request_file = tmp_request_file
+ # print(f"Selected {request_file} for model metadata")
  return request_file



  for model_result_filepath in model_result_filepaths:
  # Creation of result
  eval_result = EvalResult.init_from_json_file(model_result_filepath)
+ # print()
+ # print('eval result')
+ # print(eval_result)
+ # print()
  eval_result.update_with_request_file(requests_path)

  # Store results of same eval together


  for v in eval_results.values():
  try:
+ # print()
+ # print(v)
+ # print()
  v.to_dict() # we test if the dict version is complete
  results.append(v)
  except KeyError: # not all eval values present
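Most of this diff simply comments out ad-hoc debug prints. If that output is still occasionally useful, the standard-library logging module keeps it available behind a level switch instead of as dead code; a small sketch of that alternative (an option, not what the commit itself does):

import logging

logger = logging.getLogger(__name__)
# Enable verbose output only when debugging locally:
# logging.basicConfig(level=logging.DEBUG)

def report_hyperlink(link):
    logger.debug("Building report link for %s", link)
    return (
        f'<a target="_blank" href="{link}" style="color: var(--link-text-color); '
        f'text-decoration: underline;text-decoration-style: dotted;">🔗 Report</a>'
        if link
        else "N/A"
    )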
src/populate.py CHANGED
@@ -11,7 +11,7 @@ from src.leaderboard.read_evals import get_raw_eval_results
  def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
  raw_data = get_raw_eval_results(results_path, requests_path)
  all_data_json = [v.to_dict() for v in raw_data]
- print(all_data_json)
  df = pd.DataFrame.from_records(all_data_json)
  # df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
  df = df[cols].round(decimals=2)
@@ -40,7 +40,7 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
  sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
  for sub_entry in sub_entries:
  file_path = os.path.join(save_path, entry, sub_entry)
- print(file_path)
  with open(file_path) as fp:
  data = json.load(fp)

  def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
  raw_data = get_raw_eval_results(results_path, requests_path)
  all_data_json = [v.to_dict() for v in raw_data]
+ # print(all_data_json)
  df = pd.DataFrame.from_records(all_data_json)
  # df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
  df = df[cols].round(decimals=2)

  sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
  for sub_entry in sub_entries:
  file_path = os.path.join(save_path, entry, sub_entry)
+ # print(file_path)
  with open(file_path) as fp:
  data = json.load(fp)
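For reference, the DataFrame step in get_leaderboard_df is a plain records-to-table conversion; a self-contained sketch with made-up records (column names and scores invented for illustration) showing what from_records plus the column subset and rounding yield:

import pandas as pd

# Hypothetical to_dict() outputs of two EvalResult objects.
records = [
    {"Model": "org/model-a", "Report": "N/A", "mmlu": 0.6312, "hellaswag": 0.8123},
    {"Model": "org/model-b", "Report": "N/A", "mmlu": 0.5541, "hellaswag": 0.7799},
]
cols = ["Model", "Report", "mmlu", "hellaswag"]

df = pd.DataFrame.from_records(records)  # one row per submitted model
df = df[cols].round(decimals=2)          # keep configured columns, round the scores
print(df)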