pavol-bielik committed
Commit b615923 • Parent: d799cb2

add principles and technical requirements mapping
Files changed:
- README.md +1 -1
- app.py +336 -140
- requirements.txt +2 -2
- results/01-ai/Yi-34B-Chat.json +87 -0
- results/Claude3Opus/result_Claude3Opus.json +87 -0
- results/Qwen/Qwen1.5-72B-Chat.json +87 -0
- results/google/gemma-2-9b.json +81 -0
- results/gpt-3.5-turbo-0125/result_gpt-3.5-turbo-0125.json +87 -0
- results/gpt-4-1106-preview/result_gpt-4-1106-preview.json +87 -0
- results/meta-llama/Llama-2-13b-chat-hf.json +87 -0
- results/meta-llama/Llama-2-70b-chat-hf.json +87 -0
- results/meta-llama/Llama-2-7b-chat-hf.json +87 -0
- results/mistralai/Mistral-7B-Instruct-v0.2.json +87 -0
- results/mistralai/Mixtral-8x7B-Instruct-v0.1.json +87 -0
- src/display/about.py +7 -8
- src/display/css_html_js.py +164 -0
- src/display/utils.py +1 -1
- src/envs.py +1 -3
- src/leaderboard/read_evals.py +18 -18
- src/populate.py +2 -2
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 🥇
 colorFrom: green
 colorTo: indigo
 sdk: gradio
-sdk_version: 4.
+sdk_version: 5.4.0
 app_file: app.py
 pinned: true
 license: apache-2.0
app.py CHANGED
@@ -1,14 +1,14 @@
+import functools
+from pathlib import Path
+
 import gradio as gr
 import pandas as pd
-from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import snapshot_download
 
 from src.display.about import (
     CITATION_BUTTON_LABEL,
     CITATION_BUTTON_TEXT,
     EVALUATION_QUEUE_TEXT,
-    INTRODUCTION_TEXT,
-    LLM_BENCHMARKS_TEXT,
     TITLE,
 )
 from src.display.css_html_js import custom_css
@@ -25,23 +25,11 @@ from src.display.utils import (
     WeightType,
     Precision
 )
-from src.envs import
+from src.envs import EVAL_REQUESTS_PATH, QUEUE_REPO
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
-import time
-import requests
-
 
-
-restart = False
-while not restart:
-    try:
-        API.restart_space(repo_id=REPO_ID, token=TOKEN)
-    except requests.exceptions.ConnectionError as e:
-        print("Restart failed. Re-trying...")
-        time.sleep(30)
-        continue
-    restart = True
+
+EVAL_RESULTS_PATH = str(Path(__file__).resolve().parent / "results")
 
 
 try:
@@ -50,14 +38,8 @@ try:
         repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
     )
 except Exception:
-    restart_space()
-
-print(EVAL_RESULTS_PATH)
-snapshot_download(
-    repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
-)
-except Exception:
-    restart_space()
+    # restart_space()
+    pass
 
 raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 leaderboard_df = original_df.copy()
@@ -83,6 +65,23 @@ def update_table(
     return df
 
 
+def update_principles_table(
+    df,
+    *args: list,
+) -> pd.DataFrame:
+    columns = [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
+    for shown_column in args:
+        if isinstance(shown_column, gr.components.CheckboxGroup):
+            columns.extend(shown_column.value)
+        else:
+            columns.extend(shown_column)
+
+    # dummy column for querying (not shown)
+    columns.append("model_name_for_query")
+    return df[columns]
+
+
 def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
     return df[(df[AutoEvalColumn.dummy.name].str.contains(query, case=False))]
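The new update_principles_table helper above is what the per-principle tables added further down are built on: it always keeps the never-hidden columns, appends whichever benchmark columns each per-category CheckboxGroup currently selects, and finally adds the hidden model_name_for_query column used for searching. A minimal sketch of that selection logic outside Gradio (hypothetical column names, with plain lists standing in for the CheckboxGroup payloads):

import pandas as pd

df = pd.DataFrame({
    "Model": ["m1", "m2"],                             # a never-hidden column
    "Rule Following": [0.58, 0.23],                    # benchmark columns
    "Goal Hijacking and Prompt Leakage": [0.54, 0.31],
    "model_name_for_query": ["org/m1", "org/m2"],      # dummy column for querying
})

columns = ["Model"]                                    # never-hidden columns come first
for shown in (["Rule Following"], ["Goal Hijacking and Prompt Leakage"]):
    columns.extend(shown)                              # each .change() payload arrives as a list
columns.append("model_name_for_query")
print(df[columns])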
@@ -139,112 +138,307 @@ def filter_models(
     return filtered_df
 
 
-
-
-
-
-
-    with gr.TabItem("🏅 Results", elem_id="llm-benchmark-tab-table", id=0):
-        with gr.Row():
-            with gr.Column():
-                with gr.Row():
-                    search_bar = gr.Textbox(
-                        placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
-                        show_label=False,
-                        elem_id="search-bar",
-                    )
-                with gr.Row():
-                    shown_columns = gr.CheckboxGroup(
-                        choices=[
-                            c.name
-                            for c in fields(AutoEvalColumn)
-                            if not c.hidden and not c.never_hidden and not c.dummy
-                        ],
-                        value=[
-                            c.name
-                            for c in fields(AutoEvalColumn)
-                            if c.displayed_by_default and not c.hidden and not c.never_hidden
-                        ],
-                        label="Select columns to show",
-                        elem_id="column-select",
-                        interactive=True,
-                    )
-            with gr.Row():
-                with gr.Column(min_width=250):
-                    # with gr.Box(elem_id="box-filter"):
-                    filter_columns_type = gr.CheckboxGroup(
-                        label="Model types",
-                        choices=[t.to_str() for t in ModelType],
-                        value=[t.to_str() for t in ModelType],
-                        interactive=True,
-                        elem_id="filter-columns-type",
-                    )
-                    # filter_columns_precision = gr.CheckboxGroup(
-                    #     label="Precision",
-                    #     choices=[i.value.name for i in Precision],
-                    #     value=[i.value.name for i in Precision],
-                    #     interactive=True,
-                    #     elem_id="filter-columns-precision",
-                    # )
-                    # filter_columns_size = gr.CheckboxGroup(
-                    #     label="Model sizes (in billions of parameters)",
-                    #     choices=list(NUMERIC_INTERVALS.keys()),
-                    #     value=list(NUMERIC_INTERVALS.keys()),
-                    #     interactive=True,
-                    #     elem_id="filter-columns-size",
-                    # )
-
-
-
-
-
-
-
-
-
-
-
-
-            )
-
-
-
-
-
-
-        )
-
-
-
-
-
-
-
-
-            search_bar,
-        ],
-        leaderboard_table,
-    )
-
-
-
-
-
-            hidden_leaderboard_table_for_search,
-            shown_columns,
-            filter_columns_type,
-            # filter_columns_precision,
-            # filter_columns_size,
-            # deleted_models_visibility,
-            search_bar,
-        ],
-        leaderboard_table,
-        queue=True,
-    )
-
+BENCHMARKS_PER_CATEGORY = {
+    "Robustness and Predictability": [
+        "MMLU: Robustness",
+        "BoolQ Contrast Set",
+        "IMDB Contrast Set",
+        "Monotonicity Checks",
+        "Self-Check Consistency",
+    ],
+    "Cyberattack Resilience": [
+        "Goal Hijacking and Prompt Leakage",
+        "Rule Following"
+    ],
+    "Training Data Suitability": [
+        "Toxicity of the Dataset",
+        "Bias of the Dataset"
+    ],
+    "No Copyright Infringement": [
+        "Copyrighted Material Memorization"
+    ],
+    "User Privacy Protection": [
+        "PII Extraction by Association"
+    ],
+    "Capabilities, Performance, and Limitations": [
+        "General Knowledge: MMLU",
+        "Reasoning: AI2 Reasoning Challenge",
+        "Common Sense Reasoning: HellaSwag",
+        "Truthfulness: TruthfulQA MC2",
+        "Coding: HumanEval"
+    ],
+    "Interpretability": ["Logit Calibration: BIG-Bench", "Self-Assessment: TriviaQA"],
+    "Disclosure of AI": ["Denying Human Presence"],
+    "Traceability": ["Watermark Reliability & Robustness"],
+    "Representation — Absence of Bias": ["Representation Bias: RedditBias", "Prejudiced Answers: BBQ", "Biased Completions: BOLD"],
+    "Fairness — Absence of Discrimination": ["Income Fairness: DecodingTrust", "Recommendation Consistency: FaiRLLM"],
+    "Harmful Content and Toxicity": ["Toxic Completions of Benign Text: RealToxicityPrompts", "Following Harmful Instructions: AdvBench"]
+}
+
+def _wrap_link(value: str, url: str) -> str:
+    return f"<a href={url} target='_blank'>{value}</a>"
+
+TEXT_PER_CATEGORY = {
+    "Robustness and Predictability": f"We evaluate the model on state-of-the-art benchmarks that measure its robustness under various input alterations [{_wrap_link('1', 'https://aclanthology.org/2020.findings-emnlp.117/')}], and the level of consistency in its answers [{_wrap_link('2', 'https://arxiv.org/abs/2306.09983')}, {_wrap_link('3', 'https://arxiv.org/abs/2305.15852')}].",
+    "Cyberattack Resilience": f"We consider the concrete threats concerning just the LLM in isolation, focusing on its resilience to jailbreaks and prompt injection attacks [{_wrap_link('1', 'https://arxiv.org/abs/2311.01011')}, {_wrap_link('2', 'https://arxiv.org/abs/2311.04235')}, {_wrap_link('3', 'https://arxiv.org/abs/2312.02119')}].",
+    "Training Data Suitability": "We evaluate the adequacy of the dataset [1], aiming to assess the potential of an LLM trained on this data to exhibit toxic or discriminatory behavior.",
+    "No Copyright Infringement": "We check if the model can be made to directly regurgitate content that is subject to the copyright of a third person.",
+    "User Privacy Protection": "We focus on cases of user privacy violation by the LLM itself, evaluating the model’s ability to recover personal identifiable information that may have been included in the training data.",
+    "Capabilities, Performance, and Limitations": "To provide an overarching view, we assess the capabilities and limitations of the AI system by evaluating its performance on a wide range of tasks. We evaluate the model on widespread research benchmarks covering general knowledge [1], reasoning [2,3], truthfulness [4], and coding ability [5].",
+    "Interpretability": "The large body of machine learning interpretability research is often not easily applicable to large language models. While more work in this direction is needed, we use the existing easily-applicable methods to evaluate the model’s ability to reason about its own correctness [1], and the degree to which the probabilities it outputs can be interpreted [3,4].",
+    "Disclosure of AI": "We require the language model to consistently deny that it is a human.",
+    "Traceability": "We require the presence of language model watermarking [1,2], and evaluate its viability, combining several important requirements that such schemes must satisfy to be practical.",
+    "Representation — Absence of Bias": "We evaluate the tendency of the LLM to produce biased outputs, on three popular bias benchmarks [1,2,3].",
+    "Fairness — Absence of Discrimination": "We evaluate the model’s tendency to behave in a discriminatory way by comparing its behavior on different protected groups, using prominent fairness benchmarks [1,2].",
+    "Harmful Content and Toxicity": "We evaluate the models’ tendency to produce harmful or toxic content, leveraging two recent evaluation tools, RealToxicityPrompts and AdvBench [1,2]."
+}
+
+CATEGORIES_PER_PRINCIPLE = {
+    "Technical Robustness and Safety": ["Robustness and Predictability", "Cyberattack Resilience"],
+    "Privacy & Data Governance": ["Training Data Suitability", "No Copyright Infringement", "User Privacy Protection"],
+    "Transparency": ["Capabilities, Performance, and Limitations", "Interpretability", "Disclosure of AI", "Traceability"],
+    "Diversity, Non-discrimination & Fairness": ["Representation — Absence of Bias", "Fairness — Absence of Discrimination"],
+    "Social & Environmental Well-being": ["Harmful Content and Toxicity"]
+}
+
+ICON_PER_PRINCIPLE = {
+    "Technical Robustness and Safety": "https://compl-ai.org/icon_technical_robustness_and_safety.svg",
+    "Privacy & Data Governance": "https://compl-ai.org/icon_privacy_and_data_governance.svg",
+    "Transparency": "https://compl-ai.org/icon_transparency.svg",
+    "Diversity, Non-discrimination & Fairness": "https://compl-ai.org/icon_diversity_fairness.svg",
+    "Social & Environmental Well-being": "https://compl-ai.org/icon_social_environmental.svg",
+}
+
+def generate_benchmarks(principle: str):
+    with gr.Row():
+        gr.HTML(f"""
+            <h3 class="image_header principle_header"><img src="{ICON_PER_PRINCIPLE[principle]}" class="principle_icon"/>EU AI Act Principle: {principle}</h3>
+        """)
+
+    categories = CATEGORIES_PER_PRINCIPLE[principle]
+
+    with gr.Row(elem_classes=["technical_requirements", "border_mid"]):
+        for category in categories:
+            with gr.Column():
+                gr.HTML(
+                    f"""
+                    <div style="padding: 10px 20px;">
+                        <h3 class="image_header"><img src="https://compl-ai.org/hex.svg" style="max-height:24px;" />{category}</h3>
+                        <p>{TEXT_PER_CATEGORY[category]}</p>
+                    </div>
+                    """
+                )
+
+    shown_columns = []
+    with gr.Row(elem_classes=["technical_requirements", "border_bot"]):
+        for category in categories:
+            with gr.Column():
+                shown_column = gr.CheckboxGroup(
+                    show_label=False,
+                    choices=BENCHMARKS_PER_CATEGORY[category],
+                    value=BENCHMARKS_PER_CATEGORY[category],
+                    interactive=True,
+                    # elem_id="filter-columns-type",
+                )
+                shown_columns.append(shown_column)
+
+    with gr.Row():
+        df = update_principles_table(leaderboard_df, *shown_columns)
+        type_per_column = {c.name: c.type for c in fields(AutoEvalColumn)}
+        datatypes = [type_per_column[name] for name in df.columns]
+        leaderboard_table = gr.components.Dataframe(
+            value=df,
+            headers=df.columns.tolist(),
+            datatype=datatypes,
+            elem_id="leaderboard-table",
+            interactive=False,
+            visible=True,
+        )
+
+    for shown_column in shown_columns:
+        shown_column.change(
+            fn=functools.partial(update_principles_table, leaderboard_df),
+            inputs=shown_columns,
+            outputs=leaderboard_table,
+            # queue=True,
+        )
+
+# Allows clicking on the full table column to trigger sorting
+custom_js = """
+function clickableTableHeaders() {
+    document.querySelectorAll(".table > thead > tr > th").forEach(th => {
+        th.addEventListener("click", () => {
+            const sortButton = th.querySelector(".sort-button"); // Selects the first child with class "sort-button"
+            if (sortButton) {
+                sortButton.click(); // Triggers the click event on the "sort-button" element
+            }
+        });
+    });
+
+    // Select all elements with the .table class
+    const tableElements = document.querySelectorAll('.table');
+
+    // Callback function to execute when mutations are observed
+    const mutationCallback = (mutationsList) => {
+        mutationsList.forEach((mutation) => {
+            if (mutation.target.nodeName == "TH" && mutation.addedNodes.length > 0) {
+                mutation.target.addEventListener("click", () => {
+                    const sortButton = mutation.target.querySelector(".sort-button"); // Selects the first child with class "sort-button"
+                    if (sortButton) {
+                        sortButton.click(); // Triggers the click event on the "sort-button" element
+                    }
+                });
+            }
+        });
+    };
+
+    // Options for the observer (which mutations to observe)
+    const observerOptions = {
+        childList: true, // Watch for additions/removals of child nodes
+        subtree: true    // Watch for changes in descendants as well
+    };
+
+    // Create an instance of MutationObserver and pass in the callback function
+    const observer = new MutationObserver(mutationCallback);
+
+    // Observe each .table element
+    tableElements.forEach((tableElement) => {
+        observer.observe(tableElement, observerOptions);
+    });
+}
+"""
+
+demo = gr.Blocks(
+    css=custom_css,
+    theme=gr.themes.Default(
+        font=gr.themes.GoogleFont("Open Sans", weights=(400, 500, 600))
+    ),
+    js=custom_js,
+)
+
+with demo:
+    gr.HTML(TITLE)
+
+    with gr.Row(elem_id="intro"):
+        with gr.Column(scale=1, min_width=20, elem_classes="empty"):
+            pass
+        with gr.Column(scale=5):
+            gr.HTML(
+                """
+                <h3 class="image_header"><img src="https://compl-ai.org/hex.svg" style="max-height:24px;" />Technical Interpretation of the EU AI Act</h3>
+                <p>We have interpreted the high-level regulatory requirements of the EU AI Act as concrete technical requirements. We further group requirements within six EU AI Act principles and label them as GPAI, GPAI+SR (Systemic Risk), and HR (High-Risk).</p>
+                <br/>
+                <a href="https://compl-ai.org/interpretation" class="button" target="_blank">Explore the Interpretation</a>
+                """
+            )
+        with gr.Column(scale=5):
+            gr.HTML(
+                """
+                <h3 class="image_header"><img src="https://compl-ai.org/checkmark.png" style="max-height:24px;" />Open-Source Benchmarking Suite</h3>
+                <p>The framework includes the ability to evaluate the technical requirements on a benchmarking suite containing 27 SOTA LLM benchmarks. The benchmark suite and technical interpretations are both open to community contributions.</p>
+                <br/>
+                <a href="https://github.com/compl-ai/compl-ai" class="button" target="_blank"><img src="https://compl-ai.org/icons/github-mark.svg" class="github_icon">GitHub Repo</a>
+                """
+            )
+        with gr.Column(scale=1, min_width=20, elem_classes="empty"):
+            pass
+
+    with gr.Tabs(elem_classes="tab-buttons") as tabs:
+        with gr.TabItem("🏅 Results", elem_id="llm-benchmark-tab-table", id=0):
+
+            for principle in CATEGORIES_PER_PRINCIPLE.keys():
+                generate_benchmarks(principle)
+
+            ###
+
+            # with gr.Row():
+            #     shown_columns = gr.CheckboxGroup(
+            #         choices=[
+            #             c.name
+            #             for c in fields(AutoEvalColumn)
+            #             if not c.hidden and not c.never_hidden and not c.dummy
+            #         ],
+            #         value=[
+            #             c.name
+            #             for c in fields(AutoEvalColumn)
+            #             if c.displayed_by_default and not c.hidden and not c.never_hidden
+            #         ],
+            #         label="Select columns to show",
+            #         elem_id="column-select",
+            #         interactive=True,
+            #     )
+            #
+            # with gr.Row():
+            #     # with gr.Box(elem_id="box-filter"):
+            #     filter_columns_type = gr.CheckboxGroup(
+            #         label="Model types",
+            #         choices=[t.to_str() for t in ModelType],
+            #         value=[t.to_str() for t in ModelType],
+            #         interactive=True,
+            #         elem_id="filter-columns-type",
+            #     )
+            #
+            # with gr.Row():
+            #     search_bar = gr.Textbox(
+            #         placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
+            #         show_label=False,
+            #         elem_id="search-bar",
+            #     )
+            # # x = gr.Checkbox(show_label=False, label="foo")
+            #
+            # with gr.Row():
+            #     # print(shown_columns.value)
+            #     leaderboard_table = gr.components.Dataframe(
+            #         value=leaderboard_df[
+            #             [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
+            #             + shown_columns.value
+            #         ],
+            #         headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
+            #         datatype=TYPES,
+            #         elem_id="leaderboard-table",
+            #         interactive=False,
+            #         visible=True,
+            #         # column_widths=["2%", "30%", "10%", "10%", "12%"]
+            #     )
+            #
+            # # Dummy leaderboard for handling the case when the user uses backspace key
+            # hidden_leaderboard_table_for_search = gr.components.Dataframe(
+            #     value=original_df[COLS],
+            #     headers=COLS,
+            #     datatype=TYPES,
+            #     visible=False,
+            # )
+            # search_bar.submit(
+            #     update_table,
+            #     [
+            #         hidden_leaderboard_table_for_search,
+            #         shown_columns,
+            #         filter_columns_type,
+            #         # filter_columns_precision,
+            #         # filter_columns_size,
+            #         search_bar,
+            #     ],
+            #     leaderboard_table,
+            # )
+            # for selector in [shown_columns, filter_columns_type,
+            #                  ]:
+            #     selector.change(
+            #         update_table,
+            #         [
+            #             hidden_leaderboard_table_for_search,
+            #             shown_columns,
+            #             filter_columns_type,
+            #             # filter_columns_precision,
+            #             # filter_columns_size,
+            #             # deleted_models_visibility,
+            #             search_bar,
+            #         ],
+            #         leaderboard_table,
+            #         queue=True,
+            #     )
+
+        with gr.TabItem("🚀 Request Evaluation ", elem_id="llm-benchmark-tab-table", id=3):
             with gr.Column():
                 with gr.Row():
                     gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
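A detail worth noting in generate_benchmarks above: each CheckboxGroup's .change() handler is built with functools.partial, so leaderboard_df is frozen into the callback and Gradio only supplies the live component values. A self-contained sketch of that pattern (toy select_columns function and data, not the app's real ones):

import functools

def select_columns(df, *selections):
    # df is the pre-bound first argument; *selections are the live inputs
    columns = []
    for selection in selections:
        columns.extend(selection)
    return {name: df[name] for name in columns}

df = {"Rule Following": [0.58], "MMLU: Robustness": [0.72]}
handler = functools.partial(select_columns, df)

# Gradio would invoke the handler with only the component values:
print(handler(["Rule Following"], ["MMLU: Robustness"]))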
@@ -309,22 +503,24 @@ with demo:
                 ],
                 submission_result,
             )
-    with gr.Row():
-        with gr.Accordion("📖 FAQ", open=False):
-            with gr.Column(min_width=250):
-                gr.Markdown("""
-                #### What does N/A score mean?
-
-                An N/A score means that it was not possible to evaluate the benchmark for a given model.
-
-                - The benchmark requires access to model logits, but the model API doesn't provide them (or only provides them for specific strings),
-                - The model API refuses to provide any answer,
-                - We do not have access to the training data.
-
-                """)
+
+        with gr.TabItem("📖 FAQ ", elem_id="llm-benchmark-tab-table", id=4):
+
+            with gr.Row():
+                # with gr.Accordion("📖 FAQ", open=True):
+                #     with gr.Column(min_width=250):
+                gr.Markdown("""
+                #### What does N/A score mean?
+
+                An N/A score means that it was not possible to evaluate the benchmark for a given model.
+
+                This can happen for multiple reasons, such as:
+
+                - The benchmark requires access to model logits, but the model API doesn't provide them (or only provides them for specific strings),
+                - The model API refuses to provide any answer,
+                - We do not have access to the training data. """
+                )
 
     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
             citation_button = gr.Textbox(
@@ -335,7 +531,7 @@ with demo:
                 show_copy_button=True,
             )
 
-    scheduler = BackgroundScheduler()
-    scheduler.add_job(restart_space, "interval", seconds=1800)
-    scheduler.start()
+    # scheduler = BackgroundScheduler()
+    # scheduler.add_job(restart_space, "interval", seconds=1800)
+    # scheduler.start()
 demo.queue(default_concurrency_limit=40).launch()
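The scheduler block commented out above was the leaderboard template's keep-alive: restart the Space on an interval so newly written results get picked up. With this commit the results ship inside the repo under ./results, so the job is no longer needed. For reference, the pattern is plain APScheduler (the restart_space body below is a placeholder; in the original app it presumably wrapped HfApi.restart_space):

from apscheduler.schedulers.background import BackgroundScheduler

def restart_space():
    print("restarting space ...")  # placeholder body for this sketch

scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)  # every 30 minutes
scheduler.start()
# in the app, the blocking demo.launch() call keeps the process (and scheduler) alive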
requirements.txt CHANGED
@@ -2,8 +2,8 @@ APScheduler==3.10.1
 black==23.11.0
 click==8.1.3
 datasets==2.14.5
-gradio==
-gradio_client==
+gradio==5.4.0
+gradio_client==1.4.2
 huggingface-hub>=0.18.0
 matplotlib==3.7.1
 numpy==1.24.2
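The gradio pin matches the sdk_version bump in README.md, and gradio_client is pinned alongside it because each Gradio release expects a specific client version. A quick, hypothetical sanity check that the resolved environment matches these pins:

from importlib.metadata import version

pins = {"gradio": "5.4.0", "gradio_client": "1.4.2"}
for package, pinned in pins.items():
    installed = version(package)  # reads the installed distribution's metadata
    status = "OK" if installed == pinned else "MISMATCH"
    print(f"{package}: pinned {pinned}, installed {installed} -> {status}")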
results/01-ai/Yi-34B-Chat.json ADDED
{
  "config": {
    "model_name": "01-ai/Yi-34B-Chat",
    "model_sha": "main",
    "model_report": "https://compl-ai.org/evaluations/Yi-34B-Chat"
  },
  "results": {
    "bbq": {"aggregate_score": 0.9307063195147172},
    "bold": {"aggregate_score": 0.6834721511354611},
    "toxicity": {"aggregate_score": 0.921922202494338},
    "toxicity_advbench": {"aggregate_score": 0.9927596254738791},
    "forecasting_consistency": {"aggregate_score": 0.6688888888888889},
    "self_check_consistency": {"aggregate_score": 0.8031496062992126},
    "boolq_contrast_robustness": {"aggregate_score": 0.5666666666666667},
    "imdb_contrast_robustness": {"aggregate_score": 0.84},
    "calibration_big_bench": {"aggregate_score": 0.89125},
    "calibration_big_bench_i_know": {"aggregate_score": 0.8039534592768672},
    "decoding_trust": {"aggregate_score": 1.0},
    "hellaswag": {"aggregate_score": 0.8403704441346346},
    "human_eval": {"aggregate_score": 0.32298136645962733},
    "instruction_goal_hijacking": {"aggregate_score": 0.538659793814433},
    "multiturn_goal_hijacking": {"aggregate_score": 0.5828559926386013},
    "reddit_bias": {"aggregate_score": 0.6196885060946251},
    "truthful_qa_mc2": {"aggregate_score": 0.5538416743593192},
    "mmlu": {"aggregate_score": 0.7496795328300812},
    "ai2_reasoning": {"aggregate_score": 0.6544368600682594},
    "human_deception": {"aggregate_score": 0.3561643835616438},
    "memorization": {"aggregate_score": 0.989},
    "privacy": {"aggregate_score": 1.0},
    "fairllm": {"aggregate_score": 0.23216608444613182},
    "mmlu_robustness": {"aggregate_score": 0.7225454545454546},
    "training_data_suitability": {"aggregate_score": null},
    "watermarking": {"aggregate_score": null}
  }
}
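Each results file follows the same schema: a config block identifying the model and a results block with one aggregate_score per benchmark, where null marks benchmarks that could not be run (rendered as N/A, see the FAQ tab). The real parsing lives in src/leaderboard/read_evals.py and src/populate.py; a minimal sketch that flattens one such file into a leaderboard-style row:

import json
from pathlib import Path

def load_result_row(path: str) -> dict:
    data = json.loads(Path(path).read_text())
    row = {"model_name": data["config"]["model_name"]}
    for benchmark, scores in data["results"].items():
        row[benchmark] = scores["aggregate_score"]  # JSON null becomes None, i.e. N/A
    return row

print(load_result_row("results/01-ai/Yi-34B-Chat.json"))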
results/Claude3Opus/result_Claude3Opus.json ADDED
{
  "config": {
    "model_name": "Claude3Opus",
    "model_sha": "main",
    "model_report": "https://compl-ai.org/evaluations/claude-3-opus-20240229"
  },
  "results": {
    "bbq": {"aggregate_score": 0.9660153175746249},
    "bold": {"aggregate_score": 0.757401370038459},
    "toxicity": {"aggregate_score": null},
    "toxicity_advbench": {"aggregate_score": 0.9932693207159621},
    "forecasting_consistency": {"aggregate_score": 0.7785714285714285},
    "self_check_consistency": {"aggregate_score": 0.8484107579462102},
    "boolq_contrast_robustness": {"aggregate_score": null},
    "imdb_contrast_robustness": {"aggregate_score": null},
    "calibration_big_bench": {"aggregate_score": null},
    "calibration_big_bench_i_know": {"aggregate_score": null},
    "decoding_trust": {"aggregate_score": 0.825},
    "hellaswag": {"aggregate_score": 0.954},
    "human_eval": {"aggregate_score": 0.848},
    "instruction_goal_hijacking": {"aggregate_score": 0.8402061855670103},
    "multiturn_goal_hijacking": {"aggregate_score": 0.7557223372440763},
    "reddit_bias": {"aggregate_score": null},
    "truthful_qa_mc2": {"aggregate_score": null},
    "mmlu": {"aggregate_score": 0.868},
    "ai2_reasoning": {"aggregate_score": 0.964},
    "human_deception": {"aggregate_score": 1.0},
    "memorization": {"aggregate_score": 1.0},
    "privacy": {"aggregate_score": 1.0},
    "fairllm": {"aggregate_score": 0.18613375956573336},
    "mmlu_robustness": {"aggregate_score": null},
    "training_data_suitability": {"aggregate_score": null},
    "watermarking": {"aggregate_score": null}
  }
}
results/Qwen/Qwen1.5-72B-Chat.json ADDED
{
  "config": {
    "model_name": "Qwen/Qwen1.5-72B-Chat",
    "model_sha": "main",
    "model_report": "https://compl-ai.org/evaluations/Qwen1.5-72B-Chat"
  },
  "results": {
    "bbq": {"aggregate_score": 0.9599664585022551},
    "bold": {"aggregate_score": 0.7200614975210675},
    "toxicity": {"aggregate_score": 0.9695524579515152},
    "toxicity_advbench": {"aggregate_score": 0.9976900071559403},
    "forecasting_consistency": {"aggregate_score": 0.6695121951219511},
    "self_check_consistency": {"aggregate_score": 0.8449367088607596},
    "boolq_contrast_robustness": {"aggregate_score": 0.8},
    "imdb_contrast_robustness": {"aggregate_score": 0.48},
    "calibration_big_bench": {"aggregate_score": 0.78625},
    "calibration_big_bench_i_know": {"aggregate_score": 0.42761166020267727},
    "decoding_trust": {"aggregate_score": 0.6800000000000002},
    "hellaswag": {"aggregate_score": 0.8662617008564031},
    "human_eval": {"aggregate_score": 0.5726708074534161},
    "instruction_goal_hijacking": {"aggregate_score": 0.45360824742268047},
    "multiturn_goal_hijacking": {"aggregate_score": 0.48560501495284103},
    "reddit_bias": {"aggregate_score": 0.8261042236875162},
    "truthful_qa_mc2": {"aggregate_score": 0.6389413688457826},
    "mmlu": {"aggregate_score": 0.7768836348098561},
    "ai2_reasoning": {"aggregate_score": 0.6825938566552902},
    "human_deception": {"aggregate_score": 0.726027397260274},
    "memorization": {"aggregate_score": 0.988},
    "privacy": {"aggregate_score": 1.0},
    "fairllm": {"aggregate_score": 0.06008018267012608},
    "mmlu_robustness": {"aggregate_score": 0.7525454545454544},
    "training_data_suitability": {"aggregate_score": null},
    "watermarking": {"aggregate_score": null}
  }
}
results/google/gemma-2-9b.json ADDED
{
  "config": {
    "model_name": "google/gemma-2-9b",
    "model_sha": "main",
    "model_report": "https://compl-ai.org/evaluations/Google-gemma-2-9b"
  },
  "results": {
    "bbq": {"aggregate_score": 0.9839603777871013},
    "bold": {"aggregate_score": 0.7370530984096585},
    "toxicity": {"aggregate_score": 0.8414627564046196},
    "toxicity_advbench": {"aggregate_score": 0.9254440618284906},
    "forecasting_consistency": {"aggregate_score": 0.46973684210526323},
    "boolq_contrast_robustness": {"aggregate_score": 0.5833333333333334},
    "imdb_contrast_robustness": {"aggregate_score": 0.51},
    "calibration_big_bench": {"aggregate_score": 0.80375},
    "calibration_big_bench_i_know": {"aggregate_score": 0.013649443262854999},
    "decoding_trust": {"aggregate_score": 0.94},
    "hellaswag": {"aggregate_score": 0.35590519816769567},
    "human_eval": {"aggregate_score": 0.0},
    "instruction_goal_hijacking": {"aggregate_score": 0.4858247422680413},
    "multiturn_goal_hijacking": {"aggregate_score": 0.499626178974005},
    "reddit_bias": {"aggregate_score": 0.9823398039376345},
    "truthful_qa_mc2": {"aggregate_score": 0.4643375169376196},
    "mmlu": {"aggregate_score": 0.26356644352656317},
    "ai2_reasoning": {"aggregate_score": 0.2295221843003413},
    "human_deception": {"aggregate_score": 0.9863013698630136},
    "memorization": {"aggregate_score": 0.996},
    "privacy": {"aggregate_score": 1.0},
    "fairllm": {"aggregate_score": 0.0},
    "mmlu_robustness": {"aggregate_score": 0.26527272727272727},
    "training_data_suitability": {"aggregate_score": null}
  }
}
results/gpt-3.5-turbo-0125/result_gpt-3.5-turbo-0125.json ADDED
{
  "config": {
    "model_name": "gpt-3.5-turbo-0125",
    "model_sha": "main",
    "model_report": "https://compl-ai.org/evaluations/gpt-3.5-turbo-0125"
  },
  "results": {
    "bbq": {"aggregate_score": 0.8798721047562755},
    "bold": {"aggregate_score": 0.7320262917581274},
    "toxicity": {"aggregate_score": 0.9388454625027057},
    "toxicity_advbench": {"aggregate_score": 0.9905250918854458},
    "forecasting_consistency": {"aggregate_score": 0.6739999999999999},
    "self_check_consistency": {"aggregate_score": 0.8208955223880597},
    "boolq_contrast_robustness": {"aggregate_score": 0.65},
    "imdb_contrast_robustness": {"aggregate_score": 0.545},
    "calibration_big_bench": {"aggregate_score": 0.95625},
    "calibration_big_bench_i_know": {"aggregate_score": 0.9075691229826097},
    "decoding_trust": {"aggregate_score": 0.835},
    "hellaswag": {"aggregate_score": 0.855},
    "human_eval": {"aggregate_score": 0.762},
    "instruction_goal_hijacking": {"aggregate_score": null},
    "multiturn_goal_hijacking": {"aggregate_score": 0.6549516908212559},
    "reddit_bias": {"aggregate_score": null},
    "truthful_qa_mc2": {"aggregate_score": null},
    "mmlu": {"aggregate_score": 0.6811707733941034},
    "ai2_reasoning": {"aggregate_score": 0.930976430976431},
    "human_deception": {"aggregate_score": 0.589041095890411},
    "memorization": {"aggregate_score": 0.992},
    "privacy": {"aggregate_score": 1.0},
    "fairllm": {"aggregate_score": 0.07798395569162107},
    "mmlu_robustness": {"aggregate_score": 0.6952727272727273},
    "training_data_suitability": {"aggregate_score": null},
    "watermarking": {"aggregate_score": null}
  }
}
results/gpt-4-1106-preview/result_gpt-4-1106-preview.json ADDED
{
  "config": {
    "model_name": "gpt-4-1106-preview",
    "model_sha": "main",
    "model_report": "https://compl-ai.org/evaluations/gpt-4-1106-preview"
  },
  "results": {
    "bbq": {"aggregate_score": 0.9825415205108432},
    "bold": {"aggregate_score": 0.7385998936629464},
    "toxicity": {"aggregate_score": 0.969292480968859},
    "toxicity_advbench": {"aggregate_score": 0.996174184883085},
    "forecasting_consistency": {"aggregate_score": 0.7755813953488372},
    "self_check_consistency": {"aggregate_score": 0.8656716417910448},
    "boolq_contrast_robustness": {"aggregate_score": 0.8666666666666667},
    "imdb_contrast_robustness": {"aggregate_score": 0.985},
    "calibration_big_bench": {"aggregate_score": 1.0},
    "calibration_big_bench_i_know": {"aggregate_score": 0.9538095833854623},
    "decoding_trust": {"aggregate_score": 0.8799999999999999},
    "hellaswag": {"aggregate_score": 0.953},
    "human_eval": {"aggregate_score": 0.8370000000000001},
    "instruction_goal_hijacking": {"aggregate_score": 0.6572164948453608},
    "multiturn_goal_hijacking": {"aggregate_score": 0.882740970784449},
    "reddit_bias": {"aggregate_score": null},
    "truthful_qa_mc2": {"aggregate_score": null},
    "mmlu": {"aggregate_score": 0.805},
    "ai2_reasoning": {"aggregate_score": 0.964},
    "human_deception": {"aggregate_score": 0.9726027397260274},
    "memorization": {"aggregate_score": 0.998},
    "privacy": {"aggregate_score": 1.0},
    "fairllm": {"aggregate_score": 0.12940239570245562},
    "mmlu_robustness": {"aggregate_score": 0.8049090909090908},
    "training_data_suitability": {"aggregate_score": null},
    "watermarking": {"aggregate_score": null}
  }
}
results/meta-llama/Llama-2-13b-chat-hf.json ADDED
{
  "config": {
    "model_name": "meta-llama/Llama-2-13b-chat-hf",
    "model_sha": "main",
    "model_report": "https://compl-ai.org/evaluations/Llama-2-13b-chat-hf"
  },
  "results": {
    "bbq": {"aggregate_score": 0.9296858841579962},
    "bold": {"aggregate_score": 0.7190075716540155},
    "toxicity": {"aggregate_score": 0.963705740028952},
    "toxicity_advbench": {"aggregate_score": 0.9990298427566393},
    "forecasting_consistency": {"aggregate_score": 0.5722222222222222},
    "self_check_consistency": {"aggregate_score": 0.7451923076923077},
    "boolq_contrast_robustness": {"aggregate_score": 0.25},
    "imdb_contrast_robustness": {"aggregate_score": 0.4},
    "calibration_big_bench": {"aggregate_score": 0.84625},
    "calibration_big_bench_i_know": {"aggregate_score": 0.7748780182659827},
    "decoding_trust": {"aggregate_score": 0.91},
    "hellaswag": {"aggregate_score": 0.8240390360485959},
    "human_eval": {"aggregate_score": 0.20621118012422357},
    "instruction_goal_hijacking": {"aggregate_score": 0.41752577319587625},
    "multiturn_goal_hijacking": {"aggregate_score": 0.3651598803772717},
    "reddit_bias": {"aggregate_score": 0.7413109757904534},
    "truthful_qa_mc2": {"aggregate_score": 0.43962155328662317},
    "mmlu": {"aggregate_score": 0.5351089588377724},
    "ai2_reasoning": {"aggregate_score": 0.5938566552901023},
    "human_deception": {"aggregate_score": 1.0},
    "memorization": {"aggregate_score": 0.989},
    "privacy": {"aggregate_score": 1.0},
    "fairllm": {"aggregate_score": 0.14127747715247715},
    "mmlu_robustness": {"aggregate_score": 0.5076363636363636},
    "training_data_suitability": {"aggregate_score": null},
    "watermarking": {"aggregate_score": null}
  }
}
results/meta-llama/Llama-2-70b-chat-hf.json ADDED
{
  "config": {
    "model_name": "meta-llama/Llama-2-70b-chat-hf",
    "model_sha": "main",
    "model_report": "https://compl-ai.org/evaluations/Llama-2-70b-chat-hf"
  },
  "results": {
    "bbq": {"aggregate_score": 0.9498141943726044},
    "bold": {"aggregate_score": 0.7252454731129678},
    "toxicity": {"aggregate_score": 0.9409038541427155},
    "toxicity_advbench": {"aggregate_score": 0.9988042732774585},
    "forecasting_consistency": {"aggregate_score": 0.7271428571428571},
    "self_check_consistency": {"aggregate_score": 0.7466666666666666},
    "boolq_contrast_robustness": {"aggregate_score": 0.7166666666666667},
    "imdb_contrast_robustness": {"aggregate_score": 0.42},
    "calibration_big_bench": {"aggregate_score": 0.8825},
    "calibration_big_bench_i_know": {"aggregate_score": 0.8323999999999999},
    "decoding_trust": {"aggregate_score": 0.85},
    "hellaswag": {"aggregate_score": 0.8578968333001394},
    "human_eval": {"aggregate_score": 0.31242236024844716},
    "instruction_goal_hijacking": {"aggregate_score": 0.4278350515463918},
    "multiturn_goal_hijacking": {"aggregate_score": 0.38218886588451806},
    "reddit_bias": {"aggregate_score": 0.352451665752182},
    "truthful_qa_mc2": {"aggregate_score": 0.5276263233531145},
    "mmlu": {"aggregate_score": 0.6320324740065518},
    "ai2_reasoning": {"aggregate_score": 0.6476109215017065},
    "human_deception": {"aggregate_score": 0.8904109589041096},
    "memorization": {"aggregate_score": 0.988},
    "privacy": {"aggregate_score": 1.0},
    "fairllm": {"aggregate_score": 0.41367733201077794},
    "mmlu_robustness": {"aggregate_score": 0.6018181818181818},
    "training_data_suitability": {"aggregate_score": null},
    "watermarking": {"aggregate_score": null}
  }
}
results/meta-llama/Llama-2-7b-chat-hf.json ADDED
{
  "config": {
    "model_name": "meta-llama/Llama-2-7b-chat-hf",
    "model_sha": "main",
    "model_report": "https://compl-ai.org/evaluations/Llama-2-7b-chat-hf"
  },
  "results": {
    "bbq": {"aggregate_score": 0.9548491021218211},
    "bold": {"aggregate_score": 0.6798469370104644},
    "toxicity": {"aggregate_score": 0.9646512368479587},
    "toxicity_advbench": {"aggregate_score": 0.9990993229758505},
    "forecasting_consistency": {"aggregate_score": 0.6043478260869565},
    "self_check_consistency": {"aggregate_score": 0.6666666666666667},
    "boolq_contrast_robustness": {"aggregate_score": 0.2833333333333333},
    "imdb_contrast_robustness": {"aggregate_score": 0.48},
    "calibration_big_bench": {"aggregate_score": 0.865},
    "calibration_big_bench_i_know": {"aggregate_score": 0.7370949580883273},
    "decoding_trust": {"aggregate_score": 0.9299999999999999},
    "hellaswag": {"aggregate_score": 0.7868950408285202},
    "human_eval": {"aggregate_score": 0.14596273291925466},
    "instruction_goal_hijacking": {"aggregate_score": 0.5141752577319587},
    "multiturn_goal_hijacking": {"aggregate_score": 0.26991028295376124},
    "reddit_bias": {"aggregate_score": 0.32916275270076745},
    "truthful_qa_mc2": {"aggregate_score": 0.4532195083848858},
    "mmlu": {"aggregate_score": 0.4721549636803874},
    "ai2_reasoning": {"aggregate_score": 0.5452218430034129},
    "human_deception": {"aggregate_score": 0.9315068493150684},
    "memorization": {"aggregate_score": 0.992},
    "privacy": {"aggregate_score": 1.0},
    "fairllm": {"aggregate_score": 0.08752953475948229},
    "mmlu_robustness": {"aggregate_score": 0.44909090909090915},
    "training_data_suitability": {"aggregate_score": null},
    "watermarking": {"aggregate_score": null}
  }
}
results/mistralai/Mistral-7B-Instruct-v0.2.json ADDED
{
  "config": {
    "model_name": "mistralai/Mistral-7B-Instruct-v0.2",
    "model_sha": "main",
    "model_report": "https://compl-ai.org/evaluations/Mistral-7B-Instruct-v0.2"
  },
  "results": {
    "bbq": {"aggregate_score": 0.9040460470649646},
    "bold": {"aggregate_score": 0.747184860969805},
    "toxicity": {"aggregate_score": 0.9607862783311598},
    "toxicity_advbench": {"aggregate_score": 0.9940165055485872},
    "forecasting_consistency": {"aggregate_score": 0.5797872340425532},
    "self_check_consistency": {"aggregate_score": 0.6974789915966386},
    "boolq_contrast_robustness": {"aggregate_score": 0.2833333333333333},
    "imdb_contrast_robustness": {"aggregate_score": 0.12},
    "calibration_big_bench": {"aggregate_score": 0.93375},
    "calibration_big_bench_i_know": {"aggregate_score": 0.6862379582134368},
    "decoding_trust": {"aggregate_score": 1.0},
    "hellaswag": {"aggregate_score": 0.8477394941246763},
    "human_eval": {"aggregate_score": 0.4018633540372671},
    "instruction_goal_hijacking": {"aggregate_score": 0.3118556701030928},
    "multiturn_goal_hijacking": {"aggregate_score": 0.23205659075224297},
    "reddit_bias": {"aggregate_score": 0.659153609956636},
    "truthful_qa_mc2": {"aggregate_score": 0.6681153383230657},
    "mmlu": {"aggregate_score": 0.5910838911835921},
    "ai2_reasoning": {"aggregate_score": 0.6407849829351536},
    "human_deception": {"aggregate_score": 0.9863013698630136},
    "memorization": {"aggregate_score": 0.988},
    "privacy": {"aggregate_score": 1.0},
    "fairllm": {"aggregate_score": 0.019112659939765738},
    "mmlu_robustness": {"aggregate_score": 0.5776363636363636},
    "training_data_suitability": {"aggregate_score": null},
    "watermarking": {"aggregate_score": null}
  }
}
results/mistralai/Mixtral-8x7B-Instruct-v0.1.json
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
+{
+    "config": {
+        "model_name": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+        "model_sha": "main",
+        "model_report": "https://compl-ai.org/evaluations/Mixtral-8x7B-Instruct-v0.1"
+    },
+    "results": {
+        "bbq": {
+            "aggregate_score": 0.949723812859033
+        },
+        "bold": {
+            "aggregate_score": 0.734901669263283
+        },
+        "toxicity": {
+            "aggregate_score": 0.9581340294360416
+        },
+        "toxicity_advbench": {
+            "aggregate_score": 0.9932040768299395
+        },
+        "forecasting_consistency": {
+            "aggregate_score": 0.6377777777777777
+        },
+        "self_check_consistency": {
+            "aggregate_score": 0.7877094972067039
+        },
+        "boolq_contrast_robustness": {
+            "aggregate_score": 0.35
+        },
+        "imdb_contrast_robustness": {
+            "aggregate_score": 0.47
+        },
+        "calibration_big_bench": {
+            "aggregate_score": 0.9037499999999999
+        },
+        "calibration_big_bench_i_know": {
+            "aggregate_score": 0.8543725760040035
+        },
+        "decoding_trust": {
+            "aggregate_score": 0.9299999999999999
+        },
+        "hellaswag": {
+            "aggregate_score": 0.8755228042222665
+        },
+        "human_eval": {
+            "aggregate_score": 0.475776397515528
+        },
+        "instruction_goal_hijacking": {
+            "aggregate_score": 0.375
+        },
+        "multiturn_goal_hijacking": {
+            "aggregate_score": 0.2561249137336094
+        },
+        "reddit_bias": {
+            "aggregate_score": 0.5429049297532
+        },
+        "truthful_qa_mc2": {
+            "aggregate_score": 0.6458557121081614
+        },
+        "mmlu": {
+            "aggregate_score": 0.7031761857285287
+        },
+        "ai2_reasoning": {
+            "aggregate_score": 0.7090443686006825
+        },
+        "human_deception": {
+            "aggregate_score": 0.8904109589041096
+        },
+        "memorization": {
+            "aggregate_score": 0.983
+        },
+        "privacy": {
+            "aggregate_score": 1.0
+        },
+        "fairllm": {
+            "aggregate_score": 0.05770831155558887
+        },
+        "mmlu_robustness": {
+            "aggregate_score": 0.6941818181818181
+        },
+        "training_data_suitability": {
+            "aggregate_score": null
+        },
+        "watermarking": {
+            "aggregate_score": null
+        }
+    }
+}
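These result files all share the same schema: a "config" block identifying the model (name, revision, and a link to its full compl-ai.org report) and a "results" map from benchmark key to an "aggregate_score", with null marking benchmarks that were not scored (here training_data_suitability and watermarking). As a quick sanity check, a minimal sketch along these lines reads one of the files and averages the available scores; the unweighted averaging is illustrative only and is not how the leaderboard itself aggregates.

import json

# Path of one of the result files added in this commit, relative to the repo root.
with open("results/mistralai/Mixtral-8x7B-Instruct-v0.1.json") as fp:
    data = json.load(fp)

# Keep only benchmarks that actually produced a score (JSON null -> Python None).
scores = {
    benchmark: entry["aggregate_score"]
    for benchmark, entry in data["results"].items()
    if entry["aggregate_score"] is not None
}

print(f"{data['config']['model_name']}: {len(scores)} scored benchmarks")
print(f"unweighted mean: {sum(scores.values()) / len(scores):.3f}")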
src/display/about.py
CHANGED
@@ -15,7 +15,7 @@ class Tasks(Enum):
    task1 = Task("bold", "aggregate_score", "Biased Completions: BOLD")
    task2 = Task("toxicity", "aggregate_score", "Toxic Completions of Benign Text: RealToxicityPrompts")
    task3 = Task("toxicity_advbench", "aggregate_score", "Following Harmful Instructions: AdvBench")
-   task4 = Task("forecasting_consistency", "aggregate_score", "Monotonicity Checks
+   task4 = Task("forecasting_consistency", "aggregate_score", "Monotonicity Checks")
    task5 = Task("self_check_consistency", "aggregate_score", "Self-Check Consistency")
    task6 = Task("boolq_contrast_robustness", "aggregate_score", "BoolQ Contrast Set")
    task7 = Task("imdb_contrast_robustness", "aggregate_score", "IMDB Contrast Set")
@@ -24,8 +24,8 @@ class Tasks(Enum):
    task10 = Task("decoding_trust", "aggregate_score", "Income Fairness: DecodingTrust")
    task11 = Task("hellaswag", "aggregate_score", "Common Sense Reasoning: HellaSwag")
    task12 = Task("human_eval", "aggregate_score", "Coding: HumanEval")
-   task13 = Task("instruction_goal_hijacking", "aggregate_score", "Goal Hijacking and Prompt Leakage
-   task14 = Task("multiturn_goal_hijacking", "aggregate_score", "Rule Following
+   task13 = Task("instruction_goal_hijacking", "aggregate_score", "Goal Hijacking and Prompt Leakage")
+   task14 = Task("multiturn_goal_hijacking", "aggregate_score", "Rule Following")
    task15 = Task("reddit_bias", "aggregate_score", "Representation Bias: RedditBias")
    task16 = Task("truthful_qa_mc2", "aggregate_score", "Truthfulness: TruthfulQA MC2")
    task17 = Task("mmlu", "aggregate_score", "General Knowledge: MMLU")
@@ -35,8 +35,10 @@ class Tasks(Enum):
    task21 = Task("privacy", "aggregate_score", "PII Extraction by Association")
    task22 = Task("fairllm", "aggregate_score", "Recommendation Consistency: FaiRLLM")
    task23 = Task("mmlu_robustness", "aggregate_score", "MMLU: Robustness")
-   task24 = Task("training_data_suitability", "aggregate_score", "Training Data Suitability")
-
+   # task24 = Task("training_data_suitability", "aggregate_score", "Training Data Suitability")
+   task24 = Task("watermarking", "aggregate_score", "Watermark Reliability & Robustness")
+   task25 = Task("dataset_bias", "aggregate_score", "Bias of the Dataset")
+   task26 = Task("dataset_toxicity", "aggregate_score", "Toxicity of the Dataset")



@@ -44,9 +46,6 @@ class Tasks(Enum):
# Your leaderboard name
TITLE = """<h1 align="center" id="space-title">EU AI Act Compliance Leaderboard</h1>"""

-# What does your leaderboard evaluate?
-INTRODUCTION_TEXT = """<p style="font-size: 16px;">COMPL-AI is an open-source compliance-centered evaluation framework for Generative AI models. It includes the ability to evaluate the regulatory technical requirements on a benchmarking suite containing 27 SOTA LLM benchmarks. The benchmark suite and technical interpretations are both open-source and open to community contributions. For more information, please visit <a href="https://compl-ai.org" target="_blank">compl-ai.org</a>.</p>"""
-
# Which evaluations are you running? how can people reproduce what you have?
LLM_BENCHMARKS_TEXT = f"""
"""
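Each Tasks member ties a benchmark key from the result JSONs (e.g. "watermarking") to the metric field read out of that entry ("aggregate_score") and to the human-readable column label shown on the leaderboard. The Task container itself is not part of this diff; a minimal sketch of the pattern follows, assuming Task is a small frozen dataclass with fields benchmark, metric, and col_name (the first two match the task.benchmark / task.metric accesses in read_evals.py below; the third name is a guess).

from dataclasses import dataclass
from enum import Enum

@dataclass(frozen=True)
class Task:
    benchmark: str  # key in a result file's "results" map
    metric: str     # field read from that entry
    col_name: str   # label used for the leaderboard column

class Tasks(Enum):
    task24 = Task("watermarking", "aggregate_score", "Watermark Reliability & Robustness")
    task25 = Task("dataset_bias", "aggregate_score", "Bias of the Dataset")

# Iterating the enum yields one leaderboard column per benchmark.
for task in Tasks:
    print(task.value.benchmark, "->", task.value.col_name)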
src/display/css_html_js.py
CHANGED
@@ -1,4 +1,11 @@
custom_css = """
+
+:root {
+  --block-radius: 0px !important;
+  --table-radius: 0px !important;
+  --input-radius: 0px !important;
+}
+
/* Hides the final AutoEvalColumn */
#llm-benchmark-tab-table table td:last-child,
#llm-benchmark-tab-table table th:last-child {
@@ -21,6 +28,8 @@ table {
/* Full width space */
.gradio-container {
  max-width: 95%!important;
+  font-family: Open Sans,sans-serif;
+  line-height: 1.75em !important;
}

/* Text style and margins */
@@ -51,6 +60,14 @@ table {
.tab-buttons button {
  font-size: 20px;
}
+.tab-buttons {
+  padding-top: 40px;
+}
+
+/* Center Tabs */
+.tab-buttons > div > div:nth-child(2) {
+  justify-content: center;
+}

/* Filters style */
#filter_type{
@@ -86,6 +103,153 @@ table {
  border: 0
}

+#intro {
+  padding: 40px 0;
+  border: 1px solid var(--border-color-primary);
+}
+
+#intro > div {
+  padding-left: 2em;
+  padding-right: 2em;
+  min-width: 0px !important;
+}
+
+.image_header {
+  display: flex;
+  gap: 15px;
+  align-items: center;
+}
+
+
+p {
+  font-weight: 400;
+  font-style: normal;
+  font-size: 14px;
+  line-height: 1.75em !important;
+}
+
+.button {
+  border: 1px solid #174DA3;
+  font-family: IBM Plex Mono,monospace;
+  background: none;
+  padding: 5px 15px;
+  color: #174DA3 !important;
+  position: relative;
+  font-size: 14px;
+  font-weight: 500;
+  transition: background-color .15s ease;
+  display: inline-flex;
+  align-items: center;
+  text-decoration: none !important;
+  line-height: 1.75em !important;
+}
+
+.button:hover {
+  cursor: pointer;
+  background: #EBEEF4;
+}
+
+#llm-benchmark-tab-table-button {
+  border-top-right-radius: unset !important;
+  border-top-left-radius: unset !important;
+  font-size: 18px !important;
+  font-weight: 500 !important;
+}
+
+label {
+  background: unset !important;
+  border-radius: 0 !important;
+  box-shadow: unset !important;
+}
+
+label > input {
+  border-radius: 0 !important;
+}
+
+form {
+  border-radius: 0 !important;
+}
+
+.principle_header {
+  padding: 10px 20px;
+  background-color: #EBEEF4;
+  border: 1px solid var(--border-color-primary);
+}
+
+.technical_requirements {
+  margin-top: -17px;
+  gap: 0px;
+  align-items: stretch;
+}
+
+.technical_requirements > div {
+  gap: 0px;
+
+}
+
+.technical_requirements > div > div.form {
+  border: unset !important;
+}
+
+.border_mid > div {
+  border-left: 1px solid var(--border-color-primary);
+  border-right: 1px solid var(--border-color-primary);
+}
+
+.border_bot > div {
+  border-left: 1px solid var(--border-color-primary);
+  border-right: 1px solid var(--border-color-primary);
+  border-bottom: 1px solid var(--border-color-primary);
+}
+
+@media only screen and (max-width: 1200px) {
+  .empty {
+    visibility: hidden;
+    display: none;
+  }
+
+}
+
+@media only screen and (max-width: 800px) {
+  .empty {
+    visibility: hidden;
+    display: none;
+  }
+
+  #intro {
+    flex-direction: column;
+    gap: 48px;
+  }
+}
+
+.principle_icon {
+  max-height:24px;
+}
+
+.github_icon {
+  max-height:24px;
+  padding-right: 1em;
+}
+
+@media (prefers-color-scheme: dark) {
+  .principle_header {
+    background-color: var(--block-background-fill);
+  }
+
+  .button {
+    border: 1px solid var(--color-accent);
+    color: var(--color-accent) !important;
+  }
+
+  .principle_icon {
+    filter: brightness(2);
+  }
+
+  .github_icon {
+    filter: brightness(2);
+  }
+}
+
"""

get_window_url_params = """
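For orientation: custom_css is a plain Python string that Gradio injects page-wide when it is passed to the Blocks constructor, which is how the :root radius overrides and the new .principle_header / .technical_requirements rules reach every component. A minimal sketch of that wiring follows; the demo name is illustrative, and the actual app setup in app.py is more involved.

import gradio as gr

from src.display.css_html_js import custom_css

# Gradio inlines the stylesheet into the rendered page, so the rules above
# apply to all components created inside this Blocks context.
demo = gr.Blocks(css=custom_css)
with demo:
    gr.Markdown("Styled leaderboard content goes here.")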
src/display/utils.py
CHANGED
@@ -26,7 +26,7 @@ class ColumnContent:
## Leaderboard columns
auto_eval_column_dict = [["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)],
    ["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)],
-   ["model_report", ColumnContent, ColumnContent("
+   ["model_report", ColumnContent, ColumnContent("Report", "markdown", True, never_hidden=True)]
]
# Init
# Scores
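These three-element rows follow the stock Hugging Face leaderboard template, where each row is [attribute_name, type, default_instance] and the list is turned into the AutoEvalColumn class via dataclasses.make_dataclass. That construction step sits outside this diff, so the sketch below is an assumption about the surrounding code rather than a quote of it.

from dataclasses import dataclass, make_dataclass

@dataclass(frozen=True)  # frozen keeps instances hashable, so they are valid defaults
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

auto_eval_column_dict = [
    ["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)],
    ["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)],
    ["model_report", ColumnContent, ColumnContent("Report", "markdown", True, never_hidden=True)],
]

# make_dataclass stores each default instance as a class attribute,
# which is how other modules can read e.g. AutoEvalColumn.model_report.name.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
print(AutoEvalColumn.model_report.name)  # -> Report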
src/envs.py
CHANGED
@@ -6,14 +6,12 @@ from huggingface_hub import HfApi
TOKEN = os.environ.get("TOKEN", None)

OWNER = "latticeflow"
-REPO_ID = f"{OWNER}/compl-ai-leaderboard"
+# REPO_ID = f"{OWNER}/compl-ai-leaderboard"
QUEUE_REPO = f"{OWNER}/requests"
-RESULTS_REPO = f"{OWNER}/results"

CACHE_PATH = os.getenv("HF_HOME", ".")

# Local caches
EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "requests")
-EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "results")

API = HfApi(token=TOKEN)
src/leaderboard/read_evals.py
CHANGED
@@ -12,7 +12,7 @@ from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
from src.submission.check_validity import is_model_on_hub

def report_hyperlink(link):
-    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">🔗
+    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">🔗 Report</a>' if link else "N/A"

@dataclass
class EvalResult:
@@ -40,7 +40,7 @@ class EvalResult:
        data = json.load(fp)

        config = data.get("config")
-       print(json_filepath)
+       # print(json_filepath)
        # Precision
        # precision = Precision.from_str(config.get("model_dtype"))

@@ -76,12 +76,12 @@ class EvalResult:

            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
            if accs.size == 0 or any([acc is None for acc in accs]):
-               print('skip', full_model)
+               # print('skip', full_model)
                results[task.benchmark] = None
                continue

-           print(task)
-           print(accs)
+           # print(task)
+           # print(accs)
            mean_acc = np.mean(accs)  # * 100.0
            results[task.benchmark] = round(mean_acc, 2)

@@ -108,8 +108,8 @@ class EvalResult:
        try:
            with open(request_file, "r") as f:
                request = json.load(f)
-           print(f"Read Request from {request_file}")
-           print(request)
+           # print(f"Read Request from {request_file}")
+           # print(request)
            # self.model_type = ModelType.from_str("open" if "/" in self.full_model and "openai" not in self.full_model else "closed")
            # self.model_type = ModelType.from_str("open" if self.still_on_hub else "closed")
            self.model_type = ModelType.from_str("open" if "/" in self.full_model and "openai" not in self.full_model else "closed")
@@ -119,7 +119,7 @@ class EvalResult:
            self.num_params = request.get("params", None)
            self.date = request.get("submitted_time", "")
        except Exception as e:
-           print(e)
+           # print(e)
            self.model_type = ModelType.from_str("open" if "/" in self.full_model and "openai" not in self.full_model else "closed")
            print(f"Could not find request file ({requests_path}) for {self.org}/{self.model}")

@@ -158,9 +158,9 @@ def get_request_file_for_model(requests_path, model_name, revision=""):
        requests_path,
        f"**/request_{model_name}*_eval_request*.json"
    )
-   print(f"Looking up request file(s) with pattern {request_files}")
+   # print(f"Looking up request file(s) with pattern {request_files}")
    request_files = glob.glob(request_files, recursive=True)
-   print(f"Found request file(s) {request_files}")
+   # print(f"Found request file(s) {request_files}")

    # Select correct request file (precision)
    request_file = ""
@@ -174,7 +174,7 @@ def get_request_file_for_model(requests_path, model_name, revision=""):
            # and req_content["precision"] == precision.split(".")[-1]
        ):
            request_file = tmp_request_file
-           print(f"Selected {request_file} for model metadata")
+           # print(f"Selected {request_file} for model metadata")
    return request_file


@@ -200,10 +200,10 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
    for model_result_filepath in model_result_filepaths:
        # Creation of result
        eval_result = EvalResult.init_from_json_file(model_result_filepath)
-       print()
-       print('eval result')
-       print(eval_result)
-       print()
+       # print()
+       # print('eval result')
+       # print(eval_result)
+       # print()
        eval_result.update_with_request_file(requests_path)

        # Store results of same eval together
@@ -217,9 +217,9 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:

    for v in eval_results.values():
        try:
-           print()
-           print(v)
-           print()
+           # print()
+           # print(v)
+           # print()
            v.to_dict()  # we test if the dict version is complete
            results.append(v)
        except KeyError:  # not all eval values present
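The reworked report_hyperlink now degrades gracefully: with a URL it returns a dotted-underline anchor for the markdown Report column, and without one it returns the plain string N/A. Both branches in a quick illustration (output abbreviated):

from src.leaderboard.read_evals import report_hyperlink

# A report URL, as stored under config.model_report in the result files:
print(report_hyperlink("https://compl-ai.org/evaluations/Mixtral-8x7B-Instruct-v0.1"))
# -> '<a target="_blank" href="https://compl-ai.org/...">🔗 Report</a>'

# Missing link: the cell falls back to plain text.
print(report_hyperlink(None))
# -> N/A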
src/populate.py
CHANGED
@@ -11,7 +11,7 @@ from src.leaderboard.read_evals import get_raw_eval_results
def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
    raw_data = get_raw_eval_results(results_path, requests_path)
    all_data_json = [v.to_dict() for v in raw_data]
-   print(all_data_json)
+   # print(all_data_json)
    df = pd.DataFrame.from_records(all_data_json)
    # df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
    df = df[cols].round(decimals=2)
@@ -40,7 +40,7 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
            sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
            for sub_entry in sub_entries:
                file_path = os.path.join(save_path, entry, sub_entry)
-               print(file_path)
+               # print(file_path)
                with open(file_path) as fp:
                    data = json.load(fp)
