Clémentine committed on
Commit df66f6e
1 Parent(s): bb17be3

Refactor style + rate limit

app.py CHANGED
@@ -6,18 +6,6 @@ import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import snapshot_download
 
-from src.display.utils import (
-    COLS,
-    TYPES,
-    BENCHMARK_COLS,
-    EVAL_COLS,
-    EVAL_TYPES,
-    AutoEvalColumn,
-    ModelType,
-    NUMERIC_INTERVALS,
-    fields,
-)
-from src.display.css_html_js import custom_css, get_window_url_params
 from src.display.about import (
     CITATION_BUTTON_LABEL,
     CITATION_BUTTON_TEXT,
@@ -26,17 +14,29 @@ from src.display.about import (
     LLM_BENCHMARKS_TEXT,
     TITLE,
 )
+from src.display.css_html_js import custom_css, get_window_url_params
+from src.display.utils import (
+    BENCHMARK_COLS,
+    COLS,
+    EVAL_COLS,
+    EVAL_TYPES,
+    NUMERIC_INTERVALS,
+    TYPES,
+    AutoEvalColumn,
+    ModelType,
+    fields,
+)
+from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, H4_TOKEN, IS_PUBLIC, QUEUE_REPO, REPO_ID, RESULTS_REPO
+from src.populate import get_evaluation_queue_df, get_leaderboard_df
+from src.submission.submit import add_new_eval
+from src.tools.collections import update_collections
 from src.tools.plots import (
+    HUMAN_BASELINES,
     create_metric_plot_obj,
-    create_scores_df,
     create_plot_df,
+    create_scores_df,
     join_model_info_with_results,
-    HUMAN_BASELINES,
 )
-from src.tools.collections import update_collections
-from src.populate import get_evaluation_queue_df, get_leaderboard_df
-from src.envs import H4_TOKEN, QUEUE_REPO, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, RESULTS_REPO, API, REPO_ID, IS_PUBLIC
-from src.submission.submit import add_new_eval
 
 
 def restart_space():
@@ -61,9 +61,9 @@ original_df = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)
 update_collections(original_df.copy())
 leaderboard_df = original_df.copy()
 
-#models = original_df["model_name_for_query"].tolist() # needed for model backlinks in their to the leaderboard
+# models = original_df["model_name_for_query"].tolist() # needed for model backlinks in their to the leaderboard
 # plot_df = create_plot_df(create_scores_df(join_model_info_with_results(original_df)))
-#to_be_dumped = f"models = {repr(models)}\n"
+# to_be_dumped = f"models = {repr(models)}\n"
 
 (
     finished_eval_queue_df,
@@ -173,8 +173,16 @@ with demo:
         )
         with gr.Row():
             shown_columns = gr.CheckboxGroup(
-                choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and not c.dummy],
-                value=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden and not c.never_hidden],
+                choices=[
+                    c.name
+                    for c in fields(AutoEvalColumn)
+                    if not c.hidden and not c.never_hidden and not c.dummy
+                ],
+                value=[
+                    c.name
+                    for c in fields(AutoEvalColumn)
+                    if c.displayed_by_default and not c.hidden and not c.never_hidden
+                ],
                 label="Select columns to show",
                 elem_id="column-select",
                 interactive=True,
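
For readers following the reflowed CheckboxGroup block above: both lists are driven by metadata attached to AutoEvalColumn's fields. Below is a minimal, self-contained sketch of that pattern; ColumnInfo, AutoEvalColumnSketch, and this fields() helper are illustrative stand-ins, not the repo's real definitions.

from dataclasses import dataclass

@dataclass
class ColumnInfo:
    # Flags mirroring the ones tested in the comprehensions above.
    name: str
    displayed_by_default: bool = True
    hidden: bool = False
    never_hidden: bool = False
    dummy: bool = False

class AutoEvalColumnSketch:
    model = ColumnInfo("Model", never_hidden=True)  # always shown, so not user-selectable
    average = ColumnInfo("Average ⬆️")
    model_sha = ColumnInfo("Model sha", displayed_by_default=False)

def fields(cls):
    # Stand-in for the project's fields() helper: collect every ColumnInfo on the class.
    return [v for v in vars(cls).values() if isinstance(v, ColumnInfo)]

choices = [c.name for c in fields(AutoEvalColumnSketch) if not c.hidden and not c.never_hidden and not c.dummy]
value = [c.name for c in fields(AutoEvalColumnSketch) if c.displayed_by_default and not c.hidden and not c.never_hidden]
print(choices)  # ['Average ⬆️', 'Model sha']
print(value)    # ['Average ⬆️']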
scripts/create_request_file.py CHANGED
@@ -1,11 +1,12 @@
-from datetime import datetime, timezone
 import json
 import os
+import pprint
 import re
+from datetime import datetime, timezone
+
 import click
-from huggingface_hub import HfApi, snapshot_download
 from colorama import Fore
-import pprint
+from huggingface_hub import HfApi, snapshot_download
 
 EVAL_REQUESTS_PATH = "eval-queue"
 QUEUE_REPO = "open-llm-leaderboard/requests"
src/display/formatting.py CHANGED
@@ -1,4 +1,5 @@
 import os
+
 from huggingface_hub import HfApi
 
 API = HfApi()
src/display/utils.py CHANGED
@@ -1,7 +1,8 @@
 from dataclasses import dataclass
-import pandas as pd
 from enum import Enum
 
+import pandas as pd
+
 
 # These classes are for user facing column names,
 # to avoid having to change them all around the code
src/envs.py CHANGED
@@ -1,4 +1,5 @@
 import os
+
 from huggingface_hub import HfApi
 
 # clone / pull the lmeh eval data
@@ -24,5 +25,6 @@ PATH_TO_COLLECTION = "open-llm-leaderboard/llm-leaderboard-best-models-652d6c796
 # Rate limit variables
 RATE_LIMIT_PERIOD = 7
 RATE_LIMIT_QUOTA = 5
+HAS_HIGHER_RATE_LIMIT = ["TheBloke"]
 
 API = HfApi(token=H4_TOKEN)
src/leaderboard/read_evals.py CHANGED
@@ -1,15 +1,15 @@
 
1
  import json
2
- import os
3
  import math
4
- import glob
5
  from dataclasses import dataclass
6
  from typing import Dict, List, Tuple
7
 
8
  import dateutil
9
  import numpy as np
10
 
11
- from src.display.utils import AutoEvalColumn, ModelType, Tasks
12
  from src.display.formatting import make_clickable_model
 
13
  from src.submission.check_validity import is_model_on_hub
14
 
15
 
@@ -56,7 +56,9 @@ class EvalResult:
56
  model = org_and_model[1]
57
  result_key = f"{org}_{model}_{precision}"
58
 
59
- still_on_hub = is_model_on_hub("/".join(org_and_model), config.get("model_sha", "main"), trust_remote_code=True)[0]
 
 
60
 
61
  # Extract results available in this file (some results are split in several files)
62
  results = {}
@@ -73,8 +75,8 @@ class EvalResult:
73
  continue
74
 
75
  # Some truthfulQA values are NaNs
76
- if task.benchmark == "truthfulqa:mc" and 'harness|truthfulqa:mc|0' in data["results"]:
77
- if math.isnan(float(data["results"]['harness|truthfulqa:mc|0'][task.metric])):
78
  results[task.benchmark] = 0.0
79
  continue
80
 
@@ -191,7 +193,7 @@ def get_eval_results(results_path: str) -> List[EvalResult]:
191
  for v in eval_results.values():
192
  try:
193
  results.append(v.to_dict())
194
- except KeyError: # not all eval values present
195
  continue
196
 
197
  return results
 
1
+ import glob
2
  import json
 
3
  import math
4
+ import os
5
  from dataclasses import dataclass
6
  from typing import Dict, List, Tuple
7
 
8
  import dateutil
9
  import numpy as np
10
 
 
11
  from src.display.formatting import make_clickable_model
12
+ from src.display.utils import AutoEvalColumn, ModelType, Tasks
13
  from src.submission.check_validity import is_model_on_hub
14
 
15
 
 
56
  model = org_and_model[1]
57
  result_key = f"{org}_{model}_{precision}"
58
 
59
+ still_on_hub = is_model_on_hub(
60
+ "/".join(org_and_model), config.get("model_sha", "main"), trust_remote_code=True
61
+ )[0]
62
 
63
  # Extract results available in this file (some results are split in several files)
64
  results = {}
 
75
  continue
76
 
77
  # Some truthfulQA values are NaNs
78
+ if task.benchmark == "truthfulqa:mc" and "harness|truthfulqa:mc|0" in data["results"]:
79
+ if math.isnan(float(data["results"]["harness|truthfulqa:mc|0"][task.metric])):
80
  results[task.benchmark] = 0.0
81
  continue
82
 
 
193
  for v in eval_results.values():
194
  try:
195
  results.append(v.to_dict())
196
+ except KeyError: # not all eval values present
197
  continue
198
 
199
  return results
src/populate.py CHANGED
@@ -3,10 +3,10 @@ import os
 
 import pandas as pd
 
+from src.display.formatting import has_no_nan_values, make_clickable_model
+from src.display.utils import AutoEvalColumn, EvalQueueColumn, baseline_row
 from src.leaderboard.filter_models import filter_models
 from src.leaderboard.read_evals import get_eval_results
-from src.display.formatting import make_clickable_model, has_no_nan_values
-from src.display.utils import AutoEvalColumn, EvalQueueColumn, baseline_row
 
 
 def get_leaderboard_df(results_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
src/submission/check_validity.py CHANGED
@@ -1,13 +1,15 @@
-import huggingface_hub
-import os
 import json
+import os
 import re
 from collections import defaultdict
-from huggingface_hub.hf_api import ModelInfo
+from datetime import datetime, timedelta, timezone
+
+import huggingface_hub
 from huggingface_hub import ModelCard
+from huggingface_hub.hf_api import ModelInfo
 from transformers import AutoConfig
 
-from datetime import datetime, timedelta, timezone
+from src.envs import HAS_HIGHER_RATE_LIMIT
 
 
 # ht to @Wauplin, thank you for the snippet!
@@ -76,6 +78,9 @@ def user_submission_permission(submission_name, users_to_submission_dates, rate_
     submissions_after_timelimit = [d for d in submission_dates if d > time_limit]
 
     num_models_submitted_in_period = len(submissions_after_timelimit)
+    if org_or_user in HAS_HIGHER_RATE_LIMIT:
+        rate_limit_quota = 2 * rate_limit_quota
+
     if num_models_submitted_in_period > rate_limit_quota:
         error_msg = f"Organisation or user `{org_or_user}`"
         error_msg += f"already has {num_models_submitted_in_period} model requests submitted to the leaderboard "
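
Taken together with the HAS_HIGHER_RATE_LIMIT list added in src/envs.py, the new branch above gives an allow-listed org twice the normal quota inside the same submission window. A small standalone sketch of that behaviour; allowed_to_submit and the sample dates are illustrative, not code from this repo.

from datetime import datetime, timedelta, timezone

RATE_LIMIT_PERIOD = 7   # length of the sliding window, in days
RATE_LIMIT_QUOTA = 5    # max submissions per window for a normal user
HAS_HIGHER_RATE_LIMIT = ["TheBloke"]

def allowed_to_submit(org_or_user, submission_dates):
    # Count only the submissions that fall inside the window, as user_submission_permission does.
    time_limit = datetime.now(timezone.utc) - timedelta(days=RATE_LIMIT_PERIOD)
    num_recent = len([d for d in submission_dates if d > time_limit])

    quota = RATE_LIMIT_QUOTA
    if org_or_user in HAS_HIGHER_RATE_LIMIT:
        quota = 2 * quota  # allow-listed orgs get a doubled quota

    return num_recent <= quota

now = datetime.now(timezone.utc)
recent = [now - timedelta(days=i) for i in range(6)]  # six submissions over the past week
print(allowed_to_submit("some-user", recent))  # False: 6 > 5
print(allowed_to_submit("TheBloke", recent))   # True: 6 <= 10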
src/submission/submit.py CHANGED
@@ -1,17 +1,17 @@
-import os, json
-
+import json
+import os
 from datetime import datetime, timezone
 
-from src.display.formatting import styled_error, styled_warning, styled_message
+from src.display.formatting import styled_error, styled_message, styled_warning
+from src.envs import API, EVAL_REQUESTS_PATH, H4_TOKEN, QUEUE_REPO, RATE_LIMIT_PERIOD, RATE_LIMIT_QUOTA
 from src.leaderboard.filter_models import DO_NOT_SUBMIT_MODELS
 from src.submission.check_validity import (
-    user_submission_permission,
-    is_model_on_hub,
-    get_model_size,
-    check_model_card,
     already_submitted_models,
+    check_model_card,
+    get_model_size,
+    is_model_on_hub,
+    user_submission_permission,
 )
-from src.envs import RATE_LIMIT_QUOTA, RATE_LIMIT_PERIOD, H4_TOKEN, EVAL_REQUESTS_PATH, API, QUEUE_REPO
 
 requested_models, users_to_submission_dates = already_submitted_models(EVAL_REQUESTS_PATH)
 
src/tools/collections.py CHANGED
@@ -1,11 +1,11 @@
 import os
+
 import pandas as pd
-from pandas import DataFrame
-from huggingface_hub import get_collection, add_collection_item, update_collection_item, delete_collection_item
+from huggingface_hub import add_collection_item, delete_collection_item, get_collection, update_collection_item
 from huggingface_hub.utils._errors import HfHubHTTPError
+from pandas import DataFrame
 
 from src.display.utils import AutoEvalColumn, ModelType
-
 from src.envs import H4_TOKEN, PATH_TO_COLLECTION
 
 # Specific intervals for the collections
src/tools/plots.py CHANGED
@@ -1,9 +1,11 @@
+import pickle
+from datetime import datetime, timezone
+from typing import Any, Dict, List, Tuple
+
 import pandas as pd
 import plotly.express as px
 from plotly.graph_objs import Figure
-import pickle
-from datetime import datetime, timezone
-from typing import List, Dict, Tuple, Any
+
 from src.leaderboard.filter_models import FLAGGED_MODELS
 
 # Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
  # Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)