Clémentine committed
Commit 7689092
1 Parent(s): 816b1dc
custom_tasks.py CHANGED
@@ -1,6 +1,6 @@
 # ruff: noqa: F405, F403, F401
 """
-Custom evaluation tasks for lighteval. Copy this file and complete it with the info for your task.
+Custom evaluation tasks for lighteval. Complete this task with your own configuration if you want to use a custom lighteval task.
 
 This file generally create just a TASKS_TABLE and TASKS_GROUPS which are then imported by LightEval.
 

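For orientation, the custom task file that this docstring describes only needs to expose a TASKS_TABLE (and optionally TASKS_GROUPS) at module level. A minimal sketch, assuming nothing beyond what the docstring states; the exact task-construction API depends on the lighteval version you install, and everything below is a placeholder:

# custom_tasks.py -- hypothetical skeleton only; fill in real lighteval task
# configurations for the evaluation suite you chose.
TASKS_TABLE = []   # list of custom task configurations imported by LightEval
TASKS_GROUPS = {}  # optional: map a group name to the tasks it bundles

if __name__ == "__main__":
    # quick sanity check that the table is populated before wiring it to the backend
    print(len(TASKS_TABLE), "custom task(s) defined")
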
src/backend/manage_requests.py CHANGED
@@ -11,27 +11,32 @@ logger = setup_logger(__name__)
 
 @dataclass
 class EvalRequest:
+    """This class represents one evaluation request file.
+    """
     model: str
-    private: bool
     status: str
     json_filepath: str
     weight_type: str = "Original"
     model_type: str = "" # pretrained, finetuned, with RL
     precision: str = "" # float16, bfloat16
-    base_model: Optional[str] = None # for adapter models
-    revision: str = "main" # commit
+    revision: str = "main" # commit hash
     submitted_time: Optional[str] = "2022-05-18T11:40:22.519222" # random date just so that we can still order requests by date
-    model_type: Optional[str] = None
+    model_type: Optional[str] = None # pretrained, fine-tuned, etc - define your own categories in
     likes: Optional[int] = 0
     params: Optional[int] = None
     license: Optional[str] = ""
 
     def get_model_args(self):
+        """Edit this function if you want to manage more complex quantization issues. You'll need to map it to
+        the evaluation suite you chose.
+        """
         model_args = f"pretrained={self.model},revision={self.revision}"
 
         if self.precision in ["float16", "bfloat16", "float32"]:
             model_args += f",dtype={self.precision}"
+
         # Quantized models need some added config, the install of bits and bytes, etc
+
         #elif self.precision == "8bit":
         #    model_args += ",load_in_8bit=True"
         #elif self.precision == "4bit":
@@ -39,7 +44,6 @@ class EvalRequest:
         #elif self.precision == "GPTQ":
         # A GPTQ model does not need dtype to be specified,
         # it will be inferred from the config
-            pass
         else:
             raise Exception(f"Unknown precision {self.precision}.")
 
@@ -67,7 +71,7 @@ def set_eval_request(api: HfApi, eval_request: EvalRequest, set_to_status: str,
 
 
 def get_eval_requests(job_status: list, local_dir: str, hf_repo: str) -> list[EvalRequest]:
-    """Get all pending evaluation requests and return a list in which private
+    """Gets all pending evaluation requests and return a list in which private
     models appearing first, followed by public models sorted by the number of
     likes.
 

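To make the dataclass above concrete, here is a hedged usage sketch of how an evaluation request maps to the model-argument string; all values are invented, only the fields and the f-string come from the diff above:

from src.backend.manage_requests import EvalRequest

# Hypothetical request; the model id, status and file path are placeholders
request = EvalRequest(
    model="my-org/my-model",
    status="PENDING",
    json_filepath="eval-queue/my-org/my-model_eval_request.json",
    precision="float16",
    revision="main",
)

# get_model_args() concatenates model, revision and (for float16/bfloat16/float32) the dtype
print(request.get_model_args())
# -> pretrained=my-org/my-model,revision=main,dtype=float16
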
src/backend/run_eval_suite_harness.py CHANGED
@@ -12,7 +12,23 @@ from src.logging import setup_logger
 logging.getLogger("openai").setLevel(logging.WARNING)
 logger = setup_logger(__name__)
 
-def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_size, device, local_dir: str, results_repo: str, no_cache=True, limit=None):
+def run_evaluation(eval_request: EvalRequest, task_names: list, num_fewshot: int, batch_size: int, device: str, local_dir: str, results_repo: str, no_cache: bool =True, limit: int =None):
+    """Runs one evaluation for the current evaluation request file, then pushes the results to the hub.
+
+    Args:
+        eval_request (EvalRequest): Input evaluation request file representation
+        task_names (list): Tasks to launch
+        num_fewshot (int): Number of few shots to use
+        batch_size (int): Selected batch size
+        device (str): "cpu" or "gpu:0", depending on what you assigned to the space
+        local_dir (str): Where to save the results locally
+        results_repo (str): To which repository to upload the results
+        no_cache (bool, optional): Whether to use a cache or not.
+        limit (int, optional): Whether to use a number of samples only for the evaluation - only for debugging
+
+    Returns:
+        _type_: _description_
+    """
     if limit:
         logger.info(
             "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."

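A hedged call sketch for this harness entry point; the signature comes from the diff above, while every concrete value (tasks, paths, repositories) is a placeholder:

from src.backend.manage_requests import EvalRequest
from src.backend.run_eval_suite_harness import run_evaluation

# Hypothetical invocation for local testing; repos and task names are assumptions
results = run_evaluation(
    eval_request=EvalRequest(
        model="my-org/my-model",
        status="RUNNING",
        json_filepath="eval-queue/my-org/my-model_eval_request.json",
        precision="float16",
    ),
    task_names=["hellaswag"],      # tasks known to the harness you installed
    num_fewshot=0,
    batch_size=1,
    device="cpu",                  # the docstring also mentions "gpu:0" for GPU Spaces
    local_dir="./eval-results",
    results_repo="my-org/results", # made-up results repository
    no_cache=True,
    limit=10,                      # debugging only, as the WARNING above insists
)
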
src/backend/run_eval_suite_lighteval.py CHANGED
@@ -13,6 +13,22 @@ logging.getLogger("openai").setLevel(logging.WARNING)
 logger = setup_logger(__name__)
 
 def run_evaluation(eval_request: EvalRequest, task_names: str, batch_size: int, local_dir: str, accelerator: str, region: str, vendor: str, instance_size: str, instance_type: str, limit=None):
+    """Runs one evaluation for the current evaluation request file using lighteval, then pushes the results to the hub.
+
+    Args:
+        eval_request (EvalRequest): Input evaluation request file representation
+        task_names (list): Tasks to launch
+        batch_size (int): Selected batch size
+        accelerator (str): Inference endpoint parameter for running the evaluation
+        region (str): Inference endpoint parameter for running the evaluation
+        vendor (str): Inference endpoint parameter for running the evaluation
+        instance_size (str): Inference endpoint parameter for running the evaluation
+        instance_type (str): Inference endpoint parameter for running the evaluation
+        local_dir (str): Where to save the results locally
+        no_cache (bool, optional): Whether to use a cache or not.
+        limit (int, optional): Whether to use a number of samples only for the evaluation - only for debugging
+    """
+
     if limit:
         logger.info("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")
 

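And a hedged call sketch for the lighteval entry point; the inference-endpoint arguments mirror the docstring added above, and the concrete values (task string, region, vendor, instance size/type) are illustrative assumptions, not values from this commit:

from src.backend.manage_requests import EvalRequest
from src.backend.run_eval_suite_lighteval import run_evaluation

# Hypothetical invocation; adjust the endpoint parameters to what your account supports
run_evaluation(
    eval_request=EvalRequest(
        model="my-org/my-model",
        status="RUNNING",
        json_filepath="eval-queue/my-org/my-model_eval_request.json",
        precision="float16",
    ),
    task_names="custom|mytask|0|0",  # format depends on your lighteval version and custom_tasks.py
    batch_size=1,
    local_dir="./eval-results",
    accelerator="cpu",
    region="us-east-1",
    vendor="aws",
    instance_size="medium",
    instance_type="c6i",
    limit=10,                        # debugging only
)
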
src/backend/sort_queue.py CHANGED
@@ -11,7 +11,7 @@ class ModelMetadata:
     likes: int = 0
     size: int = 15
 
-
+# All the functions below sort the models in the queue based on different parameters
 def sort_models_by_priority(api: HfApi, models: list[EvalRequest]) -> list[EvalRequest]:
     private_models = [model for model in models if model.private]
     public_models = [model for model in models if not model.private]
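
As a worked illustration of the comment added in this hunk, a likes-based sorter could look like the sketch below; this is an assumption about shape only, not the implementation actually shipped in sort_queue.py:

from src.backend.manage_requests import EvalRequest

# Hypothetical helper in the spirit of "sort the models in the queue based on different parameters"
def sort_models_by_likes(models: list[EvalRequest]) -> list[EvalRequest]:
    # most-liked models first; missing like counts are treated as zero
    return sorted(models, key=lambda m: m.likes or 0, reverse=True)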