eduagarcia committed on
Commit
3449f84
1 Parent(s): 36e3010

New task definitions

Browse files
Files changed (1) hide show
  1. src/display/utils.py +60 -7
src/display/utils.py CHANGED
@@ -1,6 +1,6 @@
1
  from dataclasses import dataclass, make_dataclass
2
  from enum import Enum
3
-
4
  import pandas as pd
5
 
6
  def fields(raw_class):
@@ -14,14 +14,67 @@ class Task:
14
  col_name: str
15
  baseline: float = 0.0
16
  human_baseline: float = 0.0
 
 
 
 
 
17
 
18
  class Tasks(Enum):
19
- arc = Task("arc:challenge", "acc_norm", "ARC", 25.0, 80.0)
20
- hellaswag = Task("hellaswag", "acc_norm", "HellaSwag", 25.0, 95.0)
21
- mmlu = Task("hendrycksTest", "acc", "MMLU", 25.0, 89.8)
22
- truthfulqa = Task("truthfulqa:mc", "mc2", "TruthfulQA", 25.0, 94.0)
23
- winogrande = Task("winogrande", "acc", "Winogrande", 50.0, 94.0)
24
- gsm8k = Task("gsm8k", "acc", "GSM8K", 0.21, 100)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
  # These classes are for user facing column names,
27
  # to avoid having to change them all around the code
 
1
  from dataclasses import dataclass, make_dataclass
2
  from enum import Enum
3
+ from typing import List
4
  import pandas as pd
5
 
6
  def fields(raw_class):
 
14
  col_name: str
15
  baseline: float = 0.0
16
  human_baseline: float = 0.0
17
+ few_shot: int = None
18
+ limit: int = None
19
+ task_list: List[str] = None
20
+ link: str = None
21
+ description: str = None
22
 
23
  class Tasks(Enum):
24
+ oab_exams = Task(
25
+ benchmark="oab_exams",
26
+ metric="exact_match",
27
+ col_name="OAB Exams",
28
+ baseline=25.0,
29
+ human_baseline=50.0,
30
+ few_shot=5,
31
+ limit=None,
32
+ task_list=["oab_exams_generate"],
33
+ link="https://huggingface.co/datasets/eduagarcia/oab_exams",
34
+ description="OAB Exams is a dataset of 1,000 questions from the Brazilian Bar Association's exams."
35
+ )
36
+ brazilian_court_decisions_judgment = Task(
37
+ benchmark="brazilian_court_decisions_judgment",
38
+ metric="f1_macro",
39
+ col_name="BR Court Decisions",
40
+ baseline=33.33,
41
+ human_baseline=100.0,
42
+ few_shot=5,
43
+ limit=None,
44
+ task_list=["brazilian_court_decisions_judgment_generate"],
45
+ link="https://huggingface.co/datasets/joelniklaus/brazilian_court_decisions",
46
+ description="A classification dataset of court decisions from the Tribunal de Justiça de Alagoas (TJAL, the State Supreme Court of Alagoas (Brazil)."
47
+ )
48
+ datalawyer_frases = Task(
49
+ benchmark="datalawyer_frases",
50
+ metric="f1_macro",
51
+ col_name="DL Frases",
52
+ baseline=10.0,
53
+ human_baseline=100.0,
54
+ few_shot=15,
55
+ limit=2000,
56
+ task_list=["datalawyer_frases_generate"],
57
+ link="https://huggingface.co/datasets/eduagarcia/portuguese_benchmark",
58
+ description="A classification dataset"
59
+ )
60
+ rrip = Task(
61
+ benchmark="rrip",
62
+ metric="f1_macro",
63
+ col_name="RRIP",
64
+ baseline=12.5,
65
+ human_baseline=100.0,
66
+ few_shot=15,
67
+ limit=None,
68
+ task_list=["rrip_generate"],
69
+ link="https://huggingface.co/datasets/eduagarcia/portuguese_benchmark",
70
+ description="A classification dataset"
71
+ )
72
+ #arc = Task("arc:challenge", "acc_norm", "ARC", 25.0, 80.0)
73
+ #hellaswag = Task("hellaswag", "acc_norm", "HellaSwag", 25.0, 95.0)
74
+ #mmlu = Task("hendrycksTest", "acc", "MMLU", 25.0, 89.8)
75
+ #truthfulqa = Task("truthfulqa:mc", "mc2", "TruthfulQA", 25.0, 94.0)
76
+ #winogrande = Task("winogrande", "acc", "Winogrande", 50.0, 94.0)
77
+ #gsm8k = Task("gsm8k", "acc", "GSM8K", 0.21, 100)
78
 
79
  # These classes are for user facing column names,
80
  # to avoid having to change them all around the code