open_pt_llm_leaderboard

Running on CPU Upgrade

App Files Files Community

eduagarcia committed on Jan 21

Commit

3449f84

•

1 Parent(s): 36e3010

New task definitions

Browse files

Files changed (1) hide show

src/display/utils.py +60 -7

src/display/utils.py CHANGED Viewed

@@ -1,6 +1,6 @@
 from dataclasses import dataclass, make_dataclass
 from enum import Enum
 import pandas as pd
 def fields(raw_class):
@@ -14,14 +14,67 @@ class Task:
     col_name: str
     baseline: float = 0.0
     human_baseline: float = 0.0
 class Tasks(Enum):
-    arc = Task("arc:challenge", "acc_norm", "ARC", 25.0, 80.0)
-    hellaswag = Task("hellaswag", "acc_norm", "HellaSwag", 25.0, 95.0)
-    mmlu = Task("hendrycksTest", "acc", "MMLU", 25.0, 89.8)
-    truthfulqa = Task("truthfulqa:mc", "mc2", "TruthfulQA", 25.0, 94.0)
-    winogrande = Task("winogrande", "acc", "Winogrande", 50.0, 94.0)
-    gsm8k = Task("gsm8k", "acc", "GSM8K", 0.21, 100)
 # These classes are for user facing column names,
 # to avoid having to change them all around the code

 from dataclasses import dataclass, make_dataclass
 from enum import Enum
+from typing import List
 import pandas as pd
 def fields(raw_class):
     col_name: str
     baseline: float = 0.0
     human_baseline: float = 0.0
+    few_shot: int = None
+    limit: int = None
+    task_list: List[str] = None
+    link: str = None
+    description: str = None
 class Tasks(Enum):
+    oab_exams = Task(
+        benchmark="oab_exams",
+        metric="exact_match",
+        col_name="OAB Exams",
+        baseline=25.0,
+        human_baseline=50.0,
+        few_shot=5,
+        limit=None,
+        task_list=["oab_exams_generate"],
+        link="https://huggingface.co/datasets/eduagarcia/oab_exams",
+        description="OAB Exams is a dataset of 1,000 questions from the Brazilian Bar Association's exams."
+    )
+    brazilian_court_decisions_judgment = Task(
+        benchmark="brazilian_court_decisions_judgment",
+        metric="f1_macro",
+        col_name="BR Court Decisions",
+        baseline=33.33,
+        human_baseline=100.0,
+        few_shot=5,
+        limit=None,
+        task_list=["brazilian_court_decisions_judgment_generate"],
+        link="https://huggingface.co/datasets/joelniklaus/brazilian_court_decisions",
+        description="A classification dataset of court decisions from the Tribunal de Justiça de Alagoas (TJAL, the State Supreme Court of Alagoas (Brazil)."
+    )
+    datalawyer_frases = Task(
+        benchmark="datalawyer_frases",
+        metric="f1_macro",
+        col_name="DL Frases",
+        baseline=10.0,
+        human_baseline=100.0,
+        few_shot=15,
+        limit=2000,
+        task_list=["datalawyer_frases_generate"],
+        link="https://huggingface.co/datasets/eduagarcia/portuguese_benchmark",
+        description="A classification dataset"
+    )
+    rrip = Task(
+        benchmark="rrip",
+        metric="f1_macro",
+        col_name="RRIP",
+        baseline=12.5,
+        human_baseline=100.0,
+        few_shot=15,
+        limit=None,
+        task_list=["rrip_generate"],
+        link="https://huggingface.co/datasets/eduagarcia/portuguese_benchmark",
+        description="A classification dataset"
+    )
+    #arc = Task("arc:challenge", "acc_norm", "ARC", 25.0, 80.0)
+    #hellaswag = Task("hellaswag", "acc_norm", "HellaSwag", 25.0, 95.0)
+    #mmlu = Task("hendrycksTest", "acc", "MMLU", 25.0, 89.8)
+    #truthfulqa = Task("truthfulqa:mc", "mc2", "TruthfulQA", 25.0, 94.0)
+    #winogrande = Task("winogrande", "acc", "Winogrande", 50.0, 94.0)
+    #gsm8k = Task("gsm8k", "acc", "GSM8K", 0.21, 100)
 # These classes are for user facing column names,
 # to avoid having to change them all around the code