eduagarcia committed
Commit c0fa950 • 1 Parent(s): 0266144
Portuguese Tasks configs and fix bugs

Files changed:
- .gitignore +4 -1
- src/display/utils.py +14 -12
- src/envs.py +34 -15
- src/tools/plots.py +1 -1
- tasks_config.yaml → tasks_config/legal_config.yaml +0 -0
- tasks_config/pt_config.yaml +153 -0
.gitignore
CHANGED

@@ -1,6 +1,6 @@
 venv/
 __pycache__/
-.env
+.env*
 .ipynb_checkpoints
 *ipynb
 .vscode/
@@ -12,5 +12,8 @@ original_results/
 eval-queue/
 eval-results/
 dynamic-info/
+downloads/
+
+tasks_config/legal_config.yaml
 
 src/assets/model_counts.html
src/display/utils.py
CHANGED

@@ -3,7 +3,7 @@ from enum import Enum
 from typing import List
 import pandas as pd
 from yaml import safe_load
-from src.envs import GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS
+from src.envs import GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS, TASK_CONFIG
 
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
@@ -15,16 +15,17 @@ class Task:
     metric: str
     col_name: str
     baseline: float = 0.0
-    human_baseline: float =
+    human_baseline: float = None
+    expert_human_baseline: float = None
     few_shot: int = None
     limit: int = None
     task_list: List[str] = None
     link: str = None
     description: str = None
+    sources: List[str] = None
+    baseline_sources: List[str] = None
 
-with open('tasks_config.yaml', 'r', encoding='utf-8') as f:
-    tasks_config = safe_load(f)
-Tasks = Enum('Tasks', {k: Task(**v) for k, v in tasks_config['tasks'].items()})
+Tasks = Enum('Tasks', {k: Task(**v) for k, v in TASK_CONFIG['tasks'].items()})
 
 # These classes are for user facing column names,
 # to avoid having to change them all around the code
@@ -108,11 +109,12 @@ baseline_row = {
 baseline_list = []
 for task in Tasks:
     baseline_row[task.value.col_name] = task.value.baseline
-    if task.value.baseline is not None:
+    if task.value.baseline is not None and (isinstance(task.value.baseline, float) or isinstance(task.value.baseline, int)):
         baseline_list.append(task.value.baseline)
 baseline_row[AutoEvalColumn.average.name] = round(sum(baseline_list) / len(baseline_list), 2)
-if GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS:
-    baseline_row["🤗 Leaderboard Average"] = None
+
+#if GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS:
+baseline_row["🤗 Leaderboard Average"] = None
 
 # Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
 # ARC human baseline is 0.80 (source: https://lab42.global/arc/)
@@ -145,17 +147,17 @@ human_baseline_row = {
     AutoEvalColumn.license.name: "",
     AutoEvalColumn.still_on_hub.name: False,
     AutoEvalColumn.moe.name: False,
-    AutoEvalColumn.eval_time.name: 0.0
+    AutoEvalColumn.eval_time.name: 0.0,
 }
 
 baseline_list = []
 for task in Tasks:
     human_baseline_row[task.value.col_name] = task.value.human_baseline
-    if task.value.human_baseline is not None:
+    if task.value.human_baseline is not None and (isinstance(task.value.baseline, float) or isinstance(task.value.baseline, int)):
         baseline_list.append(task.value.human_baseline)
 human_baseline_row[AutoEvalColumn.average.name] = round(sum(baseline_list) / len(baseline_list), 2)
-if GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS:
-    human_baseline_row["🤗 Leaderboard Average"] = None
+#if GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS:
+human_baseline_row["🤗 Leaderboard Average"] = None
 
 @dataclass
 class ModelDetails:
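For illustration: the pattern above builds the `Tasks` enum dynamically from the YAML-driven `TASK_CONFIG`, turning every entry under `tasks:` into a `Task` dataclass instance. A minimal, self-contained sketch of that construction (the `Task` field set is trimmed and the two sample entries are abridged from tasks_config/pt_config.yaml below):

```python
from dataclasses import dataclass
from enum import Enum
from typing import List

@dataclass
class Task:
    # Reduced field set for the sketch; the real Task has more optional fields.
    benchmark: str
    metric: str
    col_name: str
    baseline: float = 0.0
    human_baseline: float = None
    few_shot: int = None
    task_list: List[str] = None

# Stand-in for TASK_CONFIG["tasks"], abridged from tasks_config/pt_config.yaml.
tasks_cfg = {
    "enem_challenge": {"benchmark": "enem_challenge", "metric": "acc",
                       "col_name": "ENEM", "baseline": 20.0, "few_shot": 3},
    "assin2_rte": {"benchmark": "assin2_rte", "metric": "f1_macro",
                   "col_name": "ASSIN2 RTE", "baseline": 50.0, "few_shot": 15},
}

# Same construction as in src/display/utils.py: one enum member per YAML task.
Tasks = Enum("Tasks", {k: Task(**v) for k, v in tasks_cfg.items()})

for t in Tasks:
    print(t.name, t.value.col_name, t.value.baseline)
```

Because the enum values are dataclass instances, the rest of the module can simply iterate `for task in Tasks:` and read `task.value.col_name`, `task.value.baseline`, and so on, exactly as the baseline loops above do.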
src/envs.py
CHANGED

@@ -1,25 +1,44 @@
 import os
+from yaml import safe_load
 
 from huggingface_hub import HfApi
 
+TASK_CONFIG_NAME = os.getenv("TASK_CONFIG", "pt_config")
+TASK_CONFIG_PATH = os.path.join('tasks_config', TASK_CONFIG_NAME + ".yaml")
+with open(TASK_CONFIG_PATH, 'r', encoding='utf-8') as f:
+    TASK_CONFIG = safe_load(f)
+
+def get_config(name, default):
+    res = None
+
+    if name in os.environ:
+        res = os.environ[name]
+    elif 'config' in TASK_CONFIG:
+        res = TASK_CONFIG['config'].get(name, None)
+
+    if res is None:
+        return default
+    return res
+
 # clone / pull the lmeh eval data
-H4_TOKEN =
+H4_TOKEN = get_config("H4_TOKEN", None)
 
-LEADERBOARD_NAME =
+LEADERBOARD_NAME = get_config("LEADERBOARD_NAME", "Open LLM Leaderboard")
 
-REPO_ID =
-QUEUE_REPO =
-DYNAMIC_INFO_REPO =
-RESULTS_REPO =
+REPO_ID = get_config("REPO_ID", "HuggingFaceH4/open_llm_leaderboard")
+QUEUE_REPO = get_config("QUEUE_REPO", "open-llm-leaderboard/requests")
+DYNAMIC_INFO_REPO = get_config("DYNAMIC_INFO_REPO", "open-llm-leaderboard/dynamic_model_information")
+RESULTS_REPO = get_config("RESULTS_REPO", "open-llm-leaderboard/results")
+RAW_RESULTS_REPO = get_config("RAW_RESULTS_REPO", None)
 
 PRIVATE_QUEUE_REPO = QUEUE_REPO
 PRIVATE_RESULTS_REPO = RESULTS_REPO
 #PRIVATE_QUEUE_REPO = "open-llm-leaderboard/private-requests"
 #PRIVATE_RESULTS_REPO = "open-llm-leaderboard/private-results"
 
-IS_PUBLIC = bool(
+IS_PUBLIC = bool(get_config("IS_PUBLIC", True))
 
-CACHE_PATH=
+CACHE_PATH=get_config("HF_HOME", ".")
 
 EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
 EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
@@ -29,18 +48,18 @@ DYNAMIC_INFO_FILE_PATH = os.path.join(DYNAMIC_INFO_PATH, "model_infos.json")
 EVAL_REQUESTS_PATH_PRIVATE = "eval-queue-private"
 EVAL_RESULTS_PATH_PRIVATE = "eval-results-private"
 
-PATH_TO_COLLECTION =
+PATH_TO_COLLECTION = get_config("PATH_TO_COLLECTION", "open-llm-leaderboard/llm-leaderboard-best-models-652d6c7965a4619fb5c27a03")
 
 # Rate limit variables
-RATE_LIMIT_PERIOD = int(
-RATE_LIMIT_QUOTA = int(
-HAS_HIGHER_RATE_LIMIT =
+RATE_LIMIT_PERIOD = int(get_config("RATE_LIMIT_PERIOD", 7))
+RATE_LIMIT_QUOTA = int(get_config("RATE_LIMIT_QUOTA", 5))
+HAS_HIGHER_RATE_LIMIT = get_config("HAS_HIGHER_RATE_LIMIT", "TheBloke").split(',')
 
-TRUST_REMOTE_CODE = bool(
+TRUST_REMOTE_CODE = bool(get_config("TRUST_REMOTE_CODE", False))
 
 #Set if you want to get an extra field with the average eval results from the HF leaderboard
-GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS = bool(
-ORIGINAL_HF_LEADERBOARD_RESULTS_REPO =
+GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS = bool(get_config("GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS", False))
+ORIGINAL_HF_LEADERBOARD_RESULTS_REPO = get_config("ORIGINAL_HF_LEADERBOARD_RESULTS_REPO", "open-llm-leaderboard/results")
 ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, 'original_results')
 
 API = HfApi(token=H4_TOKEN)
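The new `get_config` helper resolves every setting with a three-level precedence: an environment variable wins, otherwise the `config:` block of the selected task YAML, otherwise the hard-coded default. A minimal sketch of that lookup, with a stand-in `cfg` dict in place of the parsed `TASK_CONFIG` (the dict contents here are illustrative, copied from pt_config.yaml):

```python
import os

# Stand-in for the parsed TASK_CONFIG (see tasks_config/pt_config.yaml below).
cfg = {"config": {"LEADERBOARD_NAME": "Open PT-LLM Leaderboard"}}

def get_config(name, default):
    # Precedence: 1) environment variable, 2) YAML `config:` block, 3) default.
    res = None
    if name in os.environ:
        res = os.environ[name]
    elif "config" in cfg:
        res = cfg["config"].get(name, None)
    return default if res is None else res

print(get_config("LEADERBOARD_NAME", "Open LLM Leaderboard"))  # "Open PT-LLM Leaderboard"
print(get_config("RATE_LIMIT_PERIOD", 7))  # 7, falls back to the default
```

Under pt_config.yaml, for example, `LEADERBOARD_NAME` resolves to "Open PT-LLM Leaderboard" unless an environment variable of the same name overrides it.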
src/tools/plots.py
CHANGED

@@ -99,7 +99,7 @@ def create_metric_plot_obj(
     df = df[df["task"].isin(metrics)]
 
     # Filter the human baselines based on the specified metrics
-    filtered_human_baselines = {k: v for k, v in HUMAN_BASELINE.items() if k in metrics}
+    filtered_human_baselines = {k: v for k, v in HUMAN_BASELINE.items() if k in metrics if v is not None}
 
     # Create a line figure using plotly express with specified markers and custom data
     fig = px.line(
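The added `if v is not None` clause keeps tasks without a human baseline (several tasks in the new Portuguese config set it to null) out of the plot. A tiny illustration, with baseline values taken from pt_config.yaml and an illustrative `metrics` list:

```python
# Example values: ENEM and BLUEX human baselines come from pt_config.yaml;
# ASSIN2 STS has no human baseline (null in the config).
HUMAN_BASELINE = {"ENEM": 35.0, "BLUEX": 50.0, "ASSIN2 STS": None}
metrics = ["ENEM", "ASSIN2 STS"]

# Same comprehension as the new line in src/tools/plots.py.
filtered_human_baselines = {k: v for k, v in HUMAN_BASELINE.items() if k in metrics if v is not None}
print(filtered_human_baselines)  # {'ENEM': 35.0} — the None entry is dropped
```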
tasks_config.yaml → tasks_config/legal_config.yaml
RENAMED
File without changes
tasks_config/pt_config.yaml
ADDED

@@ -0,0 +1,153 @@
+version: 1.0.0
+config:
+  REPO_ID: "eduagarcia/open_pt_llm_leaderboard"
+  QUEUE_REPO: eduagarcia-temp/llm_pt_leaderboard_requests
+  RESULTS_REPO: eduagarcia-temp/llm_pt_leaderboard_results
+  RAW_RESULTS_REPO: eduagarcia-temp/llm_pt_leaderboard_raw_results
+  DYNAMIC_INFO_REPO: "eduagarcia-temp/llm_pt_leaderboard_model_info"
+  PATH_TO_COLLECTION: "eduagarcia/portuguese-llm-leaderboard-best-models-65c152c13ab3c67bc4f203a6"
+  IS_PUBLIC: true
+  LEADERBOARD_NAME: "Open PT-LLM Leaderboard"
+  GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS: true
+  TRUST_REMOTE_CODE: true
+tasks:
+  enem_challenge:
+    benchmark: enem_challenge
+    col_name: ENEM
+    task_list:
+      - enem_challenge
+    metric: acc
+    few_shot: 3
+    limit: null
+    baseline: 20.0 #random baseline
+    #https://www.sejalguem.com/enem
+    #https://vestibular.brasilescola.uol.com.br/enem/confira-as-medias-e-notas-maximas-e-minimas-do-enem-2020/349732.html
+    human_baseline: 35.0 # ~60 / 180 correct answers - score ~500
+    expert_human_baseline: 70.0 # ~124 / 180 correct answers - score ~700
+    description: "The Exame Nacional do Ensino Médio (ENEM) is an advanced High-School
+      level exam widely applied every year by the Brazilian government to students that
+      wish to undertake a University degree. This dataset contains 1,430 questions from the
+      2010 to 2018, 2022 and 2023 exams that don't require image understanding."
+    link: https://huggingface.co/datasets/eduagarcia/enem_challenge
+    sources: ["https://www.ime.usp.br/~ddm/project/enem/", "https://github.com/piresramon/gpt-4-enem", "https://huggingface.co/datasets/maritaca-ai/enem"]
+    baseline_sources: ["https://www.sejalguem.com/enem", "https://vestibular.brasilescola.uol.com.br/enem/confira-as-medias-e-notas-maximas-e-minimas-do-enem-2020/349732.html"]
+  bluex:
+    benchmark: bluex
+    col_name: BLUEX
+    task_list:
+      - bluex
+    metric: acc
+    few_shot: 3
+    limit: null
+    baseline: 22.5 #random baseline
+    #https://www.comvest.unicamp.br/wp-content/uploads/2023/08/Relatorio_F1_2023.pdf 56% mean - 88% @ top-.99
+    #https://acervo.fuvest.br/fuvest/2018/FUVEST_2018_indice_discriminacao_1_fase_ins.pdf 43.4% - ~77% @ top-.99
+    human_baseline: 50.0
+    expert_human_baseline: 82.5
+    description: "BLUEX is a multimodal dataset consisting of the two leading
+      university entrance exams conducted in Brazil: Comvest (Unicamp) and Fuvest (USP),
+      spanning from 2018 to 2024. The benchmark comprises 724 questions that do not have accompanying images."
+    link: https://huggingface.co/datasets/eduagarcia-temp/BLUEX_without_images
+    sources: ["https://github.com/portuguese-benchmark-datasets/bluex", "https://huggingface.co/datasets/portuguese-benchmark-datasets/BLUEX"]
+    baseline_sources: ["https://www.comvest.unicamp.br/wp-content/uploads/2023/08/Relatorio_F1_2023.pdf", "https://acervo.fuvest.br/fuvest/2018/FUVEST_2018_indice_discriminacao_1_fase_ins.pdf"]
+  oab_exams:
+    benchmark: oab_exams
+    col_name: OAB Exams
+    task_list:
+      - oab_exams
+    metric: acc
+    few_shot: 3
+    limit: null
+    baseline: 25.0 #random baseline
+    #https://fgvprojetos.fgv.br/publicacao/exame-de-ordem-em-numeros # 46%
+    # http://fgvprojetos.fgv.br/publicacao/exame-de-ordem-em-numeros-vol3
+    # scored 70%+ = 17214 / 638500 = 97.5th percentile
+    # deviation at the 97.5th percentile -> 70.0 - 46 = 24
+    # z-score at 97.5% ~ 1.9675
+    # estimated standard deviation -> 12.2
+    # 99th percentile = 46 + 2.33*12.2 = ~75.0
+    human_baseline: 46.0
+    expert_human_baseline: 75.0
+    description: OAB Exams is a dataset of more than 2,000 questions from the Brazilian Bar
+      Association's exams, from 2010 to 2018.
+    link: https://huggingface.co/datasets/eduagarcia/oab_exams
+    sources: ["https://github.com/legal-nlp/oab-exams"]
+    baseline_sources: ["http://fgvprojetos.fgv.br/publicacao/exame-de-ordem-em-numeros", "http://fgvprojetos.fgv.br/publicacao/exame-de-ordem-em-numeros-vol2", "http://fgvprojetos.fgv.br/publicacao/exame-de-ordem-em-numeros-vol3"]
+  assin2_rte:
+    benchmark: assin2_rte
+    col_name: ASSIN2 RTE
+    task_list:
+      - assin2_rte
+    metric: f1_macro
+    few_shot: 15
+    limit: null
+    baseline: 50.0 #random baseline
+    human_baseline: null
+    expert_human_baseline: null
+    description: "ASSIN 2 (Avaliação de Similaridade Semântica e Inferência Textual -
+      Evaluating Semantic Similarity and Textual Entailment) is the second edition of ASSIN,
+      an evaluation shared task in the scope of the computational processing
+      of Portuguese. Recognising Textual Entailment (RTE), also called Natural Language
+      Inference (NLI), is the task of predicting if a given text (premise) entails
+      another text (hypothesis)."
+    link: https://huggingface.co/datasets/eduagarcia/portuguese_benchmark
+    sources: ["https://sites.google.com/view/assin2/", "https://huggingface.co/datasets/assin2"]
+  assin2_sts:
+    benchmark: assin2_sts
+    col_name: ASSIN2 STS
+    task_list:
+      - assin2_sts
+    metric: pearson
+    few_shot: 15
+    limit: null
+    baseline: 0.0 #random baseline
+    human_baseline: null
+    expert_human_baseline: null
+    description: "Same dataset as above. Semantic Textual Similarity (STS)
+      ‘measures the degree of semantic equivalence between two sentences’."
+    link: https://huggingface.co/datasets/eduagarcia/portuguese_benchmark
+    sources: ["https://sites.google.com/view/assin2/", "https://huggingface.co/datasets/assin2"]
+  faquad_nli:
+    benchmark: faquad_nli
+    col_name: FAQUAD NLI
+    task_list:
+      - faquad_nli
+    metric: f1_macro
+    few_shot: 15
+    limit: null
+    baseline: 45.6 #random baseline
+    human_baseline: null
+    expert_human_baseline: null
+    description: "FaQuAD is a Portuguese reading comprehension dataset that follows the format of the
+      Stanford Question Answering Dataset (SQuAD). The dataset aims to address the problem of
+      abundant questions sent by academics whose answers are found in available institutional
+      documents in the Brazilian higher education system. It consists of 900 questions about
+      249 reading passages taken from 18 official documents of a computer science college
+      from a Brazilian federal university and 21 Wikipedia articles related to the
+      Brazilian higher education system. FaQuAD-NLI is a modified version of the
+      FaQuAD dataset that repurposes the question answering task as a textual
+      entailment task between a question and its possible answers."
+    link: https://huggingface.co/datasets/ruanchaves/faquad-nli
+    sources: ["https://github.com/liafacom/faquad/"]
+  sparrow_pt:
+    benchmark: sparrow_pt
+    col_name: Sparrow POR
+    task_list:
+      - sparrow_emotion-2021-cortiz-por
+      - sparrow_hate-2019-fortuna-por
+      - sparrow_sentiment-2016-mozetic-por
+      - sparrow_sentiment-2018-brum-por
+    metric: f1_macro
+    few_shot: 15
+    limit: 500
+    baseline: 29.5 #random baseline [3.3, 48.8, 33.1, 33.0]
+    human_baseline: null
+    expert_human_baseline: null
+    description: "SPARROW is a multilingual evaluation benchmark for sociopragmatic meaning understanding.
+      SPARROW comprises 169 datasets encompassing 64 different languages;
+      this split evaluates only the validation sets of the 4 datasets available for Portuguese:
+      one on hate speech detection by Fortuna et al. (2019), one on emotion detection by Cortiz et al. (2021),
+      and two on sentiment analysis by Mozetic et al. (2016) and Brum et al. (2018).
+      All were extracted and manually annotated from Twitter/X."
+    link: https://huggingface.co/datasets/UBC-NLP/sparrow
+    sources: ["https://sparrow.dlnlp.ai/", "https://aclanthology.org/W19-3510/", "https://arxiv.org/abs/2108.07638", "https://aclanthology.org/L18-1658/", "https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0155036"]
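For reference, the `baseline` values in this config are what the loop in src/display/utils.py averages into the baseline row. A quick sketch of that arithmetic with the seven baselines copied from the YAML above:

```python
# Baselines from tasks_config/pt_config.yaml (random-guess scores per task).
baselines = {
    "enem_challenge": 20.0,
    "bluex": 22.5,
    "oab_exams": 25.0,
    "assin2_rte": 50.0,
    "assin2_sts": 0.0,
    "faquad_nli": 45.6,
    "sparrow_pt": 29.5,
}

# Mirrors the average computed in src/display/utils.py: only numeric
# baselines are kept and the result is rounded to two decimals.
numeric = [v for v in baselines.values() if isinstance(v, (int, float))]
average = round(sum(numeric) / len(numeric), 2)
print(average)  # 27.51
```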