eduagarcia's picture
Better about's
67cd6fc
raw
history blame
9.75 kB
version: 1.0.0
config:
REPO_ID: "eduagarcia/open_pt_llm_leaderboard"
QUEUE_REPO: eduagarcia-temp/llm_pt_leaderboard_requests
RESULTS_REPO: eduagarcia-temp/llm_pt_leaderboard_results
RAW_RESULTS_REPO: eduagarcia-temp/llm_pt_leaderboard_raw_results
DYNAMIC_INFO_REPO: "eduagarcia-temp/llm_pt_leaderboard_model_info"
PATH_TO_COLLECTION: "eduagarcia/portuguese-llm-leaderboard-best-models-65c152c13ab3c67bc4f203a6"
IS_PUBLIC: true
LEADERBOARD_NAME: "Open PT-LLM Leaderboard"
GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS: true
TRUST_REMOTE_CODE: true
readme:
general_description: |
📐 The 🚀 Open PT-LLM Leaderboard aims to provide a benchmark for the evaluation of
Large Language Models (LLMs) in the Portuguese language across a variety of tasks
and datasets.
The leaderboard is open to submissions of models from the community
and is designed to be a resource for researchers, practitioners, and enthusiasts
interested in the development and evaluation of LLMs for the Portuguese language.
If you have any questions, suggestions, or would like to contribute to the leaderboard,
please feel free to reach out at [@eduagarcia](https://linktr.ee/eduagarcia).
support_description: |
This leaderboard is made possible by the support of the
[Center of Excellence in AI (CEIA)](https://ceia.ufg.br/) at the
[Federal University of Goiás (UFG)](https://international.ufg.br/).
about_description: |
The 🚀 Open PT-LLM Leaderboard is a benchmark for the evaluation of
Large Language Models (LLMs) in the Portuguese language.
The leaderboard is open to submissions of models from the community and
is designed to be a resource for researchers, practitioners, and enthusiasts interested
in the development and evaluation of LLMs for the Portuguese language.
Supported by the [Center of Excellence in AI (CEIA)](https://ceia.ufg.br/) at the
[Federal University of Goiás (UFG)](https://international.ufg.br/), this leaderboard
operates on a backend of Nvidia A100-80G GPUs. Evaluations are subject to
resource availability, which is not exclusive. Therefore, please be patient if
your model is in the queue. If you'd like to support the leaderboard, feel free to
reach out.
This is a fork of the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard" target="_blank">🤗 Open LLM Leaderboard</a> with
Portuguese benchmarks.
tasks:
enem_challenge:
benchmark: enem_challenge
col_name: ENEM
task_list:
- enem_challenge
metric: acc
few_shot: 3
limit: null
baseline: 20.0 #random baseline
#https://www.sejalguem.com/enem
#https://vestibular.brasilescola.uol.com.br/enem/confira-as-medias-e-notas-maximas-e-minimas-do-enem-2020/349732.html
human_baseline: 35.0 # ~60 / 180 correct answers - score ~500
expert_human_baseline: 70.0 # ~124 / 180 correct answers - score ~700
description: "The Exame Nacional do Ensino Médio (ENEM) is an advanced High-School
level exam widely applied every year by the Brazilian government to students that
wish to undertake a University degree. This dataset contains 1,430 questions from the
exams of 2010 to 2018, 2022 and 2023 that don't require image understanding."
link: https://huggingface.co/datasets/eduagarcia/enem_challenge
sources: ["https://www.ime.usp.br/~ddm/project/enem/", "https://github.com/piresramon/gpt-4-enem", "https://huggingface.co/datasets/maritaca-ai/enem"]
baseline_sources: ["https://www.sejalguem.com/enem", "https://vestibular.brasilescola.uol.com.br/enem/confira-as-medias-e-notas-maximas-e-minimas-do-enem-2020/349732.html"]
bluex:
benchmark: bluex
col_name: BLUEX
task_list:
- bluex
metric: acc
few_shot: 3
limit: null
baseline: 22.5 #random baseline
#https://www.comvest.unicamp.br/wp-content/uploads/2023/08/Relatorio_F1_2023.pdf 56% mean - 88% @ top-.99
#https://acervo.fuvest.br/fuvest/2018/FUVEST_2018_indice_discriminacao_1_fase_ins.pdf 43,4% - ~77% @ top-.99
human_baseline: 50.0
expert_human_baseline: 82.5
description: "BLUEX is a multimodal dataset consisting of the two leading
university entrance exams conducted in Brazil: Comvest (Unicamp) and Fuvest (USP),
spanning from 2018 to 2024. The benchmark comprises 724 questions that do not have accompanying images."
link: https://huggingface.co/datasets/eduagarcia-temp/BLUEX_without_images
sources: ["https://github.com/portuguese-benchmark-datasets/bluex", "https://huggingface.co/datasets/portuguese-benchmark-datasets/BLUEX"]
baseline_sources: ["https://www.comvest.unicamp.br/wp-content/uploads/2023/08/Relatorio_F1_2023.pdf", "https://acervo.fuvest.br/fuvest/2018/FUVEST_2018_indice_discriminacao_1_fase_ins.pdf"]
oab_exams:
benchmark: oab_exams
col_name: OAB Exams
task_list:
- oab_exams
metric: acc
few_shot: 3
limit: null
baseline: 25.0 #random baseline
#https://fgvprojetos.fgv.br/publicacao/exame-de-ordem-em-numeros # 46%
# http://fgvprojetos.fgv.br/publicacao/exame-de-ordem-em-numeros-vol3
# Scored 70%+ = 17214 / 638500 = top-97.5%
# deviation at top-97.5% -> 70.0% - 46% = 24
# z score 97.5% ~ 1.9675
# estimated standard deviation -> 12.2
# top 99% = 46 + 2.33*12.2 = ~75.0
human_baseline: 46.0
expert_human_baseline: 75.0
description: OAB Exams is a dataset of more than 2,000 questions from the Brazilian Bar
Association's exams, from 2010 to 2018.
link: https://huggingface.co/datasets/eduagarcia/oab_exams
sources: ["https://github.com/legal-nlp/oab-exams"]
baseline_sources: ["http://fgvprojetos.fgv.br/publicacao/exame-de-ordem-em-numeros", "http://fgvprojetos.fgv.br/publicacao/exame-de-ordem-em-numeros-vol2", "http://fgvprojetos.fgv.br/publicacao/exame-de-ordem-em-numeros-vol3"]
assin2_rte:
benchmark: assin2_rte
col_name: ASSIN2 RTE
task_list:
- assin2_rte
metric: f1_macro
few_shot: 15
limit: null
baseline: 50.0 #random baseline
human_baseline: null
expert_human_baseline: null
description: "ASSIN 2 (Avaliação de Similaridade Semântica e Inferência Textual -
Evaluating Semantic Similarity and Textual Entailment) is the second edition of ASSIN,
an evaluation shared task in the scope of the computational processing
of Portuguese. Recognising Textual Entailment (RTE), also called Natural Language
Inference (NLI), is the task of predicting if a given text (premise) entails (implies)
another text (hypothesis)."
link: https://huggingface.co/datasets/eduagarcia/portuguese_benchmark
sources: ["https://sites.google.com/view/assin2/", "https://huggingface.co/datasets/assin2"]
assin2_sts:
benchmark: assin2_sts
col_name: ASSIN2 STS
task_list:
- assin2_sts
metric: pearson
few_shot: 15
limit: null
baseline: 0.0 #random baseline
human_baseline: null
expert_human_baseline: null
description: "Same dataset as above. Semantic Textual Similarity (STS)
‘measures the degree of semantic equivalence between two sentences’."
link: https://huggingface.co/datasets/eduagarcia/portuguese_benchmark
sources: ["https://sites.google.com/view/assin2/", "https://huggingface.co/datasets/assin2"]
faquad_nli:
benchmark: faquad_nli
col_name: FAQUAD NLI
task_list:
- faquad_nli
metric: f1_macro
few_shot: 15
limit: null
baseline: 45.6 #random baseline
human_baseline: null
expert_human_baseline: null
description: "FaQuAD is a Portuguese reading comprehension dataset that follows the format of the
Stanford Question Answering Dataset (SQuAD). The dataset aims to address the problem of
abundant questions sent by academics whose answers are found in available institutional
documents in the Brazilian higher education system. It consists of 900 questions about
249 reading passages taken from 18 official documents of a computer science college
from a Brazilian federal university and 21 Wikipedia articles related to the
Brazilian higher education system. FaQuAD-NLI is a modified version of the
FaQuAD dataset that repurposes the question answering task as a textual
entailment task between a question and its possible answers."
link: https://huggingface.co/datasets/ruanchaves/faquad-nli
sources: ["https://github.com/liafacom/faquad/"]
sparrow_pt:
benchmark: sparrow_pt
col_name: Sparrow POR
task_list:
- sparrow_emotion-2021-cortiz-por
- sparrow_hate-2019-fortuna-por
- sparrow_sentiment-2016-mozetic-por
- sparrow_sentiment-2018-brum-por
metric: f1_macro
few_shot: 25
limit: 500
baseline: 29.5 #random baseline [3.3, 48.8, 33.1, 33.0]
human_baseline: null
expert_human_baseline: null
description: "SPARROW is a multilingual evaluation benchmark for sociopragmatic meaning understanding.
SPARROW comprises 169 datasets encompassing 64 different languages;
this split evaluates only on the validation set of 4 datasets available for the Portuguese language.
One on hate speech detection by Fortuna et al. (2019), one on emotion detection by Cortiz et al. (2021)
and two on sentiment analysis by Mozetic et al. (2016) and Brum et al. (2018).
All were extracted and manually annotated from Twitter/X."
link: https://huggingface.co/datasets/UBC-NLP/sparrow
sources: ["https://sparrow.dlnlp.ai/", "https://aclanthology.org/W19-3510/", "https://arxiv.org/abs/2108.07638", "https://aclanthology.org/L18-1658/", "https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0155036"]