version: 1.0.0
config:
REPO_ID: "eduagarcia/open_pt_llm_leaderboard"
QUEUE_REPO: eduagarcia-temp/llm_pt_leaderboard_requests
RESULTS_REPO: eduagarcia-temp/llm_pt_leaderboard_results
RAW_RESULTS_REPO: eduagarcia-temp/llm_pt_leaderboard_raw_results
DYNAMIC_INFO_REPO: "eduagarcia-temp/llm_pt_leaderboard_model_info"
PATH_TO_COLLECTION: "eduagarcia/portuguese-llm-leaderboard-best-models-65c152c13ab3c67bc4f203a6"
IS_PUBLIC: true
LEADERBOARD_NAME: "Open PT-LLM Leaderboard"
GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS: true
TRUST_REMOTE_CODE: true
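# A minimal sketch of how a harness might load this file (hypothetical Python, not part of
# this repo; the filename "tasks_config.yaml" is an assumption):
#   import yaml
#   with open("tasks_config.yaml") as f:
#       cfg = yaml.safe_load(f)
#   print(cfg["config"]["REPO_ID"])  # -> eduagarcia/open_pt_llm_leaderboard
#   for name, task in cfg["tasks"].items():
#       print(name, task["metric"], task["few_shot"])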
tasks:
enem_challenge:
benchmark: enem_challenge
col_name: ENEM
task_list:
- enem_challenge
metric: acc
few_shot: 3
limit: null
baseline: 20.0 #random baseline
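# worked check of the random baseline (ENEM questions offer 5 alternatives, A-E):
# expected accuracy of uniform guessing = 100 / 5 = 20.0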
#https://www.sejalguem.com/enem
#https://vestibular.brasilescola.uol.com.br/enem/confira-as-medias-e-notas-maximas-e-minimas-do-enem-2020/349732.html
human_baseline: 35.0 # ~60 / 180 correct answers - score ~500
expert_human_baseline: 70.0 # ~124 / 180 correct answers - score ~700
description: "The Exame Nacional do Ensino Médio (ENEM) is an advanced High-School
level exam widely applied every year by the Brazilian government to students that
wish to undertake a University degree. This dataset contains 1,430 questions that don't require
image understanding of the exams from 2010 to 2018, 2022 and 2023."
link: https://huggingface.co/datasets/eduagarcia/enem_challenge
sources: ["https://www.ime.usp.br/~ddm/project/enem/", "https://github.com/piresramon/gpt-4-enem", "https://huggingface.co/datasets/maritaca-ai/enem"]
baseline_sources: ["https://www.sejalguem.com/enem", "https://vestibular.brasilescola.uol.com.br/enem/confira-as-medias-e-notas-maximas-e-minimas-do-enem-2020/349732.html"]
bluex:
benchmark: bluex
col_name: BLUEX
task_list:
- bluex
metric: acc
few_shot: 3
limit: null
baseline: 22.5 #random baseline
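# plausible reading of the random baseline (assumption: Fuvest questions offer 5
# alternatives and Comvest 4, mixed roughly evenly): (100/5 + 100/4) / 2 = 22.5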
#https://www.comvest.unicamp.br/wp-content/uploads/2023/08/Relatorio_F1_2023.pdf 56% mean - 88% @ top-.99
#https://acervo.fuvest.br/fuvest/2018/FUVEST_2018_indice_discriminacao_1_fase_ins.pdf 43.4% mean - ~77% @ top-.99
human_baseline: 50.0
expert_human_baseline: 82.5
description: "BLUEX is a multimodal dataset consisting of the two leading
university entrance exams conducted in Brazil: Convest (Unicamp) and Fuvest (USP),
spanning from 2018 to 2024. The benchmark comprises of 724 questions that do not have accompanying images"
link: https://huggingface.co/datasets/eduagarcia-temp/BLUEX_without_images
sources: ["https://github.com/portuguese-benchmark-datasets/bluex", "https://huggingface.co/datasets/portuguese-benchmark-datasets/BLUEX"]
baseline_sources: ["https://www.comvest.unicamp.br/wp-content/uploads/2023/08/Relatorio_F1_2023.pdf", "https://acervo.fuvest.br/fuvest/2018/FUVEST_2018_indice_discriminacao_1_fase_ins.pdf"]
oab_exams:
benchmark: oab_exams
col_name: OAB Exams
task_list:
- oab_exams
metric: acc
few_shot: 3
limit: null
baseline: 25.0 #random baseline
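# worked check of the random baseline (OAB questions offer 4 alternatives, A-D):
# expected accuracy of uniform guessing = 100 / 4 = 25.0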
#https://fgvprojetos.fgv.br/publicacao/exame-de-ordem-em-numeros # 46% mean
# http://fgvprojetos.fgv.br/publicacao/exame-de-ordem-em-numeros-vol3
# scored 70%+ : 17214 / 638500 candidates = top 2.5% (97.5th percentile)
# deviation at the 97.5th percentile -> 70.0% - 46% = 24
# z-score at the 97.5th percentile ~ 1.96
# estimated standard deviation -> 24 / 1.96 ~ 12.2
# 99th percentile = 46 + 2.33*12.2 = ~75.0
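# a quick numeric check of the estimate above (hypothetical Python snippet, assumes scipy
# is available):
#   from scipy.stats import norm
#   std = (70.0 - 46.0) / norm.ppf(0.975)   # ~12.24
#   expert = 46.0 + norm.ppf(0.99) * std    # ~74.5, i.e. ~75.0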
human_baseline: 46.0
expert_human_baseline: 75.0
description: OAB Exams is a dataset of more than 2,000 questions from the Brazilian
Bar Association's professional admission exams, applied from 2010 to 2018.
link: https://huggingface.co/datasets/eduagarcia/oab_exams
sources: ["https://github.com/legal-nlp/oab-exams"]
baseline_sources: ["http://fgvprojetos.fgv.br/publicacao/exame-de-ordem-em-numeros", "http://fgvprojetos.fgv.br/publicacao/exame-de-ordem-em-numeros-vol2", "http://fgvprojetos.fgv.br/publicacao/exame-de-ordem-em-numeros-vol3"]
assin2_rte:
benchmark: assin2_rte
col_name: ASSIN2 RTE
task_list:
- assin2_rte
metric: f1_macro
few_shot: 15
limit: null
baseline: 50.0 #random baseline
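# worked check of the random baseline (RTE here is a binary entailment/none task, and
# ASSIN 2 was built with balanced classes): a uniform random guesser gets
# precision = recall = 0.5 for each class, so f1_macro = 50.0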
human_baseline: null
expert_human_baseline: null
description: "ASSIN 2 (Avaliação de Similaridade Semântica e Inferência Textual -
Evaluating Semantic Similarity and Textual Entailment) is the second edition of ASSIN,
an evaluation shared task in the scope of the computational processing
of Portuguese. Recognising Textual Entailment (RTE), also called Natural Language
Inference (NLI), is the task of predicting whether a given text (premise) entails (implies)
another text (hypothesis)."
link: https://huggingface.co/datasets/eduagarcia/portuguese_benchmark
sources: ["https://sites.google.com/view/assin2/", "https://huggingface.co/datasets/assin2"]
assin2_sts:
benchmark: assin2_sts
col_name: ASSIN2 STS
task_list:
- assin2_sts
metric: pearson
few_shot: 15
limit: null
baseline: 0.0 #random baseline
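# worked check of the random baseline: scores predicted independently of the gold
# similarity scores have an expected Pearson correlation of 0.0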
human_baseline: null
expert_human_baseline: null
description: "Same as dataset as above. Semantic Textual Similarity (STS)
‘measures the degree of semantic equivalence between two sentences’."
link: https://huggingface.co/datasets/eduagarcia/portuguese_benchmark
sources: ["https://sites.google.com/view/assin2/", "https://huggingface.co/datasets/assin2"]
faquad_nli:
benchmark: faquad_nli
col_name: FAQUAD NLI
task_list:
- faquad_nli
metric: f1_macro
few_shot: 15
limit: null
baseline: 45.6 #random baseline
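# plausible reading (assumption; the label distribution is not restated here): the
# f1_macro of a random guesser falls below 50.0 when the two classes are imbalanced,
# hence 45.6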
human_baseline: null
expert_human_baseline: null
description: "FaQuAD is a Portuguese reading comprehension dataset that follows the format of the
Stanford Question Answering Dataset (SQuAD). The dataset aims to address the problem of
abundant questions sent by academics whose answers are found in available institutional
documents in the Brazilian higher education system. It consists of 900 questions about
249 reading passages taken from 18 official documents of a computer science college
from a Brazilian federal university and 21 Wikipedia articles related to the
Brazilian higher education system. FaQuAD-NLI is a modified version of the
FaQuAD dataset that repurposes the question answering task as a textual
entailment task between a question and its possible answers."
link: https://huggingface.co/datasets/ruanchaves/faquad-nli
sources: ["https://github.com/liafacom/faquad/"]
sparrow_pt:
benchmark: sparrow_pt
col_name: Sparrow POR
task_list:
- sparrow_emotion-2021-cortiz-por
- sparrow_hate-2019-fortuna-por
- sparrow_sentiment-2016-mozetic-por
- sparrow_sentiment-2018-brum-por
metric: f1_macro
few_shot: 15
limit: 500
baseline: 29.5 #random baseline [3.3, 48.8, 33.1, 33.0]
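# worked check: mean of the four per-dataset random baselines listed above,
# (3.3 + 48.8 + 33.1 + 33.0) / 4 = 29.55 ~ 29.5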
human_baseline: null
expert_human_baseline: null
description: "SPARROW is a multilingual evaluation benchmark for sociopragmatic meaning understanding.
SPARROW comprises 169 datasets encompassing 64 different languages,
this split evaluates only on the validation set of 4 datasets avaliable for the Portuguese language.
One on hate speech detection by Fortuna et al. (2019), one on emotion detection by Cortiz et al. (2021)
and two on sentiment analysis by Mozetic et al. (2016) and Brum et al. (2018).
All were extracted and manually annotated from Twitter/X."
link: https://huggingface.co/datasets/UBC-NLP/sparrow
sources: ["https://sparrow.dlnlp.ai/", "https://aclanthology.org/W19-3510/", "https://arxiv.org/abs/2108.07638", "https://aclanthology.org/L18-1658/", "https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0155036"]