eduagarcia committed on
Commit
c14ac9f
1 Parent(s): 1dbfacb

add citations

Browse files
app.py CHANGED
@@ -516,7 +516,6 @@ with demo:
516
  )
517
  with gr.TabItem("⏳ Changelog", elem_id="llm-benchmark-tab-table", id=5):
518
  gr.Markdown(CHANGELOG_TEXT, elem_classes="markdown-text")
519
- """ #TODO: FIX CITATIONS
520
  with gr.Row():
521
  with gr.Accordion("📙 Citation", open=False):
522
  citation_button = gr.Textbox(
@@ -526,7 +525,6 @@ with demo:
526
  elem_id="citation-button",
527
  show_copy_button=True,
528
  )
529
- """
530
 
531
  def update_dynamic_files_wrapper():
532
  try:
 
516
  )
517
  with gr.TabItem("⏳ Changelog", elem_id="llm-benchmark-tab-table", id=5):
518
  gr.Markdown(CHANGELOG_TEXT, elem_classes="markdown-text")
 
519
  with gr.Row():
520
  with gr.Accordion("📙 Citation", open=False):
521
  citation_button = gr.Textbox(
 
525
  elem_id="citation-button",
526
  show_copy_button=True,
527
  )
 
528
 
529
  def update_dynamic_files_wrapper():
530
  try:
src/display/about.py CHANGED
@@ -199,97 +199,11 @@ If everything is done, check you can launch the EleutherAIHarness on your model
199
  """
200
 
201
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
202
- CITATION_BUTTON_TEXT = r"""
203
- @misc{open-llm-leaderboard,
204
- author = {Edward Beeching and Clémentine Fourrier and Nathan Habib and Sheon Han and Nathan Lambert and Nazneen Rajani and Omar Sanseviero and Lewis Tunstall and Thomas Wolf},
205
- title = {Open LLM Leaderboard},
206
- year = {2023},
207
- publisher = {Hugging Face},
208
- howpublished = "\url{https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard}"
209
- }
210
- @software{eval-harness,
211
- author = {Gao, Leo and
212
- Tow, Jonathan and
213
- Biderman, Stella and
214
- Black, Sid and
215
- DiPofi, Anthony and
216
- Foster, Charles and
217
- Golding, Laurence and
218
- Hsu, Jeffrey and
219
- McDonell, Kyle and
220
- Muennighoff, Niklas and
221
- Phang, Jason and
222
- Reynolds, Laria and
223
- Tang, Eric and
224
- Thite, Anish and
225
- Wang, Ben and
226
- Wang, Kevin and
227
- Zou, Andy},
228
- title = {A framework for few-shot language model evaluation},
229
- month = sep,
230
- year = 2021,
231
- publisher = {Zenodo},
232
- version = {v0.0.1},
233
- doi = {10.5281/zenodo.5371628},
234
- url = {https://doi.org/10.5281/zenodo.5371628}
235
- }
236
- @misc{clark2018think,
237
- title={Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge},
238
- author={Peter Clark and Isaac Cowhey and Oren Etzioni and Tushar Khot and Ashish Sabharwal and Carissa Schoenick and Oyvind Tafjord},
239
- year={2018},
240
- eprint={1803.05457},
241
- archivePrefix={arXiv},
242
- primaryClass={cs.AI}
243
- }
244
- @misc{zellers2019hellaswag,
245
- title={HellaSwag: Can a Machine Really Finish Your Sentence?},
246
- author={Rowan Zellers and Ari Holtzman and Yonatan Bisk and Ali Farhadi and Yejin Choi},
247
- year={2019},
248
- eprint={1905.07830},
249
- archivePrefix={arXiv},
250
- primaryClass={cs.CL}
251
- }
252
- @misc{hendrycks2021measuring,
253
- title={Measuring Massive Multitask Language Understanding},
254
- author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
255
- year={2021},
256
- eprint={2009.03300},
257
- archivePrefix={arXiv},
258
- primaryClass={cs.CY}
259
- }
260
- @misc{lin2022truthfulqa,
261
- title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},
262
- author={Stephanie Lin and Jacob Hilton and Owain Evans},
263
- year={2022},
264
- eprint={2109.07958},
265
- archivePrefix={arXiv},
266
- primaryClass={cs.CL}
267
- }
268
- @misc{DBLP:journals/corr/abs-1907-10641,
269
- title={{WINOGRANDE:} An Adversarial Winograd Schema Challenge at Scale},
270
- author={Keisuke Sakaguchi and Ronan Le Bras and Chandra Bhagavatula and Yejin Choi},
271
- year={2019},
272
- eprint={1907.10641},
273
- archivePrefix={arXiv},
274
- primaryClass={cs.CL}
275
- }
276
- @misc{DBLP:journals/corr/abs-2110-14168,
277
- title={Training Verifiers to Solve Math Word Problems},
278
- author={Karl Cobbe and
279
- Vineet Kosaraju and
280
- Mohammad Bavarian and
281
- Mark Chen and
282
- Heewoo Jun and
283
- Lukasz Kaiser and
284
- Matthias Plappert and
285
- Jerry Tworek and
286
- Jacob Hilton and
287
- Reiichiro Nakano and
288
- Christopher Hesse and
289
- John Schulman},
290
- year={2021},
291
- eprint={2110.14168},
292
- archivePrefix={arXiv},
293
- primaryClass={cs.CL}
294
- }
295
- """
 
199
  """
200
 
201
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
202
+
203
+ CITATION_BUTTON_TEXT = ""
204
+ if 'citation' in TASK_CONFIG['readme']:
205
+ CITATION_BUTTON_TEXT += TASK_CONFIG['readme']['citation'] + '\n'
206
+ for task in Tasks:
207
+ task = task.value
208
+ if task.citation is not None:
209
+ CITATION_BUTTON_TEXT += task.citation
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/display/utils.py CHANGED
@@ -27,6 +27,7 @@ class Task:
27
  description: str = None
28
  sources: List[str] = None
29
  baseline_sources: List[str] = None
 
30
 
31
  Tasks = Enum('Tasks', {k: Task(**v) for k, v in TASK_CONFIG['tasks'].items()})
32
 
 
27
  description: str = None
28
  sources: List[str] = None
29
  baseline_sources: List[str] = None
30
+ citation: str = None
31
 
32
  Tasks = Enum('Tasks', {k: Task(**v) for k, v in TASK_CONFIG['tasks'].items()})
33
 
tasks_config/pt_config.yaml CHANGED
@@ -43,6 +43,14 @@ readme:
43
  portuguese benchmarks.
44
 
45
  Add the results to your model card: [🧐 Open Portuguese LLM Leaderboard Results PR Opener](https://huggingface.co/spaces/eduagarcia-temp/portuguese-leaderboard-results-to-modelcard)
 
 
 
 
 
 
 
 
46
  tasks:
47
  enem_challenge:
48
  benchmark: enem_challenge
@@ -64,6 +72,31 @@ tasks:
64
  link: https://www.ime.usp.br/~ddm/project/enem/ENEM-GuidingTest.pdf
65
  sources: ["https://huggingface.co/datasets/eduagarcia/enem_challenge", "https://www.ime.usp.br/~ddm/project/enem/", "https://github.com/piresramon/gpt-4-enem", "https://huggingface.co/datasets/maritaca-ai/enem"]
66
  baseline_sources: ["https://www.sejalguem.com/enem", "https://vestibular.brasilescola.uol.com.br/enem/confira-as-medias-e-notas-maximas-e-minimas-do-enem-2020/349732.html"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  bluex:
68
  benchmark: bluex
69
  col_name: BLUEX
@@ -83,6 +116,15 @@ tasks:
83
  link: https://arxiv.org/abs/2307.05410
84
  sources: ["https://huggingface.co/datasets/eduagarcia-temp/BLUEX_without_images", "https://github.com/portuguese-benchmark-datasets/bluex", "https://huggingface.co/datasets/portuguese-benchmark-datasets/BLUEX"]
85
  baseline_sources: ["https://www.comvest.unicamp.br/wp-content/uploads/2023/08/Relatorio_F1_2023.pdf", "https://acervo.fuvest.br/fuvest/2018/FUVEST_2018_indice_discriminacao_1_fase_ins.pdf"]
 
 
 
 
 
 
 
 
 
86
  oab_exams:
87
  benchmark: oab_exams
88
  col_name: OAB Exams
@@ -106,6 +148,16 @@ tasks:
106
  link: https://arxiv.org/abs/1712.05128
107
  sources: ["https://huggingface.co/datasets/eduagarcia/oab_exams", "https://github.com/legal-nlp/oab-exams"]
108
  baseline_sources: ["http://fgvprojetos.fgv.br/publicacao/exame-de-ordem-em-numeros", "http://fgvprojetos.fgv.br/publicacao/exame-de-ordem-em-numeros-vol2", "http://fgvprojetos.fgv.br/publicacao/exame-de-ordem-em-numeros-vol3"]
 
 
 
 
 
 
 
 
 
 
109
  assin2_rte:
110
  benchmark: assin2_rte
111
  col_name: ASSIN2 RTE
@@ -125,6 +177,15 @@ tasks:
125
  other text (hypothesis)."
126
  link: https://dl.acm.org/doi/abs/10.1007/978-3-030-41505-1_39
127
  sources: ["https://huggingface.co/datasets/eduagarcia/portuguese_benchmark", "https://sites.google.com/view/assin2/", "https://huggingface.co/datasets/assin2"]
 
 
 
 
 
 
 
 
 
128
  assin2_sts:
129
  benchmark: assin2_sts
130
  col_name: ASSIN2 STS
@@ -162,6 +223,27 @@ tasks:
162
  entailment task between a question and its possible answers."
163
  link: https://ieeexplore.ieee.org/abstract/document/8923668
164
  sources: ["https://github.com/liafacom/faquad/", "https://huggingface.co/datasets/ruanchaves/faquad-nli"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
  hatebr_offensive:
166
  benchmark: hatebr_offensive
167
  col_name: HateBR
@@ -179,6 +261,22 @@ tasks:
179
  versus non-offensive comments)."
180
  link: https://arxiv.org/abs/2103.14972
181
  sources: ["https://huggingface.co/datasets/eduagarcia/portuguese_benchmark", "https://github.com/franciellevargas/HateBR", "https://huggingface.co/datasets/ruanchaves/hatebr"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
182
  portuguese_hate_speech:
183
  benchmark: portuguese_hate_speech
184
  col_name: PT Hate Speech
@@ -193,6 +291,21 @@ tasks:
193
  description: "Portuguese dataset for hate speech detection composed of 5,668 tweets with binary annotations (i.e. 'hate' vs. 'no-hate')"
194
  link: https://aclanthology.org/W19-3510/
195
  sources: ["https://huggingface.co/datasets/eduagarcia/portuguese_benchmark", "https://github.com/paulafortuna/Portuguese-Hate-Speech-Dataset", "https://huggingface.co/datasets/hate_speech_portuguese"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
196
  tweetsentbr:
197
  benchmark: tweetsentbr
198
  col_name: tweetSentBR
@@ -210,4 +323,17 @@ tasks:
210
  in one of the three following classes: Positive, Negative, Neutral."
211
  link: https://arxiv.org/abs/1712.08917
212
  sources: ["https://bitbucket.org/HBrum/tweetsentbr", "eduagarcia/tweetsentbr_fewshot"]
 
 
 
 
 
 
 
 
 
 
 
 
 
213
 
 
43
  portuguese benchmarks.
44
 
45
  Add the results to your model card: [🧐 Open Portuguese LLM Leaderboard Results PR Opener](https://huggingface.co/spaces/eduagarcia-temp/portuguese-leaderboard-results-to-modelcard)
46
+ citation: |
47
+ @misc{open-pt-llm-leaderboard,
48
+ author = {Garcia, Eduardo A. S.},
49
+ title = {Open Portuguese LLM Leaderboard},
50
+ year = {2024},
51
+ publisher = {Hugging Face},
52
+ howpublished = "\url{https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard}"
53
+ }
54
  tasks:
55
  enem_challenge:
56
  benchmark: enem_challenge
 
72
  link: https://www.ime.usp.br/~ddm/project/enem/ENEM-GuidingTest.pdf
73
  sources: ["https://huggingface.co/datasets/eduagarcia/enem_challenge", "https://www.ime.usp.br/~ddm/project/enem/", "https://github.com/piresramon/gpt-4-enem", "https://huggingface.co/datasets/maritaca-ai/enem"]
74
  baseline_sources: ["https://www.sejalguem.com/enem", "https://vestibular.brasilescola.uol.com.br/enem/confira-as-medias-e-notas-maximas-e-minimas-do-enem-2020/349732.html"]
75
+ citation: |
76
+ @InProceedings{ENEM-Challenge,
77
+ author = {Silveira, Igor Cataneo and Mau\'a, Denis Deratani},
78
+ booktitle = {Proceedings of the 6th Brazilian Conference on Intelligent Systems},
79
+ series = {BRACIS},
80
+ title = {University Entrance Exam as a Guiding Test for Artificial Intelligence},
81
+ pages = {426--431},
82
+ year = {2017}
83
+ }
84
+ @misc{nunes2023evaluating,
85
+ title={Evaluating GPT-3.5 and GPT-4 Models on Brazilian University Admission Exams},
86
+ author={Desnes Nunes and Ricardo Primi and Ramon Pires and Roberto Lotufo and Rodrigo Nogueira},
87
+ year={2023},
88
+ eprint={2303.17003},
89
+ archivePrefix={arXiv},
90
+ primaryClass={cs.CL}
91
+ }
92
+ @misc{pires2023evaluating,
93
+ title={Evaluating GPT-4's Vision Capabilities on Brazilian University Admission Exams},
94
+ author={Ramon Pires and Thales Sales Almeida and Hugo Abonizio and Rodrigo Nogueira},
95
+ year={2023},
96
+ eprint={2311.14169},
97
+ archivePrefix={arXiv},
98
+ primaryClass={cs.CL}
99
+ }
100
  bluex:
101
  benchmark: bluex
102
  col_name: BLUEX
 
116
  link: https://arxiv.org/abs/2307.05410
117
  sources: ["https://huggingface.co/datasets/eduagarcia-temp/BLUEX_without_images", "https://github.com/portuguese-benchmark-datasets/bluex", "https://huggingface.co/datasets/portuguese-benchmark-datasets/BLUEX"]
118
  baseline_sources: ["https://www.comvest.unicamp.br/wp-content/uploads/2023/08/Relatorio_F1_2023.pdf", "https://acervo.fuvest.br/fuvest/2018/FUVEST_2018_indice_discriminacao_1_fase_ins.pdf"]
119
+ citation: |
120
+ @misc{almeida2023bluex,
121
+ title={BLUEX: A benchmark based on Brazilian Leading Universities Entrance eXams},
122
+ author={Thales Sales Almeida and Thiago Laitz and Giovana K. Bonás and Rodrigo Nogueira},
123
+ year={2023},
124
+ eprint={2307.05410},
125
+ archivePrefix={arXiv},
126
+ primaryClass={cs.CL}
127
+ }
128
  oab_exams:
129
  benchmark: oab_exams
130
  col_name: OAB Exams
 
148
  link: https://arxiv.org/abs/1712.05128
149
  sources: ["https://huggingface.co/datasets/eduagarcia/oab_exams", "https://github.com/legal-nlp/oab-exams"]
150
  baseline_sources: ["http://fgvprojetos.fgv.br/publicacao/exame-de-ordem-em-numeros", "http://fgvprojetos.fgv.br/publicacao/exame-de-ordem-em-numeros-vol2", "http://fgvprojetos.fgv.br/publicacao/exame-de-ordem-em-numeros-vol3"]
151
+ citation: |
152
+ @inproceedings{d2017passing,
153
+ title={Passing the Brazilian OAB Exam: Data Preparation and Some Experiments},
154
+ author={Rademaker, Alexandre},
155
+ booktitle={Legal Knowledge and Information Systems: JURIX 2017: The Thirtieth Annual Conference},
156
+ volume={302},
157
+ pages={89},
158
+ year={2017},
159
+ organization={IOS Press}
160
+ }
161
  assin2_rte:
162
  benchmark: assin2_rte
163
  col_name: ASSIN2 RTE
 
177
  other text (hypothesis)."
178
  link: https://dl.acm.org/doi/abs/10.1007/978-3-030-41505-1_39
179
  sources: ["https://huggingface.co/datasets/eduagarcia/portuguese_benchmark", "https://sites.google.com/view/assin2/", "https://huggingface.co/datasets/assin2"]
180
+ citation: |
181
+ @inproceedings{real2020assin,
182
+ title={The assin 2 shared task: a quick overview},
183
+ author={Real, Livy and Fonseca, Erick and Oliveira, Hugo Goncalo},
184
+ booktitle={International Conference on Computational Processing of the Portuguese Language},
185
+ pages={406--412},
186
+ year={2020},
187
+ organization={Springer}
188
+ }
189
  assin2_sts:
190
  benchmark: assin2_sts
191
  col_name: ASSIN2 STS
 
223
  entailment task between a question and its possible answers."
224
  link: https://ieeexplore.ieee.org/abstract/document/8923668
225
  sources: ["https://github.com/liafacom/faquad/", "https://huggingface.co/datasets/ruanchaves/faquad-nli"]
226
+ citation: |
227
+ @inproceedings{8923668,
228
+ author={Sayama, Hélio Fonseca and Araujo, Anderson Viçoso and Fernandes, Eraldo Rezende},
229
+ booktitle={2019 8th Brazilian Conference on Intelligent Systems (BRACIS)},
230
+ title={FaQuAD: Reading Comprehension Dataset in the Domain of Brazilian Higher Education},
231
+ year={2019},
232
+ volume={},
233
+ number={},
234
+ pages={443-448},
235
+ keywords={Training;Context modeling;Encyclopedias;Electronic publishing;Internet;Natural Language Processing;Machine Reading Comprehension;Dataset},
236
+ doi={10.1109/BRACIS.2019.00084}
237
+ }
238
+ @software{Chaves_Rodrigues_napolab_2023,
239
+ author = {Chaves Rodrigues, Ruan and Tanti, Marc and Agerri, Rodrigo},
240
+ doi = {10.5281/zenodo.7781848},
241
+ month = {3},
242
+ title = {{Natural Portuguese Language Benchmark (Napolab)}},
243
+ url = {https://github.com/ruanchaves/napolab},
244
+ version = {1.0.0},
245
+ year = {2023}
246
+ }
247
  hatebr_offensive:
248
  benchmark: hatebr_offensive
249
  col_name: HateBR
 
261
  versus non-offensive comments)."
262
  link: https://arxiv.org/abs/2103.14972
263
  sources: ["https://huggingface.co/datasets/eduagarcia/portuguese_benchmark", "https://github.com/franciellevargas/HateBR", "https://huggingface.co/datasets/ruanchaves/hatebr"]
264
+ citation: |
265
+ @inproceedings{vargas-etal-2022-hatebr,
266
+ title = "{H}ate{BR}: A Large Expert Annotated Corpus of {B}razilian {I}nstagram Comments for Offensive Language and Hate Speech Detection",
267
+ author = "Vargas, Francielle and
268
+ Carvalho, Isabelle and
269
+ Rodrigues de G{\'o}es, Fabiana and
270
+ Pardo, Thiago and
271
+ Benevenuto, Fabr{\'\i}cio",
272
+ booktitle = "Proceedings of the Thirteenth Language Resources and Evaluation Conference",
273
+ month = jun,
274
+ year = "2022",
275
+ address = "Marseille, France",
276
+ publisher = "European Language Resources Association",
277
+ url = "https://aclanthology.org/2022.lrec-1.777",
278
+ pages = "7174--7183"
279
+ }
280
  portuguese_hate_speech:
281
  benchmark: portuguese_hate_speech
282
  col_name: PT Hate Speech
 
291
  description: "Portuguese dataset for hate speech detection composed of 5,668 tweets with binary annotations (i.e. 'hate' vs. 'no-hate')"
292
  link: https://aclanthology.org/W19-3510/
293
  sources: ["https://huggingface.co/datasets/eduagarcia/portuguese_benchmark", "https://github.com/paulafortuna/Portuguese-Hate-Speech-Dataset", "https://huggingface.co/datasets/hate_speech_portuguese"]
294
+ citation: |
295
+ @inproceedings{fortuna-etal-2019-hierarchically,
296
+ title = "A Hierarchically-Labeled {P}ortuguese Hate Speech Dataset",
297
+ author = "Fortuna, Paula and
298
+ Rocha da Silva, Jo{\~a}o and
299
+ Soler-Company, Juan and
300
+ Wanner, Leo and
301
+ Nunes, S{\'e}rgio",
302
+ booktitle = "Proceedings of the 3rd Workshop on Abusive Language Online (ALW3)",
303
+ year = "2019",
304
+ publisher = "Association for Computational Linguistics",
305
+ url = "https://aclanthology.org/W19-3510",
306
+ doi = "10.18653/v1/W19-3510",
307
+ pages = "94--104",
308
+ }
309
  tweetsentbr:
310
  benchmark: tweetsentbr
311
  col_name: tweetSentBR
 
323
  in one of the three following classes: Positive, Negative, Neutral."
324
  link: https://arxiv.org/abs/1712.08917
325
  sources: ["https://bitbucket.org/HBrum/tweetsentbr", "eduagarcia/tweetsentbr_fewshot"]
326
+ citation: |
327
+ @InProceedings{BRUM18.389,
328
+ author = {Henrico Brum and Maria das Gra\c{c}as Volpe Nunes},
329
+ title = "{Building a Sentiment Corpus of Tweets in Brazilian Portuguese}",
330
+ booktitle = {Proceedings of the Eleventh International Conference on Language Resources and Evaluation (LREC 2018)},
331
+ year = {2018},
332
+ month = {May 7-12, 2018},
333
+ address = {Miyazaki, Japan},
334
+ editor = {Nicoletta Calzolari (Conference chair) and Khalid Choukri and Christopher Cieri and Thierry Declerck and Sara Goggi and Koiti Hasida and Hitoshi Isahara and Bente Maegaard and Joseph Mariani and Hélène Mazo and Asuncion Moreno and Jan Odijk and Stelios Piperidis and Takenobu Tokunaga},
335
+ publisher = {European Language Resources Association (ELRA)},
336
+ isbn = {979-10-95546-00-9},
337
+ language = {english}
338
+ }
339