eduagarcia committed on
Commit
c14ac9f
1 Parent(s): 1dbfacb

add citations

Browse files
app.py CHANGED
@@ -516,7 +516,6 @@ with demo:
516
  )
517
  with gr.TabItem("⏳ Changelog", elem_id="llm-benchmark-tab-table", id=5):
518
  gr.Markdown(CHANGELOG_TEXT, elem_classes="markdown-text")
519
- """ #TODO: FIX CITATIONS
520
  with gr.Row():
521
  with gr.Accordion("📙 Citation", open=False):
522
  citation_button = gr.Textbox(
@@ -526,7 +525,6 @@ with demo:
526
  elem_id="citation-button",
527
  show_copy_button=True,
528
  )
529
- """
530
 
531
  def update_dynamic_files_wrapper():
532
  try:
 
516
  )
517
  with gr.TabItem("⏳ Changelog", elem_id="llm-benchmark-tab-table", id=5):
518
  gr.Markdown(CHANGELOG_TEXT, elem_classes="markdown-text")
 
519
  with gr.Row():
520
  with gr.Accordion("📙 Citation", open=False):
521
  citation_button = gr.Textbox(
 
525
  elem_id="citation-button",
526
  show_copy_button=True,
527
  )
 
528
 
529
  def update_dynamic_files_wrapper():
530
  try:
src/display/about.py CHANGED
@@ -199,97 +199,11 @@ If everything is done, check you can launch the EleutherAIHarness on your model
199
  """
200
 
201
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
202
- CITATION_BUTTON_TEXT = r"""
203
- @misc{open-llm-leaderboard,
204
- author = {Edward Beeching and Clémentine Fourrier and Nathan Habib and Sheon Han and Nathan Lambert and Nazneen Rajani and Omar Sanseviero and Lewis Tunstall and Thomas Wolf},
205
- title = {Open LLM Leaderboard},
206
- year = {2023},
207
- publisher = {Hugging Face},
208
- howpublished = "\url{https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard}"
209
- }
210
- @software{eval-harness,
211
- author = {Gao, Leo and
212
- Tow, Jonathan and
213
- Biderman, Stella and
214
- Black, Sid and
215
- DiPofi, Anthony and
216
- Foster, Charles and
217
- Golding, Laurence and
218
- Hsu, Jeffrey and
219
- McDonell, Kyle and
220
- Muennighoff, Niklas and
221
- Phang, Jason and
222
- Reynolds, Laria and
223
- Tang, Eric and
224
- Thite, Anish and
225
- Wang, Ben and
226
- Wang, Kevin and
227
- Zou, Andy},
228
- title = {A framework for few-shot language model evaluation},
229
- month = sep,
230
- year = 2021,
231
- publisher = {Zenodo},
232
- version = {v0.0.1},
233
- doi = {10.5281/zenodo.5371628},
234
- url = {https://doi.org/10.5281/zenodo.5371628}
235
- }
236
- @misc{clark2018think,
237
- title={Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge},
238
- author={Peter Clark and Isaac Cowhey and Oren Etzioni and Tushar Khot and Ashish Sabharwal and Carissa Schoenick and Oyvind Tafjord},
239
- year={2018},
240
- eprint={1803.05457},
241
- archivePrefix={arXiv},
242
- primaryClass={cs.AI}
243
- }
244
- @misc{zellers2019hellaswag,
245
- title={HellaSwag: Can a Machine Really Finish Your Sentence?},
246
- author={Rowan Zellers and Ari Holtzman and Yonatan Bisk and Ali Farhadi and Yejin Choi},
247
- year={2019},
248
- eprint={1905.07830},
249
- archivePrefix={arXiv},
250
- primaryClass={cs.CL}
251
- }
252
- @misc{hendrycks2021measuring,
253
- title={Measuring Massive Multitask Language Understanding},
254
- author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
255
- year={2021},
256
- eprint={2009.03300},
257
- archivePrefix={arXiv},
258
- primaryClass={cs.CY}
259
- }
260
- @misc{lin2022truthfulqa,
261
- title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},
262
- author={Stephanie Lin and Jacob Hilton and Owain Evans},
263
- year={2022},
264
- eprint={2109.07958},
265
- archivePrefix={arXiv},
266
- primaryClass={cs.CL}
267
- }
268
- @misc{DBLP:journals/corr/abs-1907-10641,
269
- title={{WINOGRANDE:} An Adversarial Winograd Schema Challenge at Scale},
270
- author={Keisuke Sakaguchi and Ronan Le Bras and Chandra Bhagavatula and Yejin Choi},
271
- year={2019},
272
- eprint={1907.10641},
273
- archivePrefix={arXiv},
274
- primaryClass={cs.CL}
275
- }
276
- @misc{DBLP:journals/corr/abs-2110-14168,
277
- title={Training Verifiers to Solve Math Word Problems},
278
- author={Karl Cobbe and
279
- Vineet Kosaraju and
280
- Mohammad Bavarian and
281
- Mark Chen and
282
- Heewoo Jun and
283
- Lukasz Kaiser and
284
- Matthias Plappert and
285
- Jerry Tworek and
286
- Jacob Hilton and
287
- Reiichiro Nakano and
288
- Christopher Hesse and
289
- John Schulman},
290
- year={2021},
291
- eprint={2110.14168},
292
- archivePrefix={arXiv},
293
- primaryClass={cs.CL}
294
- }
295
- """
 
199
  """
200
 
201
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
202
+
203
+ CITATION_BUTTON_TEXT = ""
204
+ if 'citation' in TASK_CONFIG['readme']:
205
+ CITATION_BUTTON_TEXT += TASK_CONFIG['readme']['citation'] + '\n'
206
+ for task in Tasks:
207
+ task = task.value
208
+ if task.citation is not None:
209
+ CITATION_BUTTON_TEXT += task.citation
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/display/utils.py CHANGED
@@ -27,6 +27,7 @@ class Task:
27
  description: str = None
28
  sources: List[str] = None
29
  baseline_sources: List[str] = None
 
30
 
31
  Tasks = Enum('Tasks', {k: Task(**v) for k, v in TASK_CONFIG['tasks'].items()})
32
 
 
27
  description: str = None
28
  sources: List[str] = None
29
  baseline_sources: List[str] = None
30
+ citation: str = None
31
 
32
  Tasks = Enum('Tasks', {k: Task(**v) for k, v in TASK_CONFIG['tasks'].items()})
33
 
tasks_config/pt_config.yaml CHANGED
@@ -43,6 +43,14 @@ readme:
43
  portuguese benchmarks.
44
 
45
  Add the results to your model card: [🧐 Open Portuguese LLM Leaderboard Results PR Opener](https://huggingface.co/spaces/eduagarcia-temp/portuguese-leaderboard-results-to-modelcard)
 
 
 
 
 
 
 
 
46
  tasks:
47
  enem_challenge:
48
  benchmark: enem_challenge
@@ -64,6 +72,31 @@ tasks:
64
  link: https://www.ime.usp.br/~ddm/project/enem/ENEM-GuidingTest.pdf
65
  sources: ["https://huggingface.co/datasets/eduagarcia/enem_challenge", "https://www.ime.usp.br/~ddm/project/enem/", "https://github.com/piresramon/gpt-4-enem", "https://huggingface.co/datasets/maritaca-ai/enem"]
66
  baseline_sources: ["https://www.sejalguem.com/enem", "https://vestibular.brasilescola.uol.com.br/enem/confira-as-medias-e-notas-maximas-e-minimas-do-enem-2020/349732.html"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  bluex:
68
  benchmark: bluex
69
  col_name: BLUEX
@@ -83,6 +116,15 @@ tasks:
83
  link: https://arxiv.org/abs/2307.05410
84
  sources: ["https://huggingface.co/datasets/eduagarcia-temp/BLUEX_without_images", "https://github.com/portuguese-benchmark-datasets/bluex", "https://huggingface.co/datasets/portuguese-benchmark-datasets/BLUEX"]
85
  baseline_sources: ["https://www.comvest.unicamp.br/wp-content/uploads/2023/08/Relatorio_F1_2023.pdf", "https://acervo.fuvest.br/fuvest/2018/FUVEST_2018_indice_discriminacao_1_fase_ins.pdf"]
 
 
 
 
 
 
 
 
 
86
  oab_exams:
87
  benchmark: oab_exams
88
  col_name: OAB Exams
@@ -106,6 +148,16 @@ tasks:
106
  link: https://arxiv.org/abs/1712.05128
107
  sources: ["https://huggingface.co/datasets/eduagarcia/oab_exams", "https://github.com/legal-nlp/oab-exams"]
108
  baseline_sources: ["http://fgvprojetos.fgv.br/publicacao/exame-de-ordem-em-numeros", "http://fgvprojetos.fgv.br/publicacao/exame-de-ordem-em-numeros-vol2", "http://fgvprojetos.fgv.br/publicacao/exame-de-ordem-em-numeros-vol3"]
 
 
 
 
 
 
 
 
 
 
109
  assin2_rte:
110
  benchmark: assin2_rte
111
  col_name: ASSIN2 RTE
@@ -125,6 +177,15 @@ tasks:
125
  other text (hypothesis)."
126
  link: https://dl.acm.org/doi/abs/10.1007/978-3-030-41505-1_39
127
  sources: ["https://huggingface.co/datasets/eduagarcia/portuguese_benchmark", "https://sites.google.com/view/assin2/", "https://huggingface.co/datasets/assin2"]
 
 
 
 
 
 
 
 
 
128
  assin2_sts:
129
  benchmark: assin2_sts
130
  col_name: ASSIN2 STS
@@ -162,6 +223,27 @@ tasks:
162
  entailment task between a question and its possible answers."
163
  link: https://ieeexplore.ieee.org/abstract/document/8923668
164
  sources: ["https://github.com/liafacom/faquad/", "https://huggingface.co/datasets/ruanchaves/faquad-nli"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
  hatebr_offensive:
166
  benchmark: hatebr_offensive
167
  col_name: HateBR
@@ -179,6 +261,22 @@ tasks:
179
  versus non-offensive comments)."
180
  link: https://arxiv.org/abs/2103.14972
181
  sources: ["https://huggingface.co/datasets/eduagarcia/portuguese_benchmark", "https://github.com/franciellevargas/HateBR", "https://huggingface.co/datasets/ruanchaves/hatebr"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
182
  portuguese_hate_speech:
183
  benchmark: portuguese_hate_speech
184
  col_name: PT Hate Speech
@@ -193,6 +291,21 @@ tasks:
193
  description: "Portuguese dataset for hate speech detection composed of 5,668 tweets with binary annotations (i.e. 'hate' vs. 'no-hate')"
194
  link: https://aclanthology.org/W19-3510/
195
  sources: ["https://huggingface.co/datasets/eduagarcia/portuguese_benchmark", "https://github.com/paulafortuna/Portuguese-Hate-Speech-Dataset", "https://huggingface.co/datasets/hate_speech_portuguese"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
196
  tweetsentbr:
197
  benchmark: tweetsentbr
198
  col_name: tweetSentBR
@@ -210,4 +323,17 @@ tasks:
210
  in one of the three following classes: Positive, Negative, Neutral."
211
  link: https://arxiv.org/abs/1712.08917
212
  sources: ["https://bitbucket.org/HBrum/tweetsentbr", "eduagarcia/tweetsentbr_fewshot"]
 
 
 
 
 
 
 
 
 
 
 
 
 
213
 
 
43
  portuguese benchmarks.
44
 
45
  Add the results to your model card: [🧐 Open Portuguese LLM Leaderboard Results PR Opener](https://huggingface.co/spaces/eduagarcia-temp/portuguese-leaderboard-results-to-modelcard)
46
+ citation: |
47
+ @misc{open-pt-llm-leaderboard,
48
+ author = {Garcia, Eduardo A. S.},
49
+ title = {Open Portuguese LLM Leaderboard},
50
+ year = {2024},
51
+ publisher = {Hugging Face},
52
+ howpublished = "\url{https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard}"
53
+ }
54
  tasks:
55
  enem_challenge:
56
  benchmark: enem_challenge
 
72
  link: https://www.ime.usp.br/~ddm/project/enem/ENEM-GuidingTest.pdf
73
  sources: ["https://huggingface.co/datasets/eduagarcia/enem_challenge", "https://www.ime.usp.br/~ddm/project/enem/", "https://github.com/piresramon/gpt-4-enem", "https://huggingface.co/datasets/maritaca-ai/enem"]
74
  baseline_sources: ["https://www.sejalguem.com/enem", "https://vestibular.brasilescola.uol.com.br/enem/confira-as-medias-e-notas-maximas-e-minimas-do-enem-2020/349732.html"]
75
+ citation: |
76
+ @InProceedings{ENEM-Challenge,
77
+ author = {Silveira, Igor Cataneo and Mau\'a, Denis Deratani},
78
+ booktitle = {Proceedings of the 6th Brazilian Conference on Intelligent Systems},
79
+ series = {BRACIS},
80
+ title = {University Entrance Exam as a Guiding Test for Artificial Intelligence},
81
+ pages = {426--431},
82
+ year = {2017}
83
+ }
84
+ @misc{nunes2023evaluating,
85
+ title={Evaluating GPT-3.5 and GPT-4 Models on Brazilian University Admission Exams},
86
+ author={Desnes Nunes and Ricardo Primi and Ramon Pires and Roberto Lotufo and Rodrigo Nogueira},
87
+ year={2023},
88
+ eprint={2303.17003},
89
+ archivePrefix={arXiv},
90
+ primaryClass={cs.CL}
91
+ }
92
+ @misc{pires2023evaluating,
93
+ title={Evaluating GPT-4's Vision Capabilities on Brazilian University Admission Exams},
94
+ author={Ramon Pires and Thales Sales Almeida and Hugo Abonizio and Rodrigo Nogueira},
95
+ year={2023},
96
+ eprint={2311.14169},
97
+ archivePrefix={arXiv},
98
+ primaryClass={cs.CL}
99
+ }
100
  bluex:
101
  benchmark: bluex
102
  col_name: BLUEX
 
116
  link: https://arxiv.org/abs/2307.05410
117
  sources: ["https://huggingface.co/datasets/eduagarcia-temp/BLUEX_without_images", "https://github.com/portuguese-benchmark-datasets/bluex", "https://huggingface.co/datasets/portuguese-benchmark-datasets/BLUEX"]
118
  baseline_sources: ["https://www.comvest.unicamp.br/wp-content/uploads/2023/08/Relatorio_F1_2023.pdf", "https://acervo.fuvest.br/fuvest/2018/FUVEST_2018_indice_discriminacao_1_fase_ins.pdf"]
119
+ citation: |
120
+ @misc{almeida2023bluex,
121
+ title={BLUEX: A benchmark based on Brazilian Leading Universities Entrance eXams},
122
+ author={Thales Sales Almeida and Thiago Laitz and Giovana K. Bonás and Rodrigo Nogueira},
123
+ year={2023},
124
+ eprint={2307.05410},
125
+ archivePrefix={arXiv},
126
+ primaryClass={cs.CL}
127
+ }
128
  oab_exams:
129
  benchmark: oab_exams
130
  col_name: OAB Exams
 
148
  link: https://arxiv.org/abs/1712.05128
149
  sources: ["https://huggingface.co/datasets/eduagarcia/oab_exams", "https://github.com/legal-nlp/oab-exams"]
150
  baseline_sources: ["http://fgvprojetos.fgv.br/publicacao/exame-de-ordem-em-numeros", "http://fgvprojetos.fgv.br/publicacao/exame-de-ordem-em-numeros-vol2", "http://fgvprojetos.fgv.br/publicacao/exame-de-ordem-em-numeros-vol3"]
151
+ citation: |
152
+ @inproceedings{d2017passing,
153
+ title={Passing the Brazilian OAB Exam: Data Preparation and Some Experiments},
154
+ author={Rademaker, Alexandre},
155
+ booktitle={Legal Knowledge and Information Systems: JURIX 2017: The Thirtieth Annual Conference},
156
+ volume={302},
157
+ pages={89},
158
+ year={2017},
159
+ organization={IOS Press}
160
+ }
161
  assin2_rte:
162
  benchmark: assin2_rte
163
  col_name: ASSIN2 RTE
 
177
  other text (hypothesis)."
178
  link: https://dl.acm.org/doi/abs/10.1007/978-3-030-41505-1_39
179
  sources: ["https://huggingface.co/datasets/eduagarcia/portuguese_benchmark", "https://sites.google.com/view/assin2/", "https://huggingface.co/datasets/assin2"]
180
+ citation: |
181
+ @inproceedings{real2020assin,
182
+ title={The assin 2 shared task: a quick overview},
183
+ author={Real, Livy and Fonseca, Erick and Oliveira, Hugo Goncalo},
184
+ booktitle={International Conference on Computational Processing of the Portuguese Language},
185
+ pages={406--412},
186
+ year={2020},
187
+ organization={Springer}
188
+ }
189
  assin2_sts:
190
  benchmark: assin2_sts
191
  col_name: ASSIN2 STS
 
223
  entailment task between a question and its possible answers."
224
  link: https://ieeexplore.ieee.org/abstract/document/8923668
225
  sources: ["https://github.com/liafacom/faquad/", "https://huggingface.co/datasets/ruanchaves/faquad-nli"]
226
+ citation: |
227
+ @inproceedings{8923668,
228
+ author={Sayama, Hélio Fonseca and Araujo, Anderson Viçoso and Fernandes, Eraldo Rezende},
229
+ booktitle={2019 8th Brazilian Conference on Intelligent Systems (BRACIS)},
230
+ title={FaQuAD: Reading Comprehension Dataset in the Domain of Brazilian Higher Education},
231
+ year={2019},
232
+ volume={},
233
+ number={},
234
+ pages={443-448},
235
+ keywords={Training;Context modeling;Encyclopedias;Electronic publishing;Internet;Natural Language Processing;Machine Reading Comprehension;Dataset},
236
+ doi={10.1109/BRACIS.2019.00084}
237
+ }
238
+ @software{Chaves_Rodrigues_napolab_2023,
239
+ author = {Chaves Rodrigues, Ruan and Tanti, Marc and Agerri, Rodrigo},
240
+ doi = {10.5281/zenodo.7781848},
241
+ month = {3},
242
+ title = {{Natural Portuguese Language Benchmark (Napolab)}},
243
+ url = {https://github.com/ruanchaves/napolab},
244
+ version = {1.0.0},
245
+ year = {2023}
246
+ }
247
  hatebr_offensive:
248
  benchmark: hatebr_offensive
249
  col_name: HateBR
 
261
  versus non-offensive comments)."
262
  link: https://arxiv.org/abs/2103.14972
263
  sources: ["https://huggingface.co/datasets/eduagarcia/portuguese_benchmark", "https://github.com/franciellevargas/HateBR", "https://huggingface.co/datasets/ruanchaves/hatebr"]
264
+ citation: |
265
+ @inproceedings{vargas-etal-2022-hatebr,
266
+ title = "{H}ate{BR}: A Large Expert Annotated Corpus of {B}razilian {I}nstagram Comments for Offensive Language and Hate Speech Detection",
267
+ author = "Vargas, Francielle and
268
+ Carvalho, Isabelle and
269
+ Rodrigues de G{\'o}es, Fabiana and
270
+ Pardo, Thiago and
271
+ Benevenuto, Fabr{\'\i}cio",
272
+ booktitle = "Proceedings of the Thirteenth Language Resources and Evaluation Conference",
273
+ month = jun,
274
+ year = "2022",
275
+ address = "Marseille, France",
276
+ publisher = "European Language Resources Association",
277
+ url = "https://aclanthology.org/2022.lrec-1.777",
278
+ pages = "7174--7183"
279
+ }
280
  portuguese_hate_speech:
281
  benchmark: portuguese_hate_speech
282
  col_name: PT Hate Speech
 
291
  description: "Portuguese dataset for hate speech detection composed of 5,668 tweets with binary annotations (i.e. 'hate' vs. 'no-hate')"
292
  link: https://aclanthology.org/W19-3510/
293
  sources: ["https://huggingface.co/datasets/eduagarcia/portuguese_benchmark", "https://github.com/paulafortuna/Portuguese-Hate-Speech-Dataset", "https://huggingface.co/datasets/hate_speech_portuguese"]
294
+ citation: |
295
+ @inproceedings{fortuna-etal-2019-hierarchically,
296
+ title = "A Hierarchically-Labeled {P}ortuguese Hate Speech Dataset",
297
+ author = "Fortuna, Paula and
298
+ Rocha da Silva, Jo{\~a}o and
299
+ Soler-Company, Juan and
300
+ Wanner, Leo and
301
+ Nunes, S{\'e}rgio",
302
+ booktitle = "Proceedings of the 3rd Workshop on Abusive Language Online (ALW3)",
303
+ year = "2019",
304
+ publisher = "Association for Computational Linguistics",
305
+ url = "https://aclanthology.org/W19-3510",
306
+ doi = "10.18653/v1/W19-3510",
307
+ pages = "94--104",
308
+ }
309
  tweetsentbr:
310
  benchmark: tweetsentbr
311
  col_name: tweetSentBR
 
323
  in one of the three following classes: Positive, Negative, Neutral."
324
  link: https://arxiv.org/abs/1712.08917
325
  sources: ["https://bitbucket.org/HBrum/tweetsentbr", "eduagarcia/tweetsentbr_fewshot"]
326
+ citation: |
327
+ @InProceedings{BRUM18.389,
328
+ author = {Henrico Brum and Maria das Gra\c{c}as Volpe Nunes},
329
+ title = "{Building a Sentiment Corpus of Tweets in Brazilian Portuguese}",
330
+ booktitle = {Proceedings of the Eleventh International Conference on Language Resources and Evaluation (LREC 2018)},
331
+ year = {2018},
332
+ month = {May 7-12, 2018},
333
+ address = {Miyazaki, Japan},
334
+ editor = {Nicoletta Calzolari (Conference chair) and Khalid Choukri and Christopher Cieri and Thierry Declerck and Sara Goggi and Koiti Hasida and Hitoshi Isahara and Bente Maegaard and Joseph Mariani and Hélène Mazo and Asuncion Moreno and Jan Odijk and Stelios Piperidis and Takenobu Tokunaga},
335
+ publisher = {European Language Resources Association (ELRA)},
336
+ isbn = {979-10-95546-00-9},
337
+ language = {english}
338
+ }
339