eduagarcia committed
Commit c0fa950
1 Parent(s): 0266144

Portuguese Tasks configs and fix bugs

.gitignore CHANGED
@@ -1,6 +1,6 @@
 venv/
 __pycache__/
-.env
+.env*
 .ipynb_checkpoints
 *ipynb
 .vscode/
@@ -12,5 +12,8 @@ original_results/
 eval-queue/
 eval-results/
 dynamic-info/
+downloads/
+
+tasks_config/legal_config.yaml
 
 src/assets/model_counts.html
src/display/utils.py CHANGED
@@ -3,7 +3,7 @@ from enum import Enum
 from typing import List
 import pandas as pd
 from yaml import safe_load
-from src.envs import GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS
+from src.envs import GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS, TASK_CONFIG
 
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
@@ -15,16 +15,17 @@ class Task:
     metric: str
     col_name: str
     baseline: float = 0.0
-    human_baseline: float = 0.0
+    human_baseline: float = None
+    expert_human_baseline: float = None
     few_shot: int = None
     limit: int = None
     task_list: List[str] = None
     link: str = None
     description: str = None
+    sources: List[str] = None
+    baseline_sources: List[str] = None
 
-with open('tasks_config.yaml', 'r', encoding='utf-8') as f:
-    tasks_config = safe_load(f)
-Tasks = Enum('Tasks', {k: Task(**v) for k, v in tasks_config['tasks'].items()})
+Tasks = Enum('Tasks', {k: Task(**v) for k, v in TASK_CONFIG['tasks'].items()})
 
 # These classes are for user facing column names,
 # to avoid having to change them all around the code
@@ -108,11 +109,12 @@ baseline_row = {
 baseline_list = []
 for task in Tasks:
     baseline_row[task.value.col_name] = task.value.baseline
-    if task.value.baseline is not None:
+    if task.value.baseline is not None and (isinstance(task.value.baseline, float) or isinstance(task.value.baseline, int)):
         baseline_list.append(task.value.baseline)
 baseline_row[AutoEvalColumn.average.name] = round(sum(baseline_list) / len(baseline_list), 2)
-if GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS:
-    baseline_row["original_benchmark_average"] = None
+
+#if GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS:
+baseline_row["🤗 Leaderboard Average"] = None
 
 # Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
 # ARC human baseline is 0.80 (source: https://lab42.global/arc/)
@@ -145,17 +147,17 @@ human_baseline_row = {
     AutoEvalColumn.license.name: "",
     AutoEvalColumn.still_on_hub.name: False,
     AutoEvalColumn.moe.name: False,
-    AutoEvalColumn.eval_time.name: 0.0
+    AutoEvalColumn.eval_time.name: 0.0,
 }
 
 baseline_list = []
 for task in Tasks:
     human_baseline_row[task.value.col_name] = task.value.human_baseline
-    if task.value.human_baseline is not None:
+    if task.value.human_baseline is not None and (isinstance(task.value.baseline, float) or isinstance(task.value.baseline, int)):
         baseline_list.append(task.value.human_baseline)
 human_baseline_row[AutoEvalColumn.average.name] = round(sum(baseline_list) / len(baseline_list), 2)
-if GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS:
-    human_baseline_row["original_benchmark_average"] = None
+#if GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS:
+human_baseline_row["🤗 Leaderboard Average"] = None
 
 @dataclass
 class ModelDetails:
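
The diff above replaces the hard-coded `tasks_config.yaml` read with the `TASK_CONFIG` dict imported from `src.envs`, expands each entry under `tasks:` into a `Task` via `Task(**v)`, and only averages baselines that are actually numeric (the Portuguese config leaves several `human_baseline` values as `null`). A minimal sketch of that mechanism, using a hypothetical two-task config dict that only mirrors the shape of `tasks_config/pt_config.yaml`:

```python
from dataclasses import dataclass
from enum import Enum

# Trimmed-down Task with just the fields this sketch needs (the full dataclass is in the diff above).
@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str
    baseline: float = 0.0
    human_baseline: float = None

# Hypothetical config mirroring the 'tasks:' section of tasks_config/pt_config.yaml.
task_config = {
    "tasks": {
        "enem_challenge": {"benchmark": "enem_challenge", "metric": "acc",
                           "col_name": "ENEM", "baseline": 20.0, "human_baseline": 35.0},
        "assin2_rte": {"benchmark": "assin2_rte", "metric": "f1_macro",
                       "col_name": "ASSIN2 RTE", "baseline": 50.0, "human_baseline": None},
    }
}

# Same construction as in the diff: enum members whose values are Task instances.
Tasks = Enum("Tasks", {k: Task(**v) for k, v in task_config["tasks"].items()})

# Average only the numeric human baselines, skipping the null ones.
values = [t.value.human_baseline for t in Tasks
          if isinstance(t.value.human_baseline, (int, float))]
print(round(sum(values) / len(values), 2))  # 35.0 (only ENEM contributes)
```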
src/envs.py CHANGED
@@ -1,25 +1,44 @@
 import os
+from yaml import safe_load
 
 from huggingface_hub import HfApi
 
+TASK_CONFIG_NAME = os.getenv("TASK_CONFIG", "pt_config")
+TASK_CONFIG_PATH = os.path.join('tasks_config', TASK_CONFIG_NAME + ".yaml")
+with open(TASK_CONFIG_PATH, 'r', encoding='utf-8') as f:
+    TASK_CONFIG = safe_load(f)
+
+def get_config(name, default):
+    res = None
+
+    if name in os.environ:
+        res = os.environ[name]
+    elif 'config' in TASK_CONFIG:
+        res = TASK_CONFIG['config'].get(name, None)
+
+    if res is None:
+        return default
+    return res
+
 # clone / pull the lmeh eval data
-H4_TOKEN = os.environ.get("H4_TOKEN", None)
+H4_TOKEN = get_config("H4_TOKEN", None)
 
-LEADERBOARD_NAME = os.getenv("LEADERBOARD_NAME", "Open LLM Leaderboard")
+LEADERBOARD_NAME = get_config("LEADERBOARD_NAME", "Open LLM Leaderboard")
 
-REPO_ID = os.getenv("REPO_ID", "HuggingFaceH4/open_llm_leaderboard")
-QUEUE_REPO = os.getenv("QUEUE_REPO", "open-llm-leaderboard/requests")
-DYNAMIC_INFO_REPO = os.getenv("DYNAMIC_INFO_REPO", "open-llm-leaderboard/dynamic_model_information")
-RESULTS_REPO = os.getenv("RESULTS_REPO", "open-llm-leaderboard/results")
+REPO_ID = get_config("REPO_ID", "HuggingFaceH4/open_llm_leaderboard")
+QUEUE_REPO = get_config("QUEUE_REPO", "open-llm-leaderboard/requests")
+DYNAMIC_INFO_REPO = get_config("DYNAMIC_INFO_REPO", "open-llm-leaderboard/dynamic_model_information")
+RESULTS_REPO = get_config("RESULTS_REPO", "open-llm-leaderboard/results")
+RAW_RESULTS_REPO = get_config("RAW_RESULTS_REPO", None)
 
 PRIVATE_QUEUE_REPO = QUEUE_REPO
 PRIVATE_RESULTS_REPO = RESULTS_REPO
 #PRIVATE_QUEUE_REPO = "open-llm-leaderboard/private-requests"
 #PRIVATE_RESULTS_REPO = "open-llm-leaderboard/private-results"
 
-IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
+IS_PUBLIC = bool(get_config("IS_PUBLIC", True))
 
-CACHE_PATH=os.getenv("HF_HOME", ".")
+CACHE_PATH=get_config("HF_HOME", ".")
 
 EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
 EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
@@ -29,18 +48,18 @@ DYNAMIC_INFO_FILE_PATH = os.path.join(DYNAMIC_INFO_PATH, "model_infos.json")
 EVAL_REQUESTS_PATH_PRIVATE = "eval-queue-private"
 EVAL_RESULTS_PATH_PRIVATE = "eval-results-private"
 
-PATH_TO_COLLECTION = os.getenv("PATH_TO_COLLECTION", "open-llm-leaderboard/llm-leaderboard-best-models-652d6c7965a4619fb5c27a03")
+PATH_TO_COLLECTION = get_config("PATH_TO_COLLECTION", "open-llm-leaderboard/llm-leaderboard-best-models-652d6c7965a4619fb5c27a03")
 
 # Rate limit variables
-RATE_LIMIT_PERIOD = int(os.getenv("RATE_LIMIT_PERIOD", 7))
-RATE_LIMIT_QUOTA = int(os.getenv("RATE_LIMIT_QUOTA", 5))
-HAS_HIGHER_RATE_LIMIT = os.environ.get("HAS_HIGHER_RATE_LIMIT", "TheBloke").split(',')
+RATE_LIMIT_PERIOD = int(get_config("RATE_LIMIT_PERIOD", 7))
+RATE_LIMIT_QUOTA = int(get_config("RATE_LIMIT_QUOTA", 5))
+HAS_HIGHER_RATE_LIMIT = get_config("HAS_HIGHER_RATE_LIMIT", "TheBloke").split(',')
 
-TRUST_REMOTE_CODE = bool(os.getenv("TRUST_REMOTE_CODE", False))
+TRUST_REMOTE_CODE = bool(get_config("TRUST_REMOTE_CODE", False))
 
 #Set if you want to get an extra field with the average eval results from the HF leaderboard
-GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS = bool(os.getenv("GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS", False))
-ORIGINAL_HF_LEADERBOARD_RESULTS_REPO = os.getenv("ORIGINAL_HF_LEADERBOARD_RESULTS_REPO", "open-llm-leaderboard/results")
+GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS = bool(get_config("GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS", False))
+ORIGINAL_HF_LEADERBOARD_RESULTS_REPO = get_config("ORIGINAL_HF_LEADERBOARD_RESULTS_REPO", "open-llm-leaderboard/results")
 ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, 'original_results')
 
 API = HfApi(token=H4_TOKEN)
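
The new `get_config` helper gives environment variables precedence over the `config:` block of the selected task YAML, falling back to the hard-coded default when neither is set. A short sketch of that precedence, with a hypothetical in-memory `TASK_CONFIG` standing in for the file loaded from `tasks_config/`:

```python
import os

# Hypothetical stand-in for the YAML that src/envs.py loads into TASK_CONFIG.
TASK_CONFIG = {"config": {"LEADERBOARD_NAME": "Open PT-LLM Leaderboard"}}

def get_config(name, default):
    # Lookup order, as in the diff above: environment variable, then the YAML 'config' block, then the default.
    res = None
    if name in os.environ:
        res = os.environ[name]
    elif 'config' in TASK_CONFIG:
        res = TASK_CONFIG['config'].get(name, None)
    if res is None:
        return default
    return res

print(get_config("LEADERBOARD_NAME", "Open LLM Leaderboard"))  # value from the YAML config block
os.environ["LEADERBOARD_NAME"] = "My Fork"
print(get_config("LEADERBOARD_NAME", "Open LLM Leaderboard"))  # the environment variable now wins
print(get_config("RATE_LIMIT_PERIOD", 7))                      # neither is set, so the default is returned
```

One thing to keep in mind: values coming from the environment are plain strings, so wrappers like `bool(get_config("TRUST_REMOTE_CODE", False))` are truthy for any non-empty string, including `"false"`; only booleans set in the YAML behave as real booleans here.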
src/tools/plots.py CHANGED
@@ -99,7 +99,7 @@ def create_metric_plot_obj(
     df = df[df["task"].isin(metrics)]
 
     # Filter the human baselines based on the specified metrics
-    filtered_human_baselines = {k: v for k, v in HUMAN_BASELINE.items() if k in metrics}
+    filtered_human_baselines = {k: v for k, v in HUMAN_BASELINE.items() if k in metrics if v is not None}
 
     # Create a line figure using plotly express with specified markers and custom data
     fig = px.line(
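
Since the Portuguese task config leaves several `human_baseline` entries as `null`, the extra `if v is not None` guard keeps those tasks out of the baseline traces drawn by plotly. A tiny illustration with hypothetical values:

```python
# Hypothetical baselines mirroring pt_config.yaml: some tasks have no human baseline.
HUMAN_BASELINE = {"ENEM": 35.0, "BLUEX": 50.0, "ASSIN2 RTE": None}
metrics = ["ENEM", "ASSIN2 RTE"]

filtered_human_baselines = {k: v for k, v in HUMAN_BASELINE.items() if k in metrics if v is not None}
print(filtered_human_baselines)  # {'ENEM': 35.0}, since the null ASSIN2 RTE baseline is dropped
```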
tasks_config.yaml → tasks_config/legal_config.yaml RENAMED
File without changes
tasks_config/pt_config.yaml ADDED
@@ -0,0 +1,153 @@
version: 1.0.0
config:
  REPO_ID: "eduagarcia/open_pt_llm_leaderboard"
  QUEUE_REPO: eduagarcia-temp/llm_pt_leaderboard_requests
  RESULTS_REPO: eduagarcia-temp/llm_pt_leaderboard_results
  RAW_RESULTS_REPO: eduagarcia-temp/llm_pt_leaderboard_raw_results
  DYNAMIC_INFO_REPO: "eduagarcia-temp/llm_pt_leaderboard_model_info"
  PATH_TO_COLLECTION: "eduagarcia/portuguese-llm-leaderboard-best-models-65c152c13ab3c67bc4f203a6"
  IS_PUBLIC: true
  LEADERBOARD_NAME: "Open PT-LLM Leaderboard"
  GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS: true
  TRUST_REMOTE_CODE: true
tasks:
  enem_challenge:
    benchmark: enem_challenge
    col_name: ENEM
    task_list:
      - enem_challenge
    metric: acc
    few_shot: 3
    limit: null
    baseline: 20.0 # random baseline
    # https://www.sejalguem.com/enem
    # https://vestibular.brasilescola.uol.com.br/enem/confira-as-medias-e-notas-maximas-e-minimas-do-enem-2020/349732.html
    human_baseline: 35.0 # ~60 / 180 correct answers - score ~500
    expert_human_baseline: 70.0 # ~124 / 180 correct answers - score ~700
    description: "The Exame Nacional do Ensino Médio (ENEM) is an advanced High-School
      level exam widely applied every year by the Brazilian government to students that
      wish to undertake a University degree. This dataset contains 1,430 questions from
      the 2010 to 2018, 2022 and 2023 exams that don't require image understanding."
    link: https://huggingface.co/datasets/eduagarcia/enem_challenge
    sources: ["https://www.ime.usp.br/~ddm/project/enem/", "https://github.com/piresramon/gpt-4-enem", "https://huggingface.co/datasets/maritaca-ai/enem"]
    baseline_sources: ["https://www.sejalguem.com/enem", "https://vestibular.brasilescola.uol.com.br/enem/confira-as-medias-e-notas-maximas-e-minimas-do-enem-2020/349732.html"]
  bluex:
    benchmark: bluex
    col_name: BLUEX
    task_list:
      - bluex
    metric: acc
    few_shot: 3
    limit: null
    baseline: 22.5 # random baseline
    # https://www.comvest.unicamp.br/wp-content/uploads/2023/08/Relatorio_F1_2023.pdf 56% mean - 88% @ top-.99
    # https://acervo.fuvest.br/fuvest/2018/FUVEST_2018_indice_discriminacao_1_fase_ins.pdf 43.4% - ~77% @ top-.99
    human_baseline: 50.0
    expert_human_baseline: 82.5
    description: "BLUEX is a multimodal dataset consisting of the two leading
      university entrance exams conducted in Brazil: Convest (Unicamp) and Fuvest (USP),
      spanning from 2018 to 2024. The benchmark comprises 724 questions that do not have accompanying images."
    link: https://huggingface.co/datasets/eduagarcia-temp/BLUEX_without_images
    sources: ["https://github.com/portuguese-benchmark-datasets/bluex", "https://huggingface.co/datasets/portuguese-benchmark-datasets/BLUEX"]
    baseline_sources: ["https://www.comvest.unicamp.br/wp-content/uploads/2023/08/Relatorio_F1_2023.pdf", "https://acervo.fuvest.br/fuvest/2018/FUVEST_2018_indice_discriminacao_1_fase_ins.pdf"]
  oab_exams:
    benchmark: oab_exams
    col_name: OAB Exams
    task_list:
      - oab_exams
    metric: acc
    few_shot: 3
    limit: null
    baseline: 25.0 # random baseline
    # https://fgvprojetos.fgv.br/publicacao/exame-de-ordem-em-numeros # 46%
    # http://fgvprojetos.fgv.br/publicacao/exame-de-ordem-em-numeros-vol3
    # scored 70%+ = 17214 / 638500 = top 97.5%
    # gap from the mean to the top-97.5% score -> 70.0% - 46% = 24
    # z-score at 97.5% ~ 1.9675
    # estimated standard deviation -> 12.2
    # top 99% = 46 + 2.33*12.2 = ~75.0
    human_baseline: 46.0
    expert_human_baseline: 75.0
    description: OAB Exams is a dataset of more than 2,000 questions from the Brazilian Bar
      Association's exams, from 2010 to 2018.
    link: https://huggingface.co/datasets/eduagarcia/oab_exams
    sources: ["https://github.com/legal-nlp/oab-exams"]
    baseline_sources: ["http://fgvprojetos.fgv.br/publicacao/exame-de-ordem-em-numeros", "http://fgvprojetos.fgv.br/publicacao/exame-de-ordem-em-numeros-vol2", "http://fgvprojetos.fgv.br/publicacao/exame-de-ordem-em-numeros-vol3"]
  assin2_rte:
    benchmark: assin2_rte
    col_name: ASSIN2 RTE
    task_list:
      - assin2_rte
    metric: f1_macro
    few_shot: 15
    limit: null
    baseline: 50.0 # random baseline
    human_baseline: null
    expert_human_baseline: null
    description: "ASSIN 2 (Avaliação de Similaridade Semântica e Inferência Textual -
      Evaluating Semantic Similarity and Textual Entailment) is the second edition of ASSIN,
      an evaluation shared task in the scope of the computational processing
      of Portuguese. Recognising Textual Entailment (RTE), also called Natural Language
      Inference (NLI), is the task of predicting whether a given text (premise) entails
      (implies) another text (hypothesis)."
    link: https://huggingface.co/datasets/eduagarcia/portuguese_benchmark
    sources: ["https://sites.google.com/view/assin2/", "https://huggingface.co/datasets/assin2"]
  assin2_sts:
    benchmark: assin2_sts
    col_name: ASSIN2 STS
    task_list:
      - assin2_sts
    metric: pearson
    few_shot: 15
    limit: null
    baseline: 0.0 # random baseline
    human_baseline: null
    expert_human_baseline: null
    description: "Same dataset as above. Semantic Textual Similarity (STS)
      ‘measures the degree of semantic equivalence between two sentences’."
    link: https://huggingface.co/datasets/eduagarcia/portuguese_benchmark
    sources: ["https://sites.google.com/view/assin2/", "https://huggingface.co/datasets/assin2"]
  faquad_nli:
    benchmark: faquad_nli
    col_name: FAQUAD NLI
    task_list:
      - faquad_nli
    metric: f1_macro
    few_shot: 15
    limit: null
    baseline: 45.6 # random baseline
    human_baseline: null
    expert_human_baseline: null
    description: "FaQuAD is a Portuguese reading comprehension dataset that follows the format of the
      Stanford Question Answering Dataset (SQuAD). The dataset aims to address the problem of
      abundant questions sent by academics whose answers are found in available institutional
      documents in the Brazilian higher education system. It consists of 900 questions about
      249 reading passages taken from 18 official documents of a computer science college
      from a Brazilian federal university and 21 Wikipedia articles related to the
      Brazilian higher education system. FaQuAD-NLI is a modified version of the
      FaQuAD dataset that repurposes the question answering task as a textual
      entailment task between a question and its possible answers."
    link: https://huggingface.co/datasets/ruanchaves/faquad-nli
    sources: ["https://github.com/liafacom/faquad/"]
  sparrow_pt:
    benchmark: sparrow_pt
    col_name: Sparrow POR
    task_list:
      - sparrow_emotion-2021-cortiz-por
      - sparrow_hate-2019-fortuna-por
      - sparrow_sentiment-2016-mozetic-por
      - sparrow_sentiment-2018-brum-por
    metric: f1_macro
    few_shot: 15
    limit: 500
    baseline: 29.5 # random baseline [3.3, 48.8, 33.1, 33.0]
    human_baseline: null
    expert_human_baseline: null
    description: "SPARROW is a multilingual evaluation benchmark for sociopragmatic meaning understanding.
      SPARROW comprises 169 datasets encompassing 64 different languages; this split evaluates
      only the validation sets of the 4 datasets available for the Portuguese language:
      one on hate speech detection by Fortuna et al. (2019), one on emotion detection by Cortiz et al. (2021),
      and two on sentiment analysis by Mozetic et al. (2016) and Brum et al. (2018).
      All were extracted and manually annotated from Twitter/X."
    link: https://huggingface.co/datasets/UBC-NLP/sparrow
    sources: ["https://sparrow.dlnlp.ai/", "https://aclanthology.org/W19-3510/", "https://arxiv.org/abs/2108.07638", "https://aclanthology.org/L18-1658/", "https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0155036"]