import gradio as gr
import pandas as pd
title = """
# hmLeaderboard: Space for tracking and ranking models on Historical NER Datasets
![hmLeaderboard](https://huggingface.co/spaces/hmbench/hmLeaderboard/resolve/main/logo.png)
"""
description = """
## Models
At the moment the following models are supported:
* hmBERT: [Historical Multilingual Language Models for Named Entity Recognition](https://huggingface.co/hmbert).
* hmTEAMS: [Historical Multilingual TEAMS Models](https://huggingface.co/hmteams).
* hmByT5: [Historical Multilingual and Monolingual ByT5 Models](https://huggingface.co/hmbyt5).
## Datasets
We evaluate our pretrained language models on various datasets from HIPE-2020, HIPE-2022 and Europeana. The following
table gives an overview of the datasets used.
| Language | Datasets |
|----------|------------------------------------------------------------------|
| English | [AjMC] - [TopRes19th] |
| German | [AjMC] - [NewsEye] - [HIPE-2020] |
| French | [AjMC] - [ICDAR-Europeana] - [LeTemps] - [NewsEye] - [HIPE-2020] |
| Finnish | [NewsEye] |
| Swedish | [NewsEye] |
| Dutch | [ICDAR-Europeana] |
[AjMC]: https://github.com/hipe-eval/HIPE-2022-data/blob/main/documentation/README-ajmc.md
[NewsEye]: https://github.com/hipe-eval/HIPE-2022-data/blob/main/documentation/README-newseye.md
[TopRes19th]: https://github.com/hipe-eval/HIPE-2022-data/blob/main/documentation/README-topres19th.md
[ICDAR-Europeana]: https://github.com/stefan-it/historic-domain-adaptation-icdar
[LeTemps]: https://github.com/hipe-eval/HIPE-2022-data/blob/main/documentation/README-letemps.md
[HIPE-2020]: https://github.com/hipe-eval/HIPE-2022-data/blob/main/documentation/README-hipe2020.md
## Results
"""
footer = "Made from Bavarian Oberland with ❤️ and 🥨."
model_selection_file_names = {
"Best Configuration": "best_model_configurations.csv",
"Best Model": "best_models.csv"
}
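# Pre-computed result files: "Best Configuration" holds F1-scores averaged over
# runs with different seeds, "Best Model" holds the scores of the single best run.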
df_init = pd.read_csv(model_selection_file_names["Best Configuration"])
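# Assumed CSV layout: the first column holds the model name and the remaining
# columns are named "<Language> <Dataset>" (e.g. a hypothetical "German AjMC"),
# so the language is the first token of each dataset column name.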
dataset_names = df_init.columns.values[1:].tolist()
languages = sorted({dataset_name.split(" ")[0] for dataset_name in dataset_names})  # sorted for a stable checkbox order
def perform_evaluation_for_datasets(model_selection, selected_datasets):
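    """Build a results table for the selected datasets.

    Reads the pre-computed CSV for the chosen model selection, keeps the
    model-name column plus one column per selected dataset, and appends a
    row-wise average column.
    """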
    df = pd.read_csv(model_selection_file_names[model_selection])
selected_indices = []
for selected_dataset in selected_datasets:
selected_indices.append(dataset_names.index(selected_dataset) + 1)
    mean_column = df.iloc[:, selected_indices].mean(axis=1).round(2)
    # Keep the model-name column together with the selected dataset columns;
    # copy to avoid pandas' SettingWithCopyWarning when adding the average
    result_df = df.iloc[:, [0] + selected_indices].copy()
    result_df["Average"] = mean_column
return result_df
def perform_evaluation_for_languages(model_selection, selected_languages):
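    """Build a results table for the selected languages.

    Reads the pre-computed CSV for the chosen model selection, keeps the
    model-name column plus every dataset column mentioning a selected
    language, and appends a row-wise average column.
    """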
    df = pd.read_csv(model_selection_file_names[model_selection])
selected_indices = []
for selected_language in selected_languages:
selected_language = selected_language.lower()
        # Match every dataset column whose name contains the selected language (case-insensitive)
        found_indices = [i for i, column_name in enumerate(df.columns) if selected_language in column_name.lower()]
        selected_indices.extend(found_indices)
    mean_column = df.iloc[:, selected_indices].mean(axis=1).round(2)
    # Keep the model-name column together with the matched language columns;
    # copy to avoid pandas' SettingWithCopyWarning when adding the average
    result_df = df.iloc[:, [0] + selected_indices].copy()
    result_df["Average"] = mean_column
return result_df
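# Illustrative usage (the exact column names depend on the CSV files):
#   perform_evaluation_for_datasets("Best Configuration", ["German AjMC"])
#   perform_evaluation_for_languages("Best Model", ["German", "French"])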
dataset_to_description_mapping = {
"AjMC": "#### AjMC\nThe AjMC dataset consists of NE-annotated historical commentaries in the field of Classics, and was created in the context of the [Ajax MultiCommentary](https://mromanello.github.io/ajax-multi-commentary/) project.\n\nThe following NEs were annotated: `pers`, `work`, `loc`, `object`, `date` and `scope`.",
"NewsEye": "#### NewsEye\nThe NewsEye dataset is comprised of diachronic historical newspaper material published between 1850 and 1950 in French, German, Finnish, and Swedish. More information can be found [here](https://dl.acm.org/doi/abs/10.1145/3404835.3463255).\n\nThe following NEs were annotated: `PER`, `LOC`, `ORG` and `HumanProd`.",
"ICDAR": "#### ICDAR\nThe ICDAR-Europeana NER Dataset is a preprocessed variant of the [Europeana NER Corpora](https://github.com/EuropeanaNewspapers/ner-corpora) for Dutch and French.\n\nThe following NEs were annotated: `PER`, `LOC` and `ORG`.",
"LeTemps": "#### LeTemps\nThe LeTemps dataset consists of NE-annotated historical French newspaper articles from mid-19C to mid 20C.\n\nThe following NEs were annotated: `loc`, `org` and `pers`.",
"TopRes19th": "#### TopRes19th\nThe TopRes19th dataset consists of NE-annotated historical English newspaper articles from 19C.\n\nThe following NEs were annotated: `BUILDING`, `LOC` and `STREET`.",
"HIPE-2020": "#### HIPE-2020\nThe HIPE-2020 dataset is comprised of newspapers from mid 19C to mid 20C. For information can be found [here](https://dl.acm.org/doi/abs/10.1007/978-3-030-58219-7_21).\n\nThe following NEs were annotated: `loc`, `org`, `pers`, `prod`, `time` and `comp`.",
}
configuration_to_description_mapping = {
"Best Configuration": "The best hyper-parameter configuration for each model is used and average F1-score over runs with different seeds is reported here:",
"Best Model": "The best hyper-parameter configuration for each model is used, the model with highest F1-score is chosen and its performance is reported here:"
}
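# UI layout: an "Overview" tab with both result tables, one tab per dataset with
# its description and results, and a "Filtering" tab for custom dataset/language
# selections.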
with gr.Blocks() as demo:
gr.Markdown(title)
gr.Markdown(description)
with gr.Tab("Overview"):
gr.Markdown("### Best Configuration")
gr.Markdown(configuration_to_description_mapping["Best Configuration"])
df_result = perform_evaluation_for_datasets("Best Configuration", dataset_names)
gr.Dataframe(value=df_result)
gr.Markdown("### Best Model")
gr.Markdown(configuration_to_description_mapping["Best Model"])
df_result = perform_evaluation_for_datasets("Best Model", dataset_names)
gr.Dataframe(value=df_result)
for dataset_name, dataset_description in dataset_to_description_mapping.items():
with gr.Tab(dataset_name):
selected_datasets = [ds for ds in dataset_names if dataset_name.lower() in ds.lower()]
gr.Markdown(dataset_description)
for config in ["Best Configuration", "Best Model"]:
gr.Markdown(f"##### Results for {config}")
gr.Markdown(configuration_to_description_mapping[config])
df_result = perform_evaluation_for_datasets(config, selected_datasets)
gr.Dataframe(value=df_result)
with gr.Tab("Filtering"):
gr.Markdown("### Filtering\nSwiss-knife filtering for single datasets and languages is possible.")
        model_selection = gr.Radio(choices=["Best Configuration", "Best Model"],
                                   label="Model Selection",
                                   info="Defines whether the best configuration or the best model is used for evaluation. "
                                        "With 'Best Configuration', the best hyper-parameter configuration is used and the "
                                        "F1-score is averaged over all runs with different seeds. With 'Best Model', the "
                                        "model with the highest F1-score on the development set is used.",
                                   value="Best Configuration")
with gr.Tab("Dataset Selection"):
datasets_selection = gr.CheckboxGroup(
dataset_names, label="Datasets", info="Select datasets for evaluation"
)
output_df = gr.Dataframe()
evaluation_button = gr.Button("Evaluate")
evaluation_button.click(fn=perform_evaluation_for_datasets, inputs=[model_selection, datasets_selection], outputs=output_df)
with gr.Tab("Language Selection"):
language_selection = gr.CheckboxGroup(
languages, label="Languages", info="Select languages for evaluation"
)
output_df = gr.Dataframe()
evaluation_button = gr.Button("Evaluate")
evaluation_button.click(fn=perform_evaluation_for_languages, inputs=[model_selection, language_selection], outputs=output_df)
gr.Markdown(footer)
demo.launch()