Spaces:
Running
Running
mj-new
commited on
Commit
•
37d493c
1
Parent(s):
ceb2b55
Updated leaderboard code and requirements
Browse files- app.py +167 -52
- constants.py +2 -1
- requirements.txt +1 -1
- utils.py +87 -15
app.py
CHANGED
@@ -2,10 +2,12 @@ import os
|
|
2 |
import streamlit as st
|
3 |
import pandas as pd
|
4 |
from constants import BIGOS_INFO, PELCRA_INFO, ANALYSIS_INFO, ABOUT_INFO, INSPECTION_INFO, COMPARISON_INFO
|
5 |
-
from utils import read_latest_results, basic_stats_per_dimension, retrieve_asr_systems_meta_from_the_catalog, box_plot_per_dimension, box_plot_per_dimension_with_colors, get_total_audio_duration, check_impact_of_normalization, calculate_wer_per_meta_category, calculate_wer_per_audio_feature
|
6 |
from app_utils import calculate_height_to_display, filter_dataframe
|
7 |
import matplotlib.pyplot as plt
|
8 |
import numpy as np
|
|
|
|
|
9 |
|
10 |
hf_token = os.getenv('HF_TOKEN')
|
11 |
if hf_token is None:
|
@@ -185,7 +187,7 @@ def create_radar_plot(df, enable_labels, systems, metric, norm_type, ref_type='o
|
|
185 |
st.pyplot(fig)
|
186 |
|
187 |
with about:
|
188 |
-
st.title("
|
189 |
st.markdown(ABOUT_INFO, unsafe_allow_html=True)
|
190 |
|
191 |
# Table - evaluated systems # TODO - change to concatenated table
|
@@ -196,6 +198,13 @@ with about:
|
|
196 |
#print("ASR systems available in the eval results for dataset {}: ".format(dataset), evaluated_systems_list )
|
197 |
|
198 |
df_evaluated_systems = retrieve_asr_systems_meta_from_the_catalog(evaluated_systems_list)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
199 |
codename_to_shortname_mapping = dict(zip(df_evaluated_systems["Codename"],df_evaluated_systems["Shortname"]))
|
200 |
#print(codename_to_shortname_mapping)
|
201 |
|
@@ -203,14 +212,32 @@ with about:
|
|
203 |
|
204 |
df_evaluated_systems_types_and_count = df_evaluated_systems["Type"].value_counts().reset_index()
|
205 |
df_evaluated_systems_types_and_count.columns = ["Type", "Count"]
|
206 |
-
st.
|
207 |
|
208 |
st.dataframe(df_evaluated_systems_types_and_count, hide_index=True, use_container_width=False)
|
209 |
|
210 |
-
st.header("Detalied info about evaluated ASR systems")
|
211 |
-
|
212 |
#TODO - add info who created the system (company, institution, team, etc.)
|
213 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
214 |
|
215 |
# Table - evaluation datasets
|
216 |
# Table - evaluation metrics
|
@@ -223,6 +250,8 @@ with about:
|
|
223 |
# List - TODOs
|
224 |
|
225 |
with lead_bigos:
|
|
|
|
|
226 |
|
227 |
# configuration for tab
|
228 |
dataset = "amu-cai/pl-asr-bigos-v2-secret"
|
@@ -257,17 +286,17 @@ with lead_bigos:
|
|
257 |
# save sample to tsv
|
258 |
df_per_dataset_with_asr_systems_meta.sample(5).to_csv("sample.tsv", sep="\t", index=False)
|
259 |
|
|
|
|
|
|
|
260 |
# MOST IMPORTANT RESULTS
|
261 |
analysis_dim = "system"
|
262 |
metric = "WER"
|
263 |
-
st.subheader("
|
264 |
-
fig = box_plot_per_dimension_with_colors(df_per_dataset_with_asr_systems_meta, metric, analysis_dim, "{} per {}".format(metric, analysis_dim), analysis_dim, metric + "[%]","System", "Type")
|
265 |
st.pyplot(fig, clear_figure=True, use_container_width=True)
|
266 |
|
267 |
-
|
268 |
-
########### EVALUATION PARAMETERS PRESENTATION ################
|
269 |
-
st.title("Leaderboard for {} {}".format(dataset_short_name, dataset_version))
|
270 |
-
st.markdown(BIGOS_INFO, unsafe_allow_html=True)
|
271 |
st.markdown("**Evaluation date:** {}".format(eval_date))
|
272 |
st.markdown("**Number of evaluated system-model variants:** {}".format(no_of_evaluated_systems))
|
273 |
st.markdown("**Number of evaluated subsets:** {}".format(no_of_eval_subsets))
|
@@ -301,7 +330,6 @@ with lead_bigos:
|
|
301 |
h_df_per_system_per_dataset = calculate_height_to_display(df_wer_per_system_from_per_dataset)
|
302 |
st.dataframe(df_wer_per_system_from_per_dataset, height = h_df_per_system_per_dataset )
|
303 |
|
304 |
-
|
305 |
##################### PER SUBSET ANALYSIS #########################
|
306 |
analysis_dim = "subset"
|
307 |
metric = "WER"
|
@@ -311,7 +339,7 @@ with lead_bigos:
|
|
311 |
st.dataframe(df_wer_per_system_from_per_dataset, height = h_df_per_system_per_dataset )
|
312 |
|
313 |
st.subheader("Boxplot showing {} per {} sorted by median values".format(metric, analysis_dim))
|
314 |
-
fig =
|
315 |
st.pyplot(fig, clear_figure=True, use_container_width=True)
|
316 |
|
317 |
### IMPACT OF NORMALIZATION ON ERROR RATES #####
|
@@ -395,16 +423,14 @@ with lead_pelcra:
|
|
395 |
|
396 |
df_per_dataset_with_asr_systems_meta = pd.merge(df_per_dataset, df_evaluated_systems, how="left", left_on="system", right_on="Shortname")
|
397 |
|
398 |
-
|
399 |
analysis_dim = "system"
|
400 |
metric = "WER"
|
401 |
-
st.subheader("
|
402 |
fig = box_plot_per_dimension_with_colors(df_per_dataset_with_asr_systems_meta, metric, analysis_dim, "{} per {}".format(metric, analysis_dim), analysis_dim, metric + "[%]","System", "Type")
|
403 |
st.pyplot(fig, clear_figure=True, use_container_width=True)
|
404 |
-
|
405 |
-
|
406 |
-
st.title("Leaderboard for {} {}".format(dataset_short_name, dataset_version))
|
407 |
-
st.markdown(BIGOS_INFO, unsafe_allow_html=True)
|
408 |
st.markdown("**Evaluation date:** {}".format(eval_date))
|
409 |
st.markdown("**Number of evaluated system-model variants:** {}".format(no_of_evaluated_systems))
|
410 |
st.markdown("**Number of evaluated subsets:** {}".format(no_of_eval_subsets))
|
@@ -447,7 +473,7 @@ with lead_pelcra:
|
|
447 |
st.dataframe(df_wer_per_system_from_per_dataset, height = h_df_per_system_per_dataset )
|
448 |
|
449 |
st.subheader("Boxplot showing {} per {} sorted by median values".format(metric, analysis_dim))
|
450 |
-
fig =
|
451 |
st.pyplot(fig, clear_figure=True, use_container_width=True)
|
452 |
|
453 |
### IMPACT OF NORMALIZATION ON ERROR RATES #####
|
@@ -502,6 +528,13 @@ with analysis:
|
|
502 |
|
503 |
dataset = st.selectbox("Select Dataset", datasets, index=datasets.index('amu-cai/pl-asr-bigos-v2-secret'), key="select_dataset_scenarios")
|
504 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
505 |
# read the latest results for the selected dataset
|
506 |
print("Reading the latest results for dataset: ", dataset)
|
507 |
df_per_sample_all, df_per_dataset_all = read_latest_results(dataset, split, codename_to_shortname_mapping)
|
@@ -547,7 +580,7 @@ with analysis:
|
|
547 |
st.subheader("Best and worst systems for dataset {}".format(dataset))
|
548 |
df_best_worse_systems = pd.DataFrame(data, columns=header)
|
549 |
# do not display index
|
550 |
-
st.dataframe(df_best_worse_systems)
|
551 |
|
552 |
st.subheader("Comparison of average WER for best systems")
|
553 |
df_per_dataset_best_systems = df_per_dataset_with_asr_systems_meta[df_per_dataset_with_asr_systems_meta['system'].isin([free_system_with_best_wer, commercial_system_with_best_wer])]
|
@@ -602,21 +635,74 @@ with analysis:
|
|
602 |
# Y is the average WER
|
603 |
# make each point a different color
|
604 |
# provide legend with system names
|
605 |
-
fig, ax = plt.subplots()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
606 |
for system in free_systems_wer['system'].unique():
|
607 |
subset = free_systems_wer[free_systems_wer['system'] == system]
|
608 |
-
|
609 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
610 |
for i, point in subset.iterrows():
|
611 |
-
|
612 |
-
|
613 |
-
|
614 |
-
|
615 |
-
|
616 |
-
|
617 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
618 |
st.pyplot(fig)
|
619 |
|
|
|
620 |
##################################################################################################################################################
|
621 |
# WER per audio duration
|
622 |
|
@@ -653,11 +739,7 @@ with analysis:
|
|
653 |
# print dataframe in streamlit
|
654 |
st.dataframe(df_per_sample_wer_audio_pivot)
|
655 |
|
656 |
-
#
|
657 |
-
# each system should have a different color
|
658 |
-
# the size of the point should be proportional to the number of samples in the bucket
|
659 |
-
# the x axis should be the audio duration bucket
|
660 |
-
# the y axis should be the average WER
|
661 |
fig, ax = plt.subplots()
|
662 |
for system in selected_systems:
|
663 |
subset = df_per_sample_wer_audio[df_per_sample_wer_audio['system'] == system]
|
@@ -678,7 +760,7 @@ with analysis:
|
|
678 |
audio_feature_to_analyze = 'speech_rate_words'
|
679 |
audio_feature_unit = ' [words/s]'
|
680 |
metric = 'WER'
|
681 |
-
metric_unit = '
|
682 |
no_of_buckets = 10
|
683 |
# calculate average WER per audio duration bucket for the best and worse commercial and free systems
|
684 |
selected_systems = [free_system_with_best_wer, commercial_system_with_best_wer]
|
@@ -688,24 +770,57 @@ with analysis:
|
|
688 |
# print dataframe in streamlit
|
689 |
st.dataframe(df_per_sample_wer_feature_pivot)
|
690 |
|
691 |
-
#
|
692 |
-
|
693 |
-
# the size of the point should be proportional to the number of samples in the bucket
|
694 |
-
# the x axis should be the audio duration bucket
|
695 |
-
# the y axis should be the average WER
|
696 |
-
fig, ax = plt.subplots()
|
697 |
-
for system in selected_systems:
|
698 |
-
subset = df_per_sample_wer_feature[df_per_sample_wer_feature['system'] == system]
|
699 |
-
ax.scatter(subset[audio_feature_to_analyze], subset[metric], label=system, s=subset['number_of_samples']*0.5)
|
700 |
-
ax.set_xlabel(audio_feature_to_analyze.replace('_',' ').capitalize() + audio_feature_unit)
|
701 |
-
ax.set_ylabel(metric + metric_unit)
|
702 |
-
ax.set_title('WER in function of speech rate.'.format(audio_feature_to_analyze))
|
703 |
|
704 |
-
#
|
705 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
706 |
st.pyplot(fig)
|
707 |
|
708 |
|
|
|
709 |
################################################################################################################################################
|
710 |
# WER PER GENDER
|
711 |
|
|
|
2 |
import streamlit as st
|
3 |
import pandas as pd
|
4 |
from constants import BIGOS_INFO, PELCRA_INFO, ANALYSIS_INFO, ABOUT_INFO, INSPECTION_INFO, COMPARISON_INFO
|
5 |
+
from utils import read_latest_results, basic_stats_per_dimension, retrieve_asr_systems_meta_from_the_catalog, box_plot_per_dimension,box_plot_per_dimension_subsets, box_plot_per_dimension_with_colors, get_total_audio_duration, check_impact_of_normalization, calculate_wer_per_meta_category, calculate_wer_per_audio_feature
|
6 |
from app_utils import calculate_height_to_display, filter_dataframe
|
7 |
import matplotlib.pyplot as plt
|
8 |
import numpy as np
|
9 |
+
import statsmodels.api as sm
|
10 |
+
import seaborn as sns
|
11 |
|
12 |
hf_token = os.getenv('HF_TOKEN')
|
13 |
if hf_token is None:
|
|
|
187 |
st.pyplot(fig)
|
188 |
|
189 |
with about:
|
190 |
+
st.title("AMU Polish ASR Leaderboard")
|
191 |
st.markdown(ABOUT_INFO, unsafe_allow_html=True)
|
192 |
|
193 |
# Table - evaluated systems # TODO - change to concatenated table
|
|
|
198 |
#print("ASR systems available in the eval results for dataset {}: ".format(dataset), evaluated_systems_list )
|
199 |
|
200 |
df_evaluated_systems = retrieve_asr_systems_meta_from_the_catalog(evaluated_systems_list)
|
201 |
+
# drop columns "Included in BIGOS benchmark"
|
202 |
+
df_evaluated_systems = df_evaluated_systems.drop(columns=["Included in BIGOS benchmark"])
|
203 |
+
# drop empty rows
|
204 |
+
df_evaluated_systems = df_evaluated_systems.dropna(how='all')
|
205 |
+
# drop empty columns
|
206 |
+
df_evaluated_systems = df_evaluated_systems.dropna(axis=1, how='all')
|
207 |
+
|
208 |
codename_to_shortname_mapping = dict(zip(df_evaluated_systems["Codename"],df_evaluated_systems["Shortname"]))
|
209 |
#print(codename_to_shortname_mapping)
|
210 |
|
|
|
212 |
|
213 |
df_evaluated_systems_types_and_count = df_evaluated_systems["Type"].value_counts().reset_index()
|
214 |
df_evaluated_systems_types_and_count.columns = ["Type", "Count"]
|
215 |
+
st.subheader("Evaluated systems:")
|
216 |
|
217 |
st.dataframe(df_evaluated_systems_types_and_count, hide_index=True, use_container_width=False)
|
218 |
|
|
|
|
|
219 |
#TODO - add info who created the system (company, institution, team, etc.)
|
220 |
+
# Split into separate tables for free and commercial systems
|
221 |
+
free_systems = df_evaluated_systems[df_evaluated_systems['Type'] == 'free']
|
222 |
+
commercial_systems = df_evaluated_systems[df_evaluated_systems['Type'] == 'commercial']
|
223 |
+
|
224 |
+
st.subheader("Free systems:")
|
225 |
+
# drop empty columns
|
226 |
+
free_systems = free_systems.dropna(axis=1, how='all')
|
227 |
+
# drop empty rows
|
228 |
+
free_systems = free_systems.dropna(how='all')
|
229 |
+
|
230 |
+
# do not display index
|
231 |
+
st.dataframe(free_systems, hide_index=True, height = h_df_systems, use_container_width=True)
|
232 |
+
|
233 |
+
st.subheader("Commercial systems:")
|
234 |
+
# drop empty columns
|
235 |
+
commercial_systems = commercial_systems.dropna(axis=1, how='all')
|
236 |
+
# do not display index
|
237 |
+
# drop empty rows
|
238 |
+
commercial_systems = commercial_systems.dropna(how='all')
|
239 |
+
|
240 |
+
st.dataframe(commercial_systems, hide_index=True, height = h_df_systems, use_container_width=True)
|
241 |
|
242 |
# Table - evaluation datasets
|
243 |
# Table - evaluation metrics
|
|
|
250 |
# List - TODOs
|
251 |
|
252 |
with lead_bigos:
|
253 |
+
st.title("BIGOS Leaderboard")
|
254 |
+
st.markdown(BIGOS_INFO, unsafe_allow_html=True)
|
255 |
|
256 |
# configuration for tab
|
257 |
dataset = "amu-cai/pl-asr-bigos-v2-secret"
|
|
|
286 |
# save sample to tsv
|
287 |
df_per_dataset_with_asr_systems_meta.sample(5).to_csv("sample.tsv", sep="\t", index=False)
|
288 |
|
289 |
+
########### EVALUATION PARAMETERS PRESENTATION ################
|
290 |
+
st.title("ASR leaderboard for dataset: {} {}".format(dataset_short_name, dataset_version))
|
291 |
+
|
292 |
# MOST IMPORTANT RESULTS
|
293 |
analysis_dim = "system"
|
294 |
metric = "WER"
|
295 |
+
st.subheader("Leaderboard - Median {} per ASR {} across all subsets of {} dataset".format(metric, analysis_dim, dataset_short_name))
|
296 |
+
fig = box_plot_per_dimension_with_colors(df_per_dataset_with_asr_systems_meta, metric, analysis_dim, "{} per {} for dataset {}".format(metric, analysis_dim, dataset_short_name), analysis_dim, metric + "[%]","System", "Type")
|
297 |
st.pyplot(fig, clear_figure=True, use_container_width=True)
|
298 |
|
299 |
+
st.header("Benchmark details")
|
|
|
|
|
|
|
300 |
st.markdown("**Evaluation date:** {}".format(eval_date))
|
301 |
st.markdown("**Number of evaluated system-model variants:** {}".format(no_of_evaluated_systems))
|
302 |
st.markdown("**Number of evaluated subsets:** {}".format(no_of_eval_subsets))
|
|
|
330 |
h_df_per_system_per_dataset = calculate_height_to_display(df_wer_per_system_from_per_dataset)
|
331 |
st.dataframe(df_wer_per_system_from_per_dataset, height = h_df_per_system_per_dataset )
|
332 |
|
|
|
333 |
##################### PER SUBSET ANALYSIS #########################
|
334 |
analysis_dim = "subset"
|
335 |
metric = "WER"
|
|
|
339 |
st.dataframe(df_wer_per_system_from_per_dataset, height = h_df_per_system_per_dataset )
|
340 |
|
341 |
st.subheader("Boxplot showing {} per {} sorted by median values".format(metric, analysis_dim))
|
342 |
+
fig = box_plot_per_dimension_subsets(df_per_dataset, metric, analysis_dim, "{} per {} for dataset {}".format(metric, analysis_dim, dataset_short_name), analysis_dim +' of dataset ' + dataset_short_name , metric + " (%)", "system")
|
343 |
st.pyplot(fig, clear_figure=True, use_container_width=True)
|
344 |
|
345 |
### IMPACT OF NORMALIZATION ON ERROR RATES #####
|
|
|
423 |
|
424 |
df_per_dataset_with_asr_systems_meta = pd.merge(df_per_dataset, df_evaluated_systems, how="left", left_on="system", right_on="Shortname")
|
425 |
|
426 |
+
# MOST IMPORTANT RESULTS
|
427 |
analysis_dim = "system"
|
428 |
metric = "WER"
|
429 |
+
st.subheader("Leaderboard - Median {} per ASR {} across all subsets of {} dataset".format(metric, analysis_dim, dataset_short_name))
|
430 |
fig = box_plot_per_dimension_with_colors(df_per_dataset_with_asr_systems_meta, metric, analysis_dim, "{} per {}".format(metric, analysis_dim), analysis_dim, metric + "[%]","System", "Type")
|
431 |
st.pyplot(fig, clear_figure=True, use_container_width=True)
|
432 |
+
|
433 |
+
st.header("Benchmark details")
|
|
|
|
|
434 |
st.markdown("**Evaluation date:** {}".format(eval_date))
|
435 |
st.markdown("**Number of evaluated system-model variants:** {}".format(no_of_evaluated_systems))
|
436 |
st.markdown("**Number of evaluated subsets:** {}".format(no_of_eval_subsets))
|
|
|
473 |
st.dataframe(df_wer_per_system_from_per_dataset, height = h_df_per_system_per_dataset )
|
474 |
|
475 |
st.subheader("Boxplot showing {} per {} sorted by median values".format(metric, analysis_dim))
|
476 |
+
fig = box_plot_per_dimension_subsets(df_per_dataset, metric, analysis_dim, "{} per {} for dataset {}".format(metric, analysis_dim, dataset_short_name), analysis_dim +' of dataset ' + dataset_short_name , metric + " (%)", "system")
|
477 |
st.pyplot(fig, clear_figure=True, use_container_width=True)
|
478 |
|
479 |
### IMPACT OF NORMALIZATION ON ERROR RATES #####
|
|
|
528 |
|
529 |
dataset = st.selectbox("Select Dataset", datasets, index=datasets.index('amu-cai/pl-asr-bigos-v2-secret'), key="select_dataset_scenarios")
|
530 |
|
531 |
+
if dataset == "amu-cai/pl-asr-bigos-v2-secret":
|
532 |
+
dataset_short_name = "BIGOS"
|
533 |
+
elif dataset == "pelcra/pl-asr-pelcra-for-bigos-secret":
|
534 |
+
dataset_short_name = "PELCRA"
|
535 |
+
else:
|
536 |
+
dataset_short_name = "UNKNOWN"
|
537 |
+
|
538 |
# read the latest results for the selected dataset
|
539 |
print("Reading the latest results for dataset: ", dataset)
|
540 |
df_per_sample_all, df_per_dataset_all = read_latest_results(dataset, split, codename_to_shortname_mapping)
|
|
|
580 |
st.subheader("Best and worst systems for dataset {}".format(dataset))
|
581 |
df_best_worse_systems = pd.DataFrame(data, columns=header)
|
582 |
# do not display index
|
583 |
+
st.dataframe(df_best_worse_systems, hide_index=True)
|
584 |
|
585 |
st.subheader("Comparison of average WER for best systems")
|
586 |
df_per_dataset_best_systems = df_per_dataset_with_asr_systems_meta[df_per_dataset_with_asr_systems_meta['system'].isin([free_system_with_best_wer, commercial_system_with_best_wer])]
|
|
|
635 |
# Y is the average WER
|
636 |
# make each point a different color
|
637 |
# provide legend with system names
|
638 |
+
fig, ax = plt.subplots(figsize=(10, 7))
|
639 |
+
|
640 |
+
# Define larger jitter for close points
|
641 |
+
jitter_x = 5
|
642 |
+
jitter_y = 0.2
|
643 |
+
|
644 |
+
# Alternate marker shapes to distinguish overlapping points
|
645 |
+
marker_styles = ['o', 's', 'D', '^', 'v', '<', '>'] # Circle, square, diamond, and other shapes
|
646 |
+
marker_dict = {system: marker_styles[i % len(marker_styles)] for i, system in enumerate(free_systems_wer['system'].unique())}
|
647 |
+
|
648 |
for system in free_systems_wer['system'].unique():
|
649 |
subset = free_systems_wer[free_systems_wer['system'] == system]
|
650 |
+
marker_style = marker_dict[system]
|
651 |
+
|
652 |
+
# Scatter plot with distinct marker shapes for each system
|
653 |
+
ax.scatter(
|
654 |
+
subset['Parameters [M]'] + jitter_x * (np.random.rand(len(subset)) - 0.5), # Apply jitter to x for overlap
|
655 |
+
subset['WER'] + jitter_y * (np.random.rand(len(subset)) - 0.5), # Apply jitter to y for overlap
|
656 |
+
label=system, s=100, alpha=0.7, edgecolor='black', marker=marker_style
|
657 |
+
)
|
658 |
+
|
659 |
+
# Add text annotations with dynamic positioning to avoid overlap with y-axis
|
660 |
for i, point in subset.iterrows():
|
661 |
+
# Adjust position to avoid overlap with y-axis
|
662 |
+
x_offset = 10 if point['Parameters [M]'] < 50 else -10 if i % 2 == 1 else 10 # Push right if close to y-axis
|
663 |
+
y_offset = -0.5 if i % 2 == 0 else 0.5 # Alternate vertical offset
|
664 |
+
|
665 |
+
ax.annotate(
|
666 |
+
point['system'],
|
667 |
+
(point['Parameters [M]'], point['WER']),
|
668 |
+
textcoords="offset points",
|
669 |
+
xytext=(x_offset, y_offset),
|
670 |
+
ha='right' if x_offset < 0 else 'left',
|
671 |
+
fontsize=10,
|
672 |
+
bbox=dict(boxstyle="round,pad=0.3", edgecolor='white', facecolor='white', alpha=0.7)
|
673 |
+
)
|
674 |
+
|
675 |
+
# Set axis labels and title
|
676 |
+
ax.set_xlabel('Model Size [M Parameters]', fontsize=12)
|
677 |
+
ax.set_ylabel('WER (%)', fontsize=12)
|
678 |
+
ax.set_title(f'WER vs. Model Size for Dataset {dataset_short_name}', fontsize=14, pad=20)
|
679 |
+
|
680 |
+
# Adjust legend settings to fit outside the main plot area
|
681 |
+
ax.legend(
|
682 |
+
title='System', bbox_to_anchor=(0.8, 1), loc='upper left',
|
683 |
+
fontsize=8, title_fontsize=9, frameon=True, shadow=False, facecolor='white')
|
684 |
+
#)
|
685 |
+
|
686 |
+
# Add grid lines and minor ticks for better readability
|
687 |
+
ax.grid(True, linestyle='--', alpha=0.5)
|
688 |
+
ax.minorticks_on()
|
689 |
+
ax.tick_params(which='both', direction='in', top=True, right=True)
|
690 |
+
|
691 |
+
|
692 |
+
# increase granularity of y-axis to 20 points per whole range
|
693 |
+
# Set y-axis limits: lower bound at 0, upper bound to next highest multiple of 5
|
694 |
+
y_min = 0
|
695 |
+
y_max = ax.get_ylim()[1] # Get the current maximum y value
|
696 |
+
y_max_rounded = np.ceil(y_max / 5) * 5 # Round y_max up to the next highest multiple of 5
|
697 |
+
ax.set_ylim(y_min, y_max_rounded)
|
698 |
+
|
699 |
+
# Improve layout spacing
|
700 |
+
plt.tight_layout()
|
701 |
+
|
702 |
+
# Display the plot
|
703 |
st.pyplot(fig)
|
704 |
|
705 |
+
|
706 |
##################################################################################################################################################
|
707 |
# WER per audio duration
|
708 |
|
|
|
739 |
# print dataframe in streamlit
|
740 |
st.dataframe(df_per_sample_wer_audio_pivot)
|
741 |
|
742 |
+
# create scatter plot with WER in function of audio duration
|
|
|
|
|
|
|
|
|
743 |
fig, ax = plt.subplots()
|
744 |
for system in selected_systems:
|
745 |
subset = df_per_sample_wer_audio[df_per_sample_wer_audio['system'] == system]
|
|
|
760 |
audio_feature_to_analyze = 'speech_rate_words'
|
761 |
audio_feature_unit = ' [words/s]'
|
762 |
metric = 'WER'
|
763 |
+
metric_unit = ' (%)'
|
764 |
no_of_buckets = 10
|
765 |
# calculate average WER per speech-rate bucket for the best free and the best commercial system
|
766 |
selected_systems = [free_system_with_best_wer, commercial_system_with_best_wer]
|
|
|
770 |
# print dataframe in streamlit
|
771 |
st.dataframe(df_per_sample_wer_feature_pivot)
|
772 |
|
773 |
+
# Set a threshold to remove outliers - here we use the 97th percentile of WER
|
774 |
+
threshold = df_per_sample_wer_feature[metric].quantile(0.97)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
775 |
|
776 |
+
# Remove data points with WER greater than the threshold
|
777 |
+
filtered_df = df_per_sample_wer_feature[df_per_sample_wer_feature[metric] <= threshold]
|
778 |
+
|
779 |
+
# Create figure and axis with larger size
|
780 |
+
fig, ax = plt.subplots(figsize=(10, 7))
|
781 |
+
|
782 |
+
# Scatter plot for each system
|
783 |
+
for system in selected_systems:
|
784 |
+
subset = filtered_df[filtered_df['system'] == system]
|
785 |
+
ax.scatter(subset[audio_feature_to_analyze],
|
786 |
+
subset[metric],
|
787 |
+
label=system,
|
788 |
+
s=subset['number_of_samples'] * 0.5,
|
789 |
+
alpha=0.6) # Set alpha for better visibility of overlapping points
|
790 |
+
|
791 |
+
# Adding a trend line using LOWESS
|
792 |
+
lowess = sm.nonparametric.lowess
|
793 |
+
trend = lowess(subset[metric], subset[audio_feature_to_analyze], frac=0.3) # Adjust frac to control smoothing
|
794 |
+
ax.plot(trend[:, 0], trend[:, 1], label=f'{system} Trend', linestyle='-', linewidth=2)
|
795 |
+
|
796 |
+
# Set axis labels with improved formatting for readability
|
797 |
+
ax.set_xlabel(audio_feature_to_analyze.replace('_', ' ').capitalize() + ' ' + audio_feature_unit )
|
798 |
+
ax.set_ylabel(metric + ' ' + metric_unit )
|
799 |
+
|
800 |
+
# Set an improved title that is more informative
|
801 |
+
ax.set_title('Word Error Rate (WER) vs Speech Rate\nBest Performing Free and Paid Systems', fontsize=14)
|
802 |
+
|
803 |
+
# increase granularity of y-axis to 20 points per whole range
|
804 |
+
# Set y-axis limits: lower bound at 0, upper bound to next highest multiple of 5
|
805 |
+
y_min = 0
|
806 |
+
y_max = ax.get_ylim()[1] # Get the current maximum y value
|
807 |
+
y_max_rounded = np.ceil(y_max / 5) * 5 # Round y_max up to the next highest multiple of 5
|
808 |
+
ax.set_ylim(y_min, y_max_rounded)
|
809 |
+
|
810 |
+
# Add a grid to improve readability and alignment
|
811 |
+
ax.grid(True, linestyle='--', alpha=0.7)
|
812 |
+
|
813 |
+
# Anchor the legend's upper-right corner at (0.95, 1) in axes coordinates (this keeps it inside the plot area)
|
814 |
+
ax.legend(title='System', loc='upper right', bbox_to_anchor=(0.95, 1))
|
815 |
+
|
816 |
+
# Add tight layout to improve spacing between elements
|
817 |
+
fig.tight_layout()
|
818 |
+
|
819 |
+
# Display the plot
|
820 |
st.pyplot(fig)
|
821 |
|
822 |
|
823 |
+
|
824 |
################################################################################################################################################
|
825 |
# WER PER GENDER
|
826 |
|
constants.py
CHANGED
@@ -1,7 +1,8 @@
|
|
1 |
ABOUT_INFO = "Polish ASR leaderboard by [AMU-CAI team](https://huggingface.co/amu-cai) aims to provide comprehensive overview of performance of ASR/STT systems for Polish. <br>\
|
2 |
The leaderboard currently supports [BIGOS V2](https://huggingface.co/datasets/amu-cai/pl-asr-bigos-v2) and [PELCRA for BIGOS](https://huggingface.co/datasets/pelcra/pl-asr-pelcra-for-bigos) datasets.<br>\
|
|
|
3 |
To learn more please read blog post [here](https://huggingface.co/blog/michaljunczyk/introducing-polish-asr-leaderboard).<br> \
|
4 |
-
If you use this work, please
|
5 |
```@misc{amu_cai_pl_asr_leaderboard, \
|
6 |
author = {Michał Junczyk}, \
|
7 |
title = {{AMU Polish ASR Leaderboard}}, \
|
|
|
1 |
ABOUT_INFO = "Polish ASR leaderboard by [AMU-CAI team](https://huggingface.co/amu-cai) aims to provide comprehensive overview of performance of ASR/STT systems for Polish. <br>\
|
2 |
The leaderboard currently supports [BIGOS V2](https://huggingface.co/datasets/amu-cai/pl-asr-bigos-v2) and [PELCRA for BIGOS](https://huggingface.co/datasets/pelcra/pl-asr-pelcra-for-bigos) datasets.<br>\
|
3 |
+
If you want to add your system or dataset to the leaderboard, please contact Michał Junczyk ([email protected]) or open a pull request on [GitHub](https://github.com/goodmike31/pl-asr-bigos-tools) <br>\
|
4 |
To learn more please read blog post [here](https://huggingface.co/blog/michaljunczyk/introducing-polish-asr-leaderboard).<br> \
|
5 |
+
If you use this work, please cite it as follows: <br> \
|
6 |
```@misc{amu_cai_pl_asr_leaderboard, \
|
7 |
author = {Michał Junczyk}, \
|
8 |
title = {{AMU Polish ASR Leaderboard}}, \
|
requirements.txt
CHANGED
@@ -1,2 +1,2 @@
|
|
1 |
seaborn
|
2 |
-
|
|
|
1 |
seaborn
|
2 |
+
statsmodels
|
utils.py
CHANGED
@@ -9,22 +9,10 @@ from datasets import Dataset
|
|
9 |
from huggingface_hub import hf_hub_download
|
10 |
import matplotlib.patches as mpatches
|
11 |
import matplotlib as mpl
|
|
|
|
|
12 |
|
13 |
|
14 |
-
asr_systems_colors_mapping = {
|
15 |
-
'azure': '#1f77b4', # Blue
|
16 |
-
'google': '#2ca02c', # Green
|
17 |
-
'wav2vec2': '#d62728', # Red
|
18 |
-
'nemo': '#9467bd', # Purple
|
19 |
-
'assemblyai': '#8c564b', # Brown
|
20 |
-
'mms': '#e377c2', # Pink
|
21 |
-
'google_v2': '#7f7f7f', # Gray
|
22 |
-
'whisper_cloud': '#bcbd22', # Olive
|
23 |
-
'whisper_local': '#ff7f0e', # Orange
|
24 |
-
|
25 |
-
# Add or override other systems and their colors
|
26 |
-
}
|
27 |
-
|
28 |
def download_tsv_from_google_sheet(sheet_url):
|
29 |
# Modify the Google Sheet URL to export it as TSV
|
30 |
tsv_url = sheet_url.replace('/edit#gid=', '/export?format=tsv&gid=')
|
@@ -181,7 +169,7 @@ def filter_bottom_outliers(df_input, metric, min_threshold):
|
|
181 |
|
182 |
def box_plot_per_dimension(df_input, metric, dimension, title, xlabel, ylabel):
|
183 |
# Box plot for WER per dataset
|
184 |
-
fig, ax = plt.subplots(figsize=(
|
185 |
|
186 |
# generate box plot without outliers
|
187 |
sns.boxplot(x=dimension, y=metric, data=df_input, order=df_input.groupby(dimension)[metric].median().sort_values().index, showfliers=False)
|
@@ -193,6 +181,90 @@ def box_plot_per_dimension(df_input, metric, dimension, title, xlabel, ylabel):
|
|
193 |
#return figure
|
194 |
return plt
|
195 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
196 |
def box_plot_per_dimension_with_colors(df_input, metric, dimension, title, xlabel, ylabel, system_col, type_col):
|
197 |
# Create a figure and axis object
|
198 |
fig, ax = plt.subplots(figsize=(12, 8))
|
|
|
9 |
from huggingface_hub import hf_hub_download
|
10 |
import matplotlib.patches as mpatches
|
11 |
import matplotlib as mpl
|
12 |
+
from constants import asr_systems_colors_mapping
|
13 |
+
from matplotlib.lines import Line2D
|
14 |
|
15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
def download_tsv_from_google_sheet(sheet_url):
|
17 |
# Modify the Google Sheet URL to export it as TSV
|
18 |
tsv_url = sheet_url.replace('/edit#gid=', '/export?format=tsv&gid=')
|
|
|
169 |
|
170 |
def box_plot_per_dimension(df_input, metric, dimension, title, xlabel, ylabel):
|
171 |
# Box plot for WER per dataset
|
172 |
+
fig, ax = plt.subplots(figsize=(12, 8))
|
173 |
|
174 |
# generate box plot without outliers
|
175 |
sns.boxplot(x=dimension, y=metric, data=df_input, order=df_input.groupby(dimension)[metric].median().sort_values().index, showfliers=False)
|
|
|
181 |
#return figure
|
182 |
return plt
|
183 |
|
184 |
+
def box_plot_per_dimension_subsets(df_input, metric, dimension, title, xlabel, ylabel, category_column, y_limit=100):
    """
    Plot a box plot of `metric` per `dimension` with the individual data points overlaid.

    Boxes are ordered by the median of `metric` and drawn without fliers.
    Every row of `df_input` whose `metric` is <= `y_limit` is additionally drawn
    as a jittered point; points are colored and shaped by `category_column`.

    Parameters:
    - df_input (pd.DataFrame): Input DataFrame containing data to plot.
    - metric (str): Column name for the metric to plot on the y-axis.
    - dimension (str): Column name for the dimension (x-axis categories).
    - title (str): Title of the plot.
    - xlabel (str): Label for the x-axis.
    - ylabel (str): Label for the y-axis.
    - category_column (str): Column name to use for differentiating data points by color and marker.
    - y_limit (float, optional): Points with a metric value above this limit are not drawn.
      It is also used as the basis of the y-axis range when no percentile can be
      computed from the data.

    Returns:
    - fig: The matplotlib figure object.
    """
    # Local import so the block does not depend on the file's import section.
    import math

    # Set up the figure and axis with a larger size for readability
    fig, ax = plt.subplots(figsize=(14, 8))

    # Create a sorted order for the dimension based on the median values of the metric
    order = df_input.groupby(dimension)[metric].median().sort_values().index

    # Generate box plot without showing extreme outliers
    sns.boxplot(
        x=dimension, y=metric, data=df_input,
        order=order, showfliers=False, width=0.6, ax=ax,
        color="white"
    )

    # Make the boxes translucent. Depending on the matplotlib version the box
    # patches are stored in `ax.artists` (older releases) or `ax.patches`
    # (newer releases), so both containers are visited; iterating only
    # `ax.artists` silently does nothing on newer releases. This runs before
    # the strip plots are added, so at this point the axes holds only what the
    # box plot created. An RGBA face color is used instead of `set_alpha` so
    # that the box outlines stay fully opaque.
    for patch in list(ax.artists) + list(ax.patches):
        if hasattr(patch, "set_facecolor"):
            patch.set_facecolor((1.0, 1.0, 1.0, 0.2))

    # Define category-specific colors and marker styles
    categories = df_input[category_column].unique()
    markers = ['o', 's', '^', 'D', 'X', 'P', '*']  # Different marker styles
    colors = sns.color_palette("Set2", len(categories))  # Use a color palette with distinct colors
    # Markers (and colors, if the palette is shorter) are reused cyclically
    # when there are more categories than styles.
    category_style_map = {
        category: {'color': colors[i % len(colors)], 'marker': markers[i % len(markers)]}
        for i, category in enumerate(categories)
    }

    # Overlay individual data points with category-specific colors and markers
    for category, style in category_style_map.items():
        # Keep only this category's rows that do not exceed y_limit
        category_data = df_input[(df_input[category_column] == category) & (df_input[metric] <= y_limit)]
        sns.stripplot(
            x=dimension, y=metric, data=category_data,
            order=order, color=style['color'], marker=style['marker'],
            size=5, jitter=True, alpha=1, ax=ax
        )

    # Set title and axis labels
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')

    # Add gridlines for easier comparison (on this axes explicitly, not on
    # whatever pyplot currently considers the active axes)
    ax.grid(axis='y', linestyle='--', alpha=0.5)

    # Set y-axis limit to improve readability:
    # the upper bound is the next multiple of 5 above the 99th percentile of the
    # metric, so that a few extreme values do not stretch the axis.
    # Points between that bound and y_limit are still plotted but fall outside
    # the visible range.
    max_value = df_input[metric].quantile(0.99)
    if not math.isfinite(max_value):
        # No usable percentile (e.g. no non-NaN metric values); int(NaN) would
        # raise ValueError, so fall back to the caller-provided limit.
        max_value = y_limit

    y_max = (int(max_value / 5) + 1) * 5

    # Set y-axis ticks with evenly spaced intervals of 5
    ax.set_yticks(range(0, y_max + 1, 5))
    ax.set_ylim(0, y_max)

    # Create a custom legend with unique entries for each category
    legend_handles = [
        Line2D([0], [0], marker=style['marker'], color='w', markerfacecolor=style['color'], markersize=8, label=category)
        for category, style in category_style_map.items()
    ]
    ax.legend(handles=legend_handles, title=category_column, bbox_to_anchor=(1.05, 1), loc='upper left')

    # Return the updated figure
    return fig
|
266 |
+
|
267 |
+
|
268 |
def box_plot_per_dimension_with_colors(df_input, metric, dimension, title, xlabel, ylabel, system_col, type_col):
|
269 |
# Create a figure and axis object
|
270 |
fig, ax = plt.subplots(figsize=(12, 8))
|