mj-new committed on
Commit 22f3279 • 1 Parent(s): 7d13965

new version with secret passed through hugging face repo

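For context, the "secret" in the commit title refers to a Hugging Face access token exposed to the Space as the HF_TOKEN secret. A minimal sketch of how such a token is read and used, assuming it is forwarded to datasets.load_dataset — the app below only reads the variable at startup; the actual loading happens in utils.read_latest_results, which is not part of this commit, and token= requires a recent datasets release (older versions use use_auth_token=):

import os
from datasets import load_dataset

# Read the secret injected by Hugging Face from the Space's "Secrets" settings
hf_token = os.getenv("HF_TOKEN")
if hf_token is None:
    raise ValueError("HF_TOKEN is not set; add it in the Space secrets settings.")

# Hypothetical usage: authenticate when loading one of the gated eval datasets
ds = load_dataset("amu-cai/pl-asr-bigos-v2-secret", split="test", token=hf_token)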
.python-version ADDED
@@ -0,0 +1 @@
+ streamlit
README.md CHANGED
@@ -1,10 +1,10 @@
  ---
- title: Pl Asr Leaderboard
- emoji: 🔥
- colorFrom: gray
- colorTo: indigo
+ title: AMU ASR Leaderboard (PL)
+ emoji: 📊
+ colorFrom: blue
+ colorTo: gray
  sdk: streamlit
- sdk_version: 1.36.0
+ sdk_version: 1.32.0
  app_file: app.py
  pinned: false
  license: cc-by-nc-sa-4.0
__pycache__/app_utils.cpython-310.pyc ADDED
Binary file (2.39 kB).

__pycache__/constants.cpython-310.pyc ADDED
Binary file (1.21 kB).

__pycache__/utils.cpython-310.pyc ADDED
Binary file (9.92 kB).

app-backup.py ADDED
@@ -0,0 +1,1063 @@
+ import os
+ import streamlit as st
+ import pandas as pd
+ from constants import BIGOS_INFO, PELCRA_INFO, ANALYSIS_INFO, ABOUT_INFO, INSPECTION_INFO
+ from utils import read_latest_results, basic_stats_per_dimension, retrieve_asr_systems_meta_from_the_catalog, box_plot_per_dimension, get_total_audio_duration, check_impact_of_normalization, calculate_wer_per_meta_category, calculate_wer_per_audio_feature
+ from app_utils import calculate_height_to_display, filter_dataframe
+ import matplotlib.pyplot as plt
+ import numpy as np
+ from datasets import load_dataset
+
+ hf_token = os.getenv('HF_TOKEN')
+ if hf_token is None:
+     raise ValueError("HF_TOKEN environment variable is not set. Please check your secrets settings.")
+
+ # Tabs:
+ # About - description, references, contact points
+ # Analysis and insights - questions and answers about the benchmark results
+ # Leaderboard - BIGOS
+ # Leaderboard - PELCRA
+ # TODO - add tabs for other datasets, e.g. hallucinations, children's speech, etc.
+
+ st.set_page_config(layout="wide")
+
+ about, lead_bigos, lead_bigos_diagnostic, lead_bigos_synth, lead_pelcra, analysis, inspection = st.tabs(["About BIGOS benchmark", "AMU BIGOS-v2 leaderboard", "AMU BIGOS-diagnostic leaderboard", "AMU BIGOS-med leaderboard", "PELCRA4BIGOS leaderboard", "Analysis", "Data and results inspection"])
+
+ cols_to_select_all = ["system", "subset", "ref_type", "norm_type", "SER", "MER", "WER", "CER"]
+
+ def plot_performance(systems_to_plot, df_per_system_with_type):
+     # Get unique subsets
+     subsets = df_per_system_with_type['subset'].unique()
+
+     # Create a color and label map
+     # (relies on the module-level best/worst system variables assigned in the
+     # Analysis tab before this function is called)
+     color_label_map = {
+         free_system_with_best_wer: ('blue', 'Best Free'),
+         free_system_with_worst_wer: ('red', 'Worst Free'),
+         commercial_system_with_best_wer: ('green', 'Best Paid'),
+         commercial_system_with_worst_wer: ('orange', 'Worst Paid')
+     }
+
+     # Plot the data
+     fig, ax = plt.subplots(figsize=(14, 7))
+
+     bar_width = 0.3
+     index = np.arange(len(subsets))
+
+     for i, system in enumerate(systems_to_plot):
+         subset_wer = df_per_system_with_type[df_per_system_with_type['system'] == system].set_index('subset')['WER']
+         color, label = color_label_map[system]
+         ax.bar(index + i * bar_width, subset_wer.loc[subsets], bar_width, label=label + ' - ' + system, color=color)
+
+     # Add labels and title
+     ax.set_xlabel('Subset')
+     ax.set_ylabel('WER (%)')
+     ax.set_title('Comparison of performance of ASR systems')
+     ax.set_xticks(index + bar_width * 1.5)
+     ax.set_xticklabels(subsets, rotation=90, ha='right')
+     ax.legend()
+
+     st.pyplot(fig)
+
+ def round_to_nearest(value, multiple):
+     return multiple * round(value / multiple)
+
+ def create_bar_chart(df, systems, metric, norm_type, ref_type='orig', orientation='vertical'):
+     df = df[df['norm_type'] == norm_type]
+     df = df[df['ref_type'] == ref_type]
+
+     # Prepare the data for the bar chart
+     subsets = df['subset'].unique()
+     num_vars = len(subsets)
+     bar_width = 0.2  # Width of the bars
+
+     fig, ax = plt.subplots(figsize=(10, 10))
+
+     max_value_all_systems = 0
+     for i, system in enumerate(systems):
+         system_data = df[df['system'] == system]
+         max_value_for_system = max(system_data[metric])
+         if max_value_for_system > max_value_all_systems:
+             max_value_all_systems = round_to_nearest(max_value_for_system + 2, 10)
+
+         # Ensure the system data is in the same order as subsets
+         values = []
+         for subset in subsets:
+             subset_value = system_data[system_data['subset'] == subset][metric].values
+             if len(subset_value) > 0:
+                 values.append(subset_value[0])
+             else:
+                 values.append(0)  # Append 0 if the subset value is missing
+
+         if orientation == 'vertical':
+             # Plot each system's bars with an offset for vertical orientation
+             x_pos = np.arange(len(subsets)) + i * bar_width
+             ax.bar(x_pos, values, bar_width, label=system)
+             # Add value labels
+             for j, value in enumerate(values):
+                 ax.text(x_pos[j], value + max(values) * 0.03, f'{value}', ha='center', va='bottom', fontsize=6)
+         else:
+             # Plot each system's bars with an offset for horizontal orientation
+             y_pos = np.arange(len(subsets)) + i * bar_width
+             ax.barh(y_pos, values, bar_width, label=system)
+             # Add value labels
+             for j, value in enumerate(values):
+                 ax.text(value + max(values) * 0.03, y_pos[j], f'{value}', ha='left', va='center', fontsize=6)
+
+     if orientation == 'vertical':
+         ax.set_xticks(np.arange(len(subsets)) + bar_width * (len(systems) - 1) / 2)
+         ax.set_xticklabels(subsets, rotation=45, ha='right')
+         ax.set_ylabel(metric)
+     else:
+         ax.set_yticks(np.arange(len(subsets)) + bar_width * (len(systems) - 1) / 2)
+         ax.set_yticklabels(subsets)
+         ax.set_xlabel(metric)
+
+     # Add grid values for the vertical and horizontal bar plots
+     if orientation == 'vertical':
+         ax.set_yticks(np.linspace(0, max_value_all_systems, 5))
+     else:
+         ax.set_xticks(np.linspace(0, max_value_all_systems, 5))
+
+     # Put the legend on the right side, outside of the plot
+     plt.legend(loc='upper right', bbox_to_anchor=(1.2, 1), shadow=True, ncol=1)
+
+     st.pyplot(fig)
+
+ def create_radar_plot(df, enable_labels, systems, metric, norm_type, ref_type='orig'):
+     df = df[df['norm_type'] == norm_type]
+     df = df[df['ref_type'] == ref_type]
+
+     # Prepare the data for the radar plot
+     #systems = df['system'].unique()
+     subsets = df['subset'].unique()
+     num_vars = len(subsets)
+
+     angles = np.linspace(0, 2 * np.pi, num_vars, endpoint=False).tolist()
+     angles += angles[:1]  # Complete the loop
+
+     fig, ax = plt.subplots(figsize=(10, 10), subplot_kw=dict(polar=True))
+
+     max_value_all_systems = 0
+     for system in systems:
+         system_data = df[df['system'] == system]
+         max_value_for_system = max(system_data[metric])
+         if max_value_for_system > max_value_all_systems:
+             max_value_all_systems = round_to_nearest(max_value_for_system + 2, 10)
+
+         # Ensure the system data is in the same order as subsets
+         values = []
+         for subset in subsets:
+             subset_value = system_data[system_data['subset'] == subset][metric].values
+             if len(subset_value) > 0:
+                 values.append(subset_value[0])
+             else:
+                 values.append(0)  # Append 0 if the subset value is missing
+
+         values += values[:1]  # Complete the loop
+
+         # Plot each system
+         ax.plot(angles, values, label=system)
+         ax.fill(angles, values, alpha=0.25)
+
+         # Add value labels
+         for angle, value in zip(angles, values):
+             ax.text(angle, value + max(values) * 0.01, f'{value}', ha='center', va='center', fontsize=6)
+
+     # set the tick positions explicitly before labeling them
+     ax.set_xticks(angles[:-1])
+     ax.set_xticklabels(subsets)
+
+     ax.set_yticks(np.linspace(0, max_value_all_systems, 5))
+
+     # put the legend on the right side, outside of the plot
+     plt.legend(loc='upper right', bbox_to_anchor=(1.2, 1), shadow=True, ncol=1)
+
+     st.pyplot(fig)
+
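Aside: the loop-closing trick in create_radar_plot (appending the first angle and value again) is what joins the radar polygon. A minimal standalone sketch of the same technique, with made-up subset names and WER values:

import numpy as np
import matplotlib.pyplot as plt

labels = ["read", "spont", "call", "lecture"]   # made-up subset names
values = [12.3, 28.9, 35.1, 19.4]               # made-up WER values
angles = np.linspace(0, 2 * np.pi, len(labels), endpoint=False).tolist()
angles += angles[:1]   # repeat the first angle...
values += values[:1]   # ...and the first value, closing the polygon

fig, ax = plt.subplots(subplot_kw=dict(polar=True))
ax.plot(angles, values)
ax.fill(angles, values, alpha=0.25)
ax.set_xticks(angles[:-1])
ax.set_xticklabels(labels)
plt.show()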
+ with about:
+     st.title("About BIGOS benchmark")
+     st.markdown(ABOUT_INFO, unsafe_allow_html=True)
+     # TODO - load and display more info about the BIGOS benchmark
+
+     # Table - evaluated systems # TODO - change to concatenated table
+     st.header("Evaluated ASR systems")
+     dataset = "amu-cai/pl-asr-bigos-v2-secret"
+     split = "test"
+     df_per_sample, df_per_dataset = read_latest_results(dataset, split, codename_to_shortname_mapping=None)
+     evaluated_systems_list = df_per_sample["system"].unique()
+     #print("ASR systems available in the eval results for dataset {}: ".format(dataset), evaluated_systems_list)
+
+     df_evaluated_systems = retrieve_asr_systems_meta_from_the_catalog(evaluated_systems_list)
+     codename_to_shortname_mapping = dict(zip(df_evaluated_systems["Codename"], df_evaluated_systems["Shortname"]))
+     #print(codename_to_shortname_mapping)
+
+     h_df_systems = calculate_height_to_display(df_evaluated_systems)
+
+     df_evaluated_systems_types_and_count = df_evaluated_systems["Type"].value_counts().reset_index()
+     df_evaluated_systems_types_and_count.columns = ["Type", "Count"]
+     st.write("Evaluated ASR systems types")
+
+     st.dataframe(df_evaluated_systems_types_and_count, hide_index=True, use_container_width=False)
+
+     st.write("Evaluated ASR systems details")
+
+     # TODO - add info on who created each system (company, institution, team, etc.)
+     st.dataframe(df_evaluated_systems, hide_index=True, height=h_df_systems, use_container_width=True)
+
+     # Table - evaluation datasets
+     # Table - evaluation metrics
+     # Table - evaluation metadata
+     # List - references
+     # List - contact points
+     # List - acknowledgements
+     # List - changelog
+     # List - FAQ
+     # List - TODOs
+
+ with lead_bigos:
+
+     # configuration for this tab
+     dataset = "amu-cai/pl-asr-bigos-v2-secret"
+     dataset_short_name = "BIGOS"
+     dataset_version = "V2"
+     eval_date = "March 2024"
+     split = "test"
+     norm_type = "all"
+     ref_type = "orig"
+
+     # common, reusable part for all tabs presenting leaderboards for specific datasets
+     #### DATA LOADING AND AUGMENTATION ####
+     df_per_sample_all, df_per_dataset_all = read_latest_results(dataset, split, codename_to_shortname_mapping)
+
+     # keep only the ref_type and norm_type we want to analyze
+     df_per_sample = df_per_sample_all[(df_per_sample_all["ref_type"] == ref_type) & (df_per_sample_all["norm_type"] == norm_type)]
+     df_per_dataset = df_per_dataset_all[(df_per_dataset_all["ref_type"] == ref_type) & (df_per_dataset_all["norm_type"] == norm_type)]
+
+     ##### PARAMETERS CALCULATION ####
+     evaluated_systems_list = df_per_sample["system"].unique()
+     no_of_evaluated_systems = len(evaluated_systems_list)
+     no_of_eval_subsets = len(df_per_dataset["subset"].unique())
+     no_of_test_cases = len(df_per_sample)
+     no_of_unique_recordings = len(df_per_sample["id"].unique())
+     total_audio_duration_hours = get_total_audio_duration(df_per_sample)
+     no_of_unique_speakers = len(df_per_sample["speaker_id"].unique())
+
+     df_per_dataset_with_asr_systems_meta = pd.merge(df_per_dataset, df_evaluated_systems, how="left", left_on="system", right_on="Shortname")
+
+     ########### EVALUATION PARAMETERS PRESENTATION ################
+     st.title("Leaderboard for {} {}".format(dataset_short_name, dataset_version))
+     st.markdown(BIGOS_INFO, unsafe_allow_html=True)
+     st.markdown("**Evaluation date:** {}".format(eval_date))
+     st.markdown("**Number of evaluated system-model variants:** {}".format(no_of_evaluated_systems))
+     st.markdown("**Number of evaluated subsets:** {}".format(no_of_eval_subsets))
+     st.markdown("**Number of evaluated system-model-subset combinations:** {}".format(len(df_per_dataset)))
+     st.markdown("**Number of unique speakers:** {}".format(no_of_unique_speakers))
+     st.markdown("**Number of unique recordings used for evaluation:** {}".format(no_of_unique_recordings))
+     st.markdown("**Total size of the dataset:** {:.2f} hours".format(total_audio_duration_hours))
+     st.markdown("**Total number of test cases (audio-hypothesis pairs):** {}".format(no_of_test_cases))
+     st.markdown("**Dataset:** {}".format(dataset))
+     st.markdown("**Dataset version:** {}".format(dataset_version))
+     st.markdown("**Split:** {}".format(split))
+     st.markdown("**Text reference type:** {}".format(ref_type))
+     st.markdown("**Normalization steps:** {}".format(norm_type))
+
+     ########### RESULTS ################
+     st.header("WER (Word Error Rate) analysis")
+     st.subheader("Average WER for the whole dataset")
+     df_wer_avg = basic_stats_per_dimension(df_per_dataset, "WER", "dataset")
+     st.dataframe(df_wer_avg)
+
+     st.subheader("Comparison of average WER for free and commercial systems")
+     df_wer_avg_free_commercial = basic_stats_per_dimension(df_per_dataset_with_asr_systems_meta, "WER", "Type")
+     st.dataframe(df_wer_avg_free_commercial)
+
+     ##################### PER SYSTEM ANALYSIS #########################
+     analysis_dim = "system"
+     metric = "WER"
+     st.subheader("Table showing {} per {} sorted by median values".format(metric, analysis_dim))
+     df_wer_per_system_from_per_dataset = basic_stats_per_dimension(df_per_dataset, metric, analysis_dim)
+     h_df_per_system_per_dataset = calculate_height_to_display(df_wer_per_system_from_per_dataset)
+     st.dataframe(df_wer_per_system_from_per_dataset, height=h_df_per_system_per_dataset)
+
+     st.subheader("Boxplot showing {} per {} sorted by median values".format(metric, analysis_dim))
+     fig = box_plot_per_dimension(df_per_dataset, metric, analysis_dim, "{} per {}".format(metric, analysis_dim), analysis_dim, metric + " [%]")
+     st.pyplot(fig, clear_figure=True, use_container_width=True)
+
+     ##################### PER SUBSET ANALYSIS #########################
+     analysis_dim = "subset"
+     metric = "WER"
+     st.subheader("Table showing {} per {} sorted by median values".format(metric, analysis_dim))
+     df_wer_per_system_from_per_dataset = basic_stats_per_dimension(df_per_dataset, metric, analysis_dim)
+     h_df_per_system_per_dataset = calculate_height_to_display(df_wer_per_system_from_per_dataset)
+     st.dataframe(df_wer_per_system_from_per_dataset, height=h_df_per_system_per_dataset)
+
+     st.subheader("Boxplot showing {} per {} sorted by median values".format(metric, analysis_dim))
+     fig = box_plot_per_dimension(df_per_dataset, metric, analysis_dim, "{} per {}".format(metric, analysis_dim), analysis_dim, metric + " [%]")
+     st.pyplot(fig, clear_figure=True, use_container_width=True)
+
+     ### IMPACT OF NORMALIZATION ON ERROR RATES #####
+     # Calculate the average impact of various norm_types for all datasets and systems
+     df_per_dataset_selected_cols = df_per_dataset_all[cols_to_select_all]
+     diff_in_metrics = check_impact_of_normalization(df_per_dataset_selected_cols)
+     st.subheader("Impact of normalization of references and hypotheses on evaluation metrics")
+     st.dataframe(diff_in_metrics, use_container_width=False)
+
+     # Visualize the differences in metrics graphically, with data labels
+     fig, axs = plt.subplots(3, 2, figsize=(12, 12))
+     fig.subplots_adjust(hspace=0.6, wspace=0.6)
+
+     # remove the sixth (unused) subplot
+     fig.delaxes(axs[2, 1])
+
+     metrics = ['SER', 'WER', 'MER', 'CER', 'Average']
+     colors = ['blue', 'orange', 'green', 'red', 'purple']
+
+     for ax, metric, color in zip(axs.flatten(), metrics, colors):
+         bars = ax.bar(diff_in_metrics.index, diff_in_metrics[metric], color=color)
+         ax.set_title(f'Normalization impact on {metric}')
+         if metric == 'Average':
+             ax.set_title('Average normalization impact on all metrics')
+         ax.set_xlabel('Normalization Type')
+         ax.set_ylabel(f'Difference in {metric}')
+         ax.grid(True)
+         ax.set_xticks(range(len(diff_in_metrics.index)))
+         ax.set_xticklabels(diff_in_metrics.index, rotation=45, ha='right')
+         min_val = diff_in_metrics[metric].min()
+         ax.set_ylim([min_val * 1.1, diff_in_metrics[metric].max() * 1.1])
+
+         for bar in bars:
+             height = bar.get_height()
+             ax.annotate(f'{height:.2f}',
+                         xy=(bar.get_x() + bar.get_width() / 2, height),
+                         xytext=(0, -12),  # offset the label 12 points below the bar top
+                         textcoords="offset points",
+                         ha='center', va='bottom')
+
+     # Display the plot in Streamlit
+     st.pyplot(fig)
+
+     ##################### APPENDIX #########################
+     st.header("Appendix - Full evaluation results per subset for all evaluated systems")
+     # select only the columns we want to display
+     st.dataframe(df_per_dataset_selected_cols, hide_index=True, use_container_width=False)
+
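Note: the leaderboard tabs below repeat this block almost verbatim; only the dataset constants change. A hypothetical refactor, sketched under the assumption that the rendering stays identical across tabs (names mirror this file's globals), could hoist the shared logic into one helper:

def render_leaderboard(tab, dataset, short_name, version,
                       split="test", norm_type="all", ref_type="orig"):
    # Hypothetical helper; not part of this commit.
    with tab:
        df_per_sample_all, df_per_dataset_all = read_latest_results(
            dataset, split, codename_to_shortname_mapping)
        mask = ((df_per_dataset_all["ref_type"] == ref_type)
                & (df_per_dataset_all["norm_type"] == norm_type))
        df_per_dataset = df_per_dataset_all[mask]
        st.title("Leaderboard for {} {}".format(short_name, version))
        # ... shared stats, tables and plots as in the block above ...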
+ with lead_bigos_diagnostic:
+
+     # configuration for this tab
+     dataset = "amu-cai/pl-asr-bigos-v2-diagnostic"
+     dataset_short_name = "BIGOS DIAGNOSTIC"
+     dataset_version = "V2"
+     eval_date = "March 2024"
+     split = "test"
+     norm_type = "all"
+     ref_type = "orig"
+
+     # common, reusable part for all tabs presenting leaderboards for specific datasets
+     #### DATA LOADING AND AUGMENTATION ####
+     df_per_sample_all, df_per_dataset_all = read_latest_results(dataset, split, codename_to_shortname_mapping)
+
+     # keep only the ref_type and norm_type we want to analyze
+     df_per_sample = df_per_sample_all[(df_per_sample_all["ref_type"] == ref_type) & (df_per_sample_all["norm_type"] == norm_type)]
+     df_per_dataset = df_per_dataset_all[(df_per_dataset_all["ref_type"] == ref_type) & (df_per_dataset_all["norm_type"] == norm_type)]
+
+     ##### PARAMETERS CALCULATION ####
+     evaluated_systems_list = df_per_sample["system"].unique()
+     no_of_evaluated_systems = len(evaluated_systems_list)
+     no_of_eval_subsets = len(df_per_dataset["subset"].unique())
+     no_of_test_cases = len(df_per_sample)
+     no_of_unique_recordings = len(df_per_sample["id"].unique())
+     total_audio_duration_hours = get_total_audio_duration(df_per_sample)
+     #no_of_unique_speakers = len(df_per_sample["speaker_id"].unique())
+     no_of_unique_speakers = "N/A"
+     df_per_dataset_with_asr_systems_meta = pd.merge(df_per_dataset, df_evaluated_systems, how="left", left_on="system", right_on="Shortname")
+
+     ########### EVALUATION PARAMETERS PRESENTATION ################
+     st.title("Leaderboard for {} {}".format(dataset_short_name, dataset_version))
+     st.markdown(BIGOS_INFO, unsafe_allow_html=True)
+     st.markdown("**Evaluation date:** {}".format(eval_date))
+     st.markdown("**Number of evaluated system-model variants:** {}".format(no_of_evaluated_systems))
+     st.markdown("**Number of evaluated subsets:** {}".format(no_of_eval_subsets))
+     st.markdown("**Number of evaluated system-model-subset combinations:** {}".format(len(df_per_dataset)))
+     st.markdown("**Number of unique speakers:** {}".format(no_of_unique_speakers))
+     st.markdown("**Number of unique recordings used for evaluation:** {}".format(no_of_unique_recordings))
+     st.markdown("**Total size of the dataset:** {:.2f} hours".format(total_audio_duration_hours))
+     st.markdown("**Total number of test cases (audio-hypothesis pairs):** {}".format(no_of_test_cases))
+     st.markdown("**Dataset:** {}".format(dataset))
+     st.markdown("**Dataset version:** {}".format(dataset_version))
+     st.markdown("**Split:** {}".format(split))
+     st.markdown("**Text reference type:** {}".format(ref_type))
+     st.markdown("**Normalization steps:** {}".format(norm_type))
+
+     ########### RESULTS ################
+     st.header("WER (Word Error Rate) analysis")
+     st.subheader("Average WER for the whole dataset")
+     df_wer_avg = basic_stats_per_dimension(df_per_dataset, "WER", "dataset")
+     st.dataframe(df_wer_avg)
+
+     st.subheader("Comparison of average WER for free and commercial systems")
+     df_wer_avg_free_commercial = basic_stats_per_dimension(df_per_dataset_with_asr_systems_meta, "WER", "Type")
+     st.dataframe(df_wer_avg_free_commercial)
+
+     ##################### PER SYSTEM ANALYSIS #########################
+     analysis_dim = "system"
+     metric = "WER"
+     st.subheader("Table showing {} per {} sorted by median values".format(metric, analysis_dim))
+     df_wer_per_system_from_per_dataset = basic_stats_per_dimension(df_per_dataset, metric, analysis_dim)
+     h_df_per_system_per_dataset = calculate_height_to_display(df_wer_per_system_from_per_dataset)
+     st.dataframe(df_wer_per_system_from_per_dataset, height=h_df_per_system_per_dataset)
+
+     st.subheader("Boxplot showing {} per {} sorted by median values".format(metric, analysis_dim))
+     fig = box_plot_per_dimension(df_per_dataset, metric, analysis_dim, "{} per {}".format(metric, analysis_dim), analysis_dim, metric + " [%]")
+     st.pyplot(fig, clear_figure=True, use_container_width=True)
+
+     ##################### PER SUBSET ANALYSIS #########################
+     analysis_dim = "subset"
+     metric = "WER"
+     st.subheader("Table showing {} per {} sorted by median values".format(metric, analysis_dim))
+     df_wer_per_system_from_per_dataset = basic_stats_per_dimension(df_per_dataset, metric, analysis_dim)
+     h_df_per_system_per_dataset = calculate_height_to_display(df_wer_per_system_from_per_dataset)
+     st.dataframe(df_wer_per_system_from_per_dataset, height=h_df_per_system_per_dataset)
+
+     st.subheader("Boxplot showing {} per {} sorted by median values".format(metric, analysis_dim))
+     fig = box_plot_per_dimension(df_per_dataset, metric, analysis_dim, "{} per {}".format(metric, analysis_dim), analysis_dim, metric + " [%]")
+     st.pyplot(fig, clear_figure=True, use_container_width=True)
+
+     ##################### APPENDIX #########################
+     st.header("Appendix - Full evaluation results per subset for all evaluated systems")
+     # select only the columns we want to display
+     df_per_dataset_selected_cols = df_per_dataset_all[cols_to_select_all]
+     st.dataframe(df_per_dataset_selected_cols, hide_index=True, use_container_width=False)
+
+ with lead_bigos_synth:
+
+     # configuration for this tab
+     dataset = "amu-cai/pl-asr-bigos-synth"
+     dataset_short_name = "BIGOS synthetic"
+     dataset_version = "V1"
+     eval_date = "March 2024"
+     split = "test"
+     norm_type = "all"
+     ref_type = "orig"
+
+     # common, reusable part for all tabs presenting leaderboards for specific datasets
+     #### DATA LOADING AND AUGMENTATION ####
+     df_per_sample_all, df_per_dataset_all = read_latest_results(dataset, split, codename_to_shortname_mapping)
+
+     # keep only the ref_type and norm_type we want to analyze
+     df_per_sample = df_per_sample_all[(df_per_sample_all["ref_type"] == ref_type) & (df_per_sample_all["norm_type"] == norm_type)]
+     df_per_dataset = df_per_dataset_all[(df_per_dataset_all["ref_type"] == ref_type) & (df_per_dataset_all["norm_type"] == norm_type)]
+
+     ##### PARAMETERS CALCULATION ####
+     evaluated_systems_list = df_per_sample["system"].unique()
+     no_of_evaluated_systems = len(evaluated_systems_list)
+     no_of_eval_subsets = len(df_per_dataset["subset"].unique())
+     no_of_test_cases = len(df_per_sample)
+     no_of_unique_recordings = len(df_per_sample["id"].unique())
+     total_audio_duration_hours = get_total_audio_duration(df_per_sample)
+     #no_of_unique_speakers = len(df_per_sample["speaker_id"].unique())
+     no_of_unique_speakers = "N/A"
+
+     df_evaluated_systems = retrieve_asr_systems_meta_from_the_catalog(evaluated_systems_list)
+
+     df_per_dataset_with_asr_systems_meta = pd.merge(df_per_dataset, df_evaluated_systems, how="left", left_on="system", right_on="Shortname")
+
+     ########### EVALUATION PARAMETERS PRESENTATION ################
+     st.title("Leaderboard for {} {}".format(dataset_short_name, dataset_version))
+     st.markdown(BIGOS_INFO, unsafe_allow_html=True)
+     st.markdown("**Evaluation date:** {}".format(eval_date))
+     st.markdown("**Number of evaluated system-model variants:** {}".format(no_of_evaluated_systems))
+     st.markdown("**Number of evaluated subsets:** {}".format(no_of_eval_subsets))
+     st.markdown("**Number of evaluated system-model-subset combinations:** {}".format(len(df_per_dataset)))
+     st.markdown("**Number of unique speakers:** {}".format(no_of_unique_speakers))
+     st.markdown("**Number of unique recordings used for evaluation:** {}".format(no_of_unique_recordings))
+     st.markdown("**Total size of the dataset:** {:.2f} hours".format(total_audio_duration_hours))
+     st.markdown("**Total number of test cases (audio-hypothesis pairs):** {}".format(no_of_test_cases))
+     st.markdown("**Dataset:** {}".format(dataset))
+     st.markdown("**Dataset version:** {}".format(dataset_version))
+     st.markdown("**Split:** {}".format(split))
+     st.markdown("**Text reference type:** {}".format(ref_type))
+     st.markdown("**Normalization steps:** {}".format(norm_type))
+
+     ########### RESULTS ################
+     st.header("WER (Word Error Rate) analysis")
+     st.subheader("Average WER for the whole dataset")
+     df_wer_avg = basic_stats_per_dimension(df_per_dataset, "WER", "dataset")
+     st.dataframe(df_wer_avg)
+
+     st.subheader("Comparison of average WER for free and commercial systems")
+     df_wer_avg_free_commercial = basic_stats_per_dimension(df_per_dataset_with_asr_systems_meta, "WER", "Type")
+     st.dataframe(df_wer_avg_free_commercial)
+
+     ##################### PER SYSTEM ANALYSIS #########################
+     analysis_dim = "system"
+     metric = "WER"
+     st.subheader("Table showing {} per {} sorted by median values".format(metric, analysis_dim))
+     df_wer_per_system_from_per_dataset = basic_stats_per_dimension(df_per_dataset, metric, analysis_dim)
+     h_df_per_system_per_dataset = calculate_height_to_display(df_wer_per_system_from_per_dataset)
+     st.dataframe(df_wer_per_system_from_per_dataset, height=h_df_per_system_per_dataset)
+
+     st.subheader("Boxplot showing {} per {} sorted by median values".format(metric, analysis_dim))
+     fig = box_plot_per_dimension(df_per_dataset, metric, analysis_dim, "{} per {}".format(metric, analysis_dim), analysis_dim, metric + " [%]")
+     st.pyplot(fig, clear_figure=True, use_container_width=True)
+
+     ##################### PER SUBSET ANALYSIS #########################
+     analysis_dim = "subset"
+     metric = "WER"
+     st.subheader("Table showing {} per {} sorted by median values".format(metric, analysis_dim))
+     df_wer_per_system_from_per_dataset = basic_stats_per_dimension(df_per_dataset, metric, analysis_dim)
+     h_df_per_system_per_dataset = calculate_height_to_display(df_wer_per_system_from_per_dataset)
+     st.dataframe(df_wer_per_system_from_per_dataset, height=h_df_per_system_per_dataset)
+
+     st.subheader("Boxplot showing {} per {} sorted by median values".format(metric, analysis_dim))
+     fig = box_plot_per_dimension(df_per_dataset, metric, analysis_dim, "{} per {}".format(metric, analysis_dim), analysis_dim, metric + " [%]")
+     st.pyplot(fig, clear_figure=True, use_container_width=True)
+
+     ### IMPACT OF NORMALIZATION ON ERROR RATES #####
+     # Calculate the average impact of various norm_types for all datasets and systems
+     df_per_dataset_selected_cols = df_per_dataset_all[cols_to_select_all]
+     diff_in_metrics = check_impact_of_normalization(df_per_dataset_selected_cols)
+     st.subheader("Impact of normalization of references and hypotheses on evaluation metrics")
+     st.dataframe(diff_in_metrics, use_container_width=False)
+
+     ##################### APPENDIX #########################
+     st.header("Appendix - Full evaluation results per subset for all evaluated systems")
+     # select only the columns we want to display
+     df_per_dataset_selected_cols = df_per_dataset[cols_to_select_all]
+     st.dataframe(df_per_dataset_selected_cols, hide_index=True, use_container_width=False)
+
+ with lead_pelcra:
+     st.title("PELCRA Leaderboard")
+     st.markdown(PELCRA_INFO, unsafe_allow_html=True)
+
+     # configuration for this tab
+     dataset = "pelcra/pl-asr-pelcra-for-bigos-secret"
+     dataset_short_name = "PELCRA"
+     dataset_version = "V1"
+     eval_date = "March 2024"
+     split = "test"
+     norm_type = "all"
+     ref_type = "orig"
+
+     # common, reusable part for all tabs presenting leaderboards for specific datasets
+     #### DATA LOADING AND AUGMENTATION ####
+     df_per_sample_all, df_per_dataset_all = read_latest_results(dataset, split, codename_to_shortname_mapping)
+
+     # keep only the ref_type and norm_type we want to analyze
+     df_per_sample = df_per_sample_all[(df_per_sample_all["ref_type"] == ref_type) & (df_per_sample_all["norm_type"] == norm_type)]
+     df_per_dataset = df_per_dataset_all[(df_per_dataset_all["ref_type"] == ref_type) & (df_per_dataset_all["norm_type"] == norm_type)]
+
+     ##### PARAMETERS CALCULATION ####
+     evaluated_systems_list = df_per_sample["system"].unique()
+     no_of_evaluated_systems = len(evaluated_systems_list)
+     no_of_eval_subsets = len(df_per_dataset["subset"].unique())
+     no_of_test_cases = len(df_per_sample)
+     no_of_unique_recordings = len(df_per_sample["id"].unique())
+     total_audio_duration_hours = get_total_audio_duration(df_per_sample)
+     no_of_unique_speakers = len(df_per_sample["speaker_id"].unique())
+
+     df_per_dataset_with_asr_systems_meta = pd.merge(df_per_dataset, df_evaluated_systems, how="left", left_on="system", right_on="Shortname")
+
+     ########### EVALUATION PARAMETERS PRESENTATION ################
+     st.title("Leaderboard for {} {}".format(dataset_short_name, dataset_version))
+     st.markdown(BIGOS_INFO, unsafe_allow_html=True)
+     st.markdown("**Evaluation date:** {}".format(eval_date))
+     st.markdown("**Number of evaluated system-model variants:** {}".format(no_of_evaluated_systems))
+     st.markdown("**Number of evaluated subsets:** {}".format(no_of_eval_subsets))
+     st.markdown("**Number of evaluated system-model-subset combinations:** {}".format(len(df_per_dataset)))
+     st.markdown("**Number of unique speakers:** {}".format(no_of_unique_speakers))
+     st.markdown("**Number of unique recordings used for evaluation:** {}".format(no_of_unique_recordings))
+     st.markdown("**Total size of the dataset:** {:.2f} hours".format(total_audio_duration_hours))
+     st.markdown("**Total number of test cases (audio-hypothesis pairs):** {}".format(no_of_test_cases))
+     st.markdown("**Dataset:** {}".format(dataset))
+     st.markdown("**Dataset version:** {}".format(dataset_version))
+     st.markdown("**Split:** {}".format(split))
+     st.markdown("**Text reference type:** {}".format(ref_type))
+     st.markdown("**Normalization steps:** {}".format(norm_type))
+
+     ########### RESULTS ################
+     st.header("WER (Word Error Rate) analysis")
+     st.subheader("Average WER for the whole dataset")
+     df_wer_avg = basic_stats_per_dimension(df_per_dataset, "WER", "dataset")
+     st.dataframe(df_wer_avg)
+
+     st.subheader("Comparison of average WER for free and commercial systems")
+     df_wer_avg_free_commercial = basic_stats_per_dimension(df_per_dataset_with_asr_systems_meta, "WER", "Type")
+     st.dataframe(df_wer_avg_free_commercial)
+
+     ##################### PER SYSTEM ANALYSIS #########################
+     analysis_dim = "system"
+     metric = "WER"
+     st.subheader("Table showing {} per {} sorted by median values".format(metric, analysis_dim))
+     df_wer_per_system_from_per_dataset = basic_stats_per_dimension(df_per_dataset, metric, analysis_dim)
+     h_df_per_system_per_dataset = calculate_height_to_display(df_wer_per_system_from_per_dataset)
+     st.dataframe(df_wer_per_system_from_per_dataset, height=h_df_per_system_per_dataset)
+
+     st.subheader("Boxplot showing {} per {} sorted by median values".format(metric, analysis_dim))
+     fig = box_plot_per_dimension(df_per_dataset, metric, analysis_dim, "{} per {}".format(metric, analysis_dim), analysis_dim, metric + " [%]")
+     st.pyplot(fig, clear_figure=True, use_container_width=True)
+
+     ##################### PER SUBSET ANALYSIS #########################
+     analysis_dim = "subset"
+     metric = "WER"
+     st.subheader("Table showing {} per {} sorted by median values".format(metric, analysis_dim))
+     df_wer_per_system_from_per_dataset = basic_stats_per_dimension(df_per_dataset, metric, analysis_dim)
+     h_df_per_system_per_dataset = calculate_height_to_display(df_wer_per_system_from_per_dataset)
+     st.dataframe(df_wer_per_system_from_per_dataset, height=h_df_per_system_per_dataset)
+
+     st.subheader("Boxplot showing {} per {} sorted by median values".format(metric, analysis_dim))
+     fig = box_plot_per_dimension(df_per_dataset, metric, analysis_dim, "{} per {}".format(metric, analysis_dim), analysis_dim, metric + " [%]")
+     st.pyplot(fig, clear_figure=True, use_container_width=True)
+
+     ### IMPACT OF NORMALIZATION ON ERROR RATES #####
+     # Calculate the average impact of various norm_types for all datasets and systems
+     df_per_dataset_selected_cols = df_per_dataset_all[cols_to_select_all]
+     diff_in_metrics = check_impact_of_normalization(df_per_dataset_selected_cols)
+     st.subheader("Impact of normalization of references and hypotheses on evaluation metrics")
+     st.dataframe(diff_in_metrics, use_container_width=False)
+
+     # Visualize the differences in metrics graphically, with data labels
+     fig, axs = plt.subplots(3, 2, figsize=(12, 12))
+     fig.subplots_adjust(hspace=0.6, wspace=0.6)
+
+     # remove the sixth (unused) subplot
+     fig.delaxes(axs[2, 1])
+
+     metrics = ['SER', 'WER', 'MER', 'CER', 'Average']
+     colors = ['blue', 'orange', 'green', 'red', 'purple']
+
+     for ax, metric, color in zip(axs.flatten(), metrics, colors):
+         bars = ax.bar(diff_in_metrics.index, diff_in_metrics[metric], color=color)
+         ax.set_title(f'Normalization impact on {metric}')
+         if metric == 'Average':
+             ax.set_title('Average normalization impact on all metrics')
+         ax.set_xlabel('Normalization Type')
+         ax.set_ylabel(f'Difference in {metric}')
+         ax.grid(True)
+         ax.set_xticks(range(len(diff_in_metrics.index)))
+         ax.set_xticklabels(diff_in_metrics.index, rotation=45, ha='right')
+         min_val = diff_in_metrics[metric].min()
+         ax.set_ylim([min_val * 1.1, diff_in_metrics[metric].max() * 1.1])
+
+         for bar in bars:
+             height = bar.get_height()
+             ax.annotate(f'{height:.2f}',
+                         xy=(bar.get_x() + bar.get_width() / 2, height),
+                         xytext=(0, -12),  # offset the label 12 points below the bar top
+                         textcoords="offset points",
+                         ha='center', va='bottom')
+
+     # Display the plot in Streamlit
+     st.pyplot(fig)
+
+     ##################### APPENDIX #########################
+     st.header("Appendix - Full evaluation results per subset for all evaluated systems")
+     # select only the columns we want to display
+     df_per_dataset_selected_cols = df_per_dataset_all[cols_to_select_all]
+     st.dataframe(df_per_dataset_selected_cols, hide_index=True, use_container_width=False)
+
+ with analysis:
+
+     datasets = [
+         "amu-cai/pl-asr-bigos-v2-secret",
+         "pelcra/pl-asr-pelcra-for-bigos-secret",
+         "amu-cai/pl-asr-bigos-v2-diagnostic",
+         "amu-cai/pl-asr-bigos-v2-med"]
+
+     st.title("Analysis and insights")
+     st.markdown(ANALYSIS_INFO, unsafe_allow_html=True)
+
+     st.title("Plots for analyzing ASR systems' performance")
+
+     # select the dataset to display results for
+     dataset = st.selectbox("Select Dataset", datasets, index=datasets.index('amu-cai/pl-asr-bigos-v2-secret'))
+
+     # read the latest results for the selected dataset
+     # NOTE: split, norm_type and ref_type are reused from the last leaderboard tab above
+     print("Reading the latest results for dataset: ", dataset)
+     df_per_sample_all, df_per_dataset_all = read_latest_results(dataset, split, codename_to_shortname_mapping)
+     # keep only the ref_type and norm_type we want to analyze
+     df_per_sample = df_per_sample_all[(df_per_sample_all["ref_type"] == ref_type) & (df_per_sample_all["norm_type"] == norm_type)]
+     df_per_dataset = df_per_dataset_all[(df_per_dataset_all["ref_type"] == ref_type) & (df_per_dataset_all["norm_type"] == norm_type)]
+
+     evaluated_systems_list = df_per_sample["system"].unique()
+     print(evaluated_systems_list)
+     df_evaluated_systems = retrieve_asr_systems_meta_from_the_catalog(evaluated_systems_list)
+     print(df_evaluated_systems)
+
+     # read the available options to analyze for the specific dataset
+     splits = list(df_per_dataset_all['subset'].unique())  # unique subsets
+     norm_types = list(df_per_dataset_all['norm_type'].unique())  # unique norm_types
+     ref_types = list(df_per_dataset_all['ref_type'].unique())  # unique ref_types
+     systems = list(df_per_dataset_all['system'].unique())  # unique systems
+     metrics = list(df_per_dataset_all.columns[7:])  # metric columns
+
+     # Select the systems to display. More than one system can be selected.
+     systems_selected = st.multiselect("Select ASR Systems", systems)
+
+     # Select the metric to display
+     metric = st.selectbox("Select Metric", metrics, index=metrics.index('WER'))
+
+     # Select the normalization type
+     norm_type = st.selectbox("Select Normalization Type", norm_types, index=norm_types.index('all'))
+     # Select the reference type
+     ref_type = st.selectbox("Select Reference Type", ref_types, index=ref_types.index('orig'))
+
+     enable_labels = st.checkbox("Enable labels on radar plot", value=True)
+
+     enable_bar_chart = st.checkbox("Enable bar chart", value=True)
+     enable_polar_plot = st.checkbox("Enable radar plot", value=True)
+
+     orientation = st.selectbox("Select orientation", ["vertical", "horizontal"], index=0)
+
+     if enable_polar_plot:
+         if metric:
+             if systems_selected:
+                 create_radar_plot(df_per_dataset_all, enable_labels, systems_selected, metric, norm_type, ref_type)
+
+     if enable_bar_chart:
+         if metric:
+             if systems_selected:
+                 create_bar_chart(df_per_dataset_all, systems_selected, metric, norm_type, ref_type, orientation)
+
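The block below picks the best and worst free and commercial systems with groupby(...).mean() followed by idxmin()/idxmax(), which return the index label (here, the system name) of the extreme value. A toy illustration with made-up numbers:

import pandas as pd

toy = pd.DataFrame({
    "system": ["A", "A", "B", "B"],
    "WER": [10.0, 12.0, 20.0, 18.0],
})
best = toy.groupby("system")["WER"].mean().idxmin()   # -> "A" (mean 11.0)
worst = toy.groupby("system")["WER"].mean().idxmax()  # -> "B" (mean 19.0)
print(best, worst)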
+     ##### ANALYSIS - COMMERCIAL VS FREE SYSTEMS #####
+     # Generate a dataframe with columns: system, Type, subset, average WER
+     df_per_dataset_with_asr_systems_meta = pd.merge(df_per_dataset, df_evaluated_systems, how="left", left_on="system", right_on="Shortname")
+
+     df_wer_avg_per_system_all_subsets_with_type = df_per_dataset_with_asr_systems_meta.groupby(['system', 'Type', 'subset'])['WER'].mean().reset_index()
+     print(df_wer_avg_per_system_all_subsets_with_type)
+
+     # Select the best and worst system among free and commercial systems
+     free_systems = df_wer_avg_per_system_all_subsets_with_type[df_wer_avg_per_system_all_subsets_with_type['Type'] == 'free']['system'].unique()
+     commercial_systems = df_wer_avg_per_system_all_subsets_with_type[df_wer_avg_per_system_all_subsets_with_type['Type'] == 'commercial']['system'].unique()
+     free_system_with_best_wer = df_wer_avg_per_system_all_subsets_with_type[df_wer_avg_per_system_all_subsets_with_type['system'].isin(free_systems)].groupby('system')['WER'].mean().idxmin()
+     free_system_with_worst_wer = df_wer_avg_per_system_all_subsets_with_type[df_wer_avg_per_system_all_subsets_with_type['system'].isin(free_systems)].groupby('system')['WER'].mean().idxmax()
+     commercial_system_with_best_wer = df_wer_avg_per_system_all_subsets_with_type[df_wer_avg_per_system_all_subsets_with_type['system'].isin(commercial_systems)].groupby('system')['WER'].mean().idxmin()
+     commercial_system_with_worst_wer = df_wer_avg_per_system_all_subsets_with_type[df_wer_avg_per_system_all_subsets_with_type['system'].isin(commercial_systems)].groupby('system')['WER'].mean().idxmax()
+
+     #print(f"Best free system: {free_system_with_best_wer}")
+     #print(f"Worst free system: {free_system_with_worst_wer}")
+     #print(f"Best commercial system: {commercial_system_with_best_wer}")
+     #print(f"Worst commercial system: {commercial_system_with_worst_wer}")
+
+     st.subheader("Comparison of WER for free and commercial systems")
+     # Best and worst system for free and commercial systems - print table
+     header = ["Type", "Best System", "Worst System"]
+     data = [
+         ["Free", free_system_with_best_wer, free_system_with_worst_wer],
+         ["Commercial", commercial_system_with_best_wer, commercial_system_with_worst_wer]
+     ]
+
+     st.subheader("Best and worst systems for dataset {}".format(dataset))
+     df_best_worse_systems = pd.DataFrame(data, columns=header)
+     # do not display the index
+     st.dataframe(df_best_worse_systems, hide_index=True)
+
+     st.subheader("Comparison of average WER for the best systems")
+     df_per_dataset_best_systems = df_per_dataset_with_asr_systems_meta[df_per_dataset_with_asr_systems_meta['system'].isin([free_system_with_best_wer, commercial_system_with_best_wer])]
+     df_wer_avg_best_free_commercial = basic_stats_per_dimension(df_per_dataset_best_systems, "WER", "Type")
+     st.dataframe(df_wer_avg_best_free_commercial)
+
+     # Create a lookup table to get the system type based on its name
+     #system_type_lookup = dict(zip(df_wer_avg_per_system_all_subsets_with_type['system'], df_wer_avg_per_system_all_subsets_with_type['Type']))
+
+     systems_to_plot_best = [free_system_with_best_wer, commercial_system_with_best_wer]
+     plot_performance(systems_to_plot_best, df_wer_avg_per_system_all_subsets_with_type)
+
+     st.subheader("Comparison of average WER for the worst systems")
+     df_per_dataset_worst_systems = df_per_dataset_with_asr_systems_meta[df_per_dataset_with_asr_systems_meta['system'].isin([free_system_with_worst_wer, commercial_system_with_worst_wer])]
+     df_wer_avg_worst_free_commercial = basic_stats_per_dimension(df_per_dataset_worst_systems, "WER", "Type")
+     st.dataframe(df_wer_avg_worst_free_commercial)
+
+     systems_to_plot_worst = [free_system_with_worst_wer, commercial_system_with_worst_wer]
+     plot_performance(systems_to_plot_worst, df_wer_avg_per_system_all_subsets_with_type)
+
+     # WER as a function of model size
+     st.subheader("WER as a function of model size for dataset {}".format(dataset))
+
+     # average WER per system, model size and subset; grouping on 'Parameters [M]'
+     # drops rows with missing model size, which effectively keeps only the free systems
+     free_systems_wer_per_subset = df_per_dataset_with_asr_systems_meta.groupby(['system', 'Parameters [M]', 'subset'])['WER'].mean().reset_index()
+     # change the 'Parameters [M]' column to integer type and sort by model size
+     free_systems_wer_per_subset['Parameters [M]'] = free_systems_wer_per_subset['Parameters [M]'].astype(int)
+     free_systems_wer_per_subset = free_systems_wer_per_subset.sort_values(by='Parameters [M]')
+
+     free_systems_wer_average_across_all_subsets = free_systems_wer_per_subset.groupby(['system', 'Parameters [M]'])['WER'].mean().reset_index()
+     free_systems_wer_average_across_all_subsets['Parameters [M]'] = free_systems_wer_average_across_all_subsets['Parameters [M]'].astype(int)
+     free_systems_wer_average_across_all_subsets = free_systems_wer_average_across_all_subsets.sort_values(by='Parameters [M]')
+
+     free_systems_wer = free_systems_wer_average_across_all_subsets
+
+     # use the system name as index
+     free_systems_wer_to_show = free_systems_wer.set_index('system')
+
+     # sort by WER and round the WER values to 2 decimal places
+     free_systems_wer_to_show = free_systems_wer_to_show.sort_values(by='WER').round({'WER': 2})
+
+     # print the dataframe in Streamlit with average WER, system name and model size
+     st.dataframe(free_systems_wer_to_show)
+
+     # scatter plot: X is the model size (Parameters [M]), Y is the average WER;
+     # each point gets a different color and the legend lists the system names
+     fig, ax = plt.subplots()
+     for system in free_systems_wer['system'].unique():
+         subset = free_systems_wer[free_systems_wer['system'] == system]
+         ax.scatter(subset['Parameters [M]'], subset['WER'], label=system)
+         # Add a text annotation for each point
+         for i, point in subset.iterrows():
+             ax.annotate(point['system'], (point['Parameters [M]'], point['WER']), textcoords="offset points", xytext=(-10, -10), ha='left', rotation=-30, fontsize=5)
+     ax.set_xlabel('Model Size [M]')
+     ax.set_ylabel('WER (%)')
+     ax.set_title('WER as a function of model size')
+     # place the legend outside the plot
+     ax.legend(title='System', bbox_to_anchor=(1.05, 1), loc='upper left')
+
+     st.pyplot(fig)
+
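The audio-duration analysis below snaps each duration to the closest value in audio_duration_buckets via min(buckets, key=...). A toy illustration with made-up durations:

audio_duration_buckets = [1, 2, 3, 4, 5, 10, 15, 20, 30, 40, 50, 60]
snap = lambda x: min(audio_duration_buckets, key=lambda y: abs(x - y))
print(snap(7.2))   # -> 5   (7.2 is closer to 5 than to 10)
print(snap(8.0))   # -> 10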
+     ##################################################################################################################################################
+     # WER per audio duration
+
+     # calculate the average WER per audio duration bucket for the best free and commercial systems
+     selected_systems = [free_system_with_best_wer, commercial_system_with_best_wer]
+
+     # filter out results for the selected systems (copy to avoid SettingWithCopyWarning below)
+     df_per_sample_selected_systems = df_per_sample[df_per_sample['system'].isin(selected_systems)].copy()
+
+     # add a column with the audio duration snapped to the nearest bucket
+     audio_duration_buckets = [1, 2, 3, 4, 5, 10, 15, 20, 30, 40, 50, 60]
+     df_per_sample_selected_systems['audio_duration_buckets'] = df_per_sample_selected_systems['audio_duration'].apply(lambda x: min(audio_duration_buckets, key=lambda y: abs(x - y)))
+
+     # calculate the average WER per audio duration bucket
+     df_per_sample_wer_audio = df_per_sample_selected_systems.groupby(['system', 'audio_duration_buckets'])['WER'].mean().reset_index()
+     # add a column with the number of samples for each bucket size
+     df_per_sample_wer_audio['number_of_samples'] = df_per_sample_selected_systems.groupby(['system', 'audio_duration_buckets'])['WER'].count().values
+
+     df_per_sample_wer_audio = df_per_sample_wer_audio.sort_values(by='audio_duration_buckets')
+     # round the WER values to 2 decimal places (assign the result back)
+     df_per_sample_wer_audio['WER'] = df_per_sample_wer_audio['WER'].round(2)
+     # reshape df_per_sample_wer_audio: systems as columns, audio_duration_buckets as the index
+     df_per_sample_wer_audio_pivot = df_per_sample_wer_audio.pivot(index='audio_duration_buckets', columns='system', values='WER')
+     df_per_sample_wer_audio_pivot = df_per_sample_wer_audio_pivot.round(2)
+
+     df_per_sample_wer_audio_pivot['number_of_samples'] = df_per_sample_wer_audio[df_per_sample_wer_audio['system'] == free_system_with_best_wer].groupby('audio_duration_buckets')['number_of_samples'].sum().values
+
+     # put number_of_samples as the first column after the index
+     df_per_sample_wer_audio_pivot = df_per_sample_wer_audio_pivot[['number_of_samples'] + [col for col in df_per_sample_wer_audio_pivot.columns if col != 'number_of_samples']]
+
+     # print the dataframe in Streamlit
+     st.dataframe(df_per_sample_wer_audio_pivot)
+
+     # scatter plot of the values from df_per_sample_wer_audio:
+     # each system has a different color, point size is proportional to the number of
+     # samples in the bucket, X is the audio duration bucket, Y is the average WER
+     fig, ax = plt.subplots()
+     for system in selected_systems:
+         subset = df_per_sample_wer_audio[df_per_sample_wer_audio['system'] == system]
+         ax.scatter(subset['audio_duration_buckets'], subset['WER'], label=system, s=subset['number_of_samples'] * 0.5)
+     ax.set_xlabel('Audio Duration [s]')
+     ax.set_ylabel('WER (%)')
+     ax.set_title('WER as a function of audio duration')
+
+     # place the legend outside the plot, on the right
+     ax.legend(title='System', bbox_to_anchor=(1.05, 1), loc='upper left')
+     st.pyplot(fig)
+
+     ##################################################################################################################################################
+     # WER per speech rate
+
+     audio_feature_to_analyze = 'speech_rate_words'
+     audio_feature_unit = ' [words/s]'
+     metric = 'WER'
+     metric_unit = ' [%]'
+     no_of_buckets = 10
+     # calculate the average WER per speech rate bucket for the best free and commercial systems
+     selected_systems = [free_system_with_best_wer, commercial_system_with_best_wer]
+
+     df_per_sample_wer_feature_pivot, df_per_sample_wer_feature = calculate_wer_per_audio_feature(df_per_sample, selected_systems, audio_feature_to_analyze, metric, no_of_buckets)
+
+     # print the dataframe in Streamlit
+     st.dataframe(df_per_sample_wer_feature_pivot)
+
+     # scatter plot of the values from df_per_sample_wer_feature:
+     # each system has a different color, point size is proportional to the number of
+     # samples in the bucket, X is the speech rate bucket, Y is the average WER
+     fig, ax = plt.subplots()
+     for system in selected_systems:
+         subset = df_per_sample_wer_feature[df_per_sample_wer_feature['system'] == system]
+         ax.scatter(subset[audio_feature_to_analyze], subset[metric], label=system, s=subset['number_of_samples'] * 0.5)
+     ax.set_xlabel(audio_feature_to_analyze.replace('_', ' ').capitalize() + audio_feature_unit)
+     ax.set_ylabel(metric + metric_unit)
+     ax.set_title('WER as a function of speech rate')
+
+     ax.legend(title='System', loc='best')
+     st.pyplot(fig)
+
+ ################################################################################################################################################
925
+ # WER PER GENDER
926
+
927
+ #selected_systems = [free_system_with_best_wer, commercial_system_with_best_wer, free_system_with_worst_wer, commercial_system_with_worst_wer]
928
+ selected_systems = df_per_sample['system'].unique()
929
+
930
+ df_per_sample_wer_gender_pivot, df_available_samples_per_category_per_system, no_samples_per_category = calculate_wer_per_meta_category(df_per_sample, selected_systems, 'WER', 'speaker_gender')
931
+ #print(df_per_sample_wer_gender_pivot)
932
+ #print(no_samples_per_category)
933
+
934
+ # print dataframe in streamlit
935
+ st.write("Number of samples per category")
936
+ for system in selected_systems:
937
+ st.write(f"System: {system}")
938
+ df_available_samples_per_category = df_available_samples_per_category_per_system[system]
939
+ st.dataframe(df_available_samples_per_category)
940
+
941
+ st.write("Number of samples analyzed per category - {}".format(no_samples_per_category))
942
+ st.dataframe(df_per_sample_wer_gender_pivot)
943
+
944
+
945
+ #print(difference_values)
946
+ #print(selected_systems)
947
+
948
+ # create the scatter plot
949
+ # the x axis should be the systems from selected_systems
950
+ # the y axis should be the difference from difference_values
951
+ # each system should have a different color
952
+ fig, ax = plt.subplots()
953
+ difference_values = df_per_sample_wer_gender_pivot['Difference'][:-3]
954
+ selected_systems = df_per_sample_wer_gender_pivot.index[:-3]
955
+ ax.scatter(difference_values, selected_systems, c=range(len(selected_systems)), cmap='viridis')
956
+ ax.set_ylabel('ASR System')
957
+ ax.set_xlabel('Difference in WER across speaker gender')
958
+ ax.set_title('ASR systems perfomance bias for genders.')
959
+ # add labels with difference in WER values
960
+ for i, txt in enumerate(difference_values):
961
+ ax.annotate(txt, (difference_values[i], selected_systems[i]), fontsize=5, ha='right')
962
+ st.pyplot(fig)
963
+
964
+ #####################################################################################################################################################################################
965
+ # WER per age
966
+ df_per_sample_wer_age_pivot, df_available_samples_per_category_per_system, no_samples_per_category = calculate_wer_per_meta_category(df_per_sample, selected_systems,'WER','speaker_age')
967
+ #print(df_per_sample_wer_age_pivot)
968
+ #print(no_samples_per_category)
969
+
970
+ # print dataframe in streamlit
971
+ st.write("Number of samples per category")
972
+ for system in selected_systems:
973
+ st.write(f"System: {system}")
974
+ df_available_samples_per_category = df_available_samples_per_category_per_system[system]
975
+ st.dataframe(df_available_samples_per_category)
976
+
977
+ st.write("Number of samples analyzed per category - {}".format(no_samples_per_category))
978
+
979
+ st.write("WER per age")
980
+ st.dataframe(df_per_sample_wer_age_pivot)
981
+
982
+ # extract columns from df_per_sample_wer_age_pivot for selected_systems (skip the last 3 values corresponding to median, average and std values)
983
+
984
+ #print(selected_systems)
985
+
986
+ # create the scatter plot
987
+ # the x axis should be the systems from selected_systems
988
+ # the y axis should be the difference from difference_values
989
+ # each system should have a different color
990
+ fig, ax = plt.subplots()
991
+ std_dev_values = df_per_sample_wer_age_pivot['Std Dev'][:-3]  # skip the 3 summary rows (median/average/std)
992
+ selected_systems = df_per_sample_wer_age_pivot.index[:-3]
993
+ ax.scatter(std_dev_values, selected_systems, c=range(len(selected_systems)), cmap='viridis')
994
+ ax.set_ylabel('ASR System')
995
+ ax.set_xlabel('Standard deviation of WER across speaker age groups')
996
+ ax.set_title('ASR systems performance bias across age groups')
997
+ # add labels with the standard deviation of WER values
998
+ for i, txt in enumerate(std_dev_values):
999
+ ax.annotate(txt, (std_dev_values.iloc[i], selected_systems[i]), fontsize=5, ha='right')
1000
+ st.pyplot(fig)
1001
+
1002
+ # READ vs CONVERSATIONAL SPEECH AVERAGE WER
1003
+
1004
+ # Hallucinations rate per system
1005
+
1006
+
1007
+
1008
+ with inspection:
1009
+ st.title("Browse and manually inspect evaluation corpora and ASR results")
1010
+ st.markdown(INSPECTION_INFO, unsafe_allow_html=True)
1011
+ # TODO - load and display analysis and insights
1012
+ # filter dataset by audio id, type, ref/hyp content, ref/hyp length, words/chars per second etc.
1013
+ # playback audio
1014
+ # https://docs.streamlit.io/library/api-reference/media/st.audio
1015
+
1016
+ datasets = [
1017
+ "amu-cai/pl-asr-bigos-v2-secret",
1018
+ "pelcra/pl-asr-pelcra-for-bigos-secret",
1019
+ "amu-cai/pl-asr-bigos-v2-diagnostic",
1020
+ "amu-cai/pl-asr-bigos-v2-med"]
1021
+
1022
+ st.title("Data for qualitative analysis")
1023
+
1024
+ # select the dataset to display results
1025
+ dataset = st.selectbox("Select Dataset", datasets, key="dataset_inspection")
1026
+
1027
+ # read the latest results for the selected dataset
1028
+ df_per_sample_all, df_per_dataset_all = read_latest_results(dataset, split, codename_to_shortname_mapping)
1029
+
1030
+ # read available options to analyze for specific dataset
1031
+ splits = list(df_per_dataset_all['subset'].unique()) # Get the unique subsets
1032
+ norm_types = list(df_per_dataset_all['norm_type'].unique()) # Get the unique norm_types
1033
+ ref_types = list(df_per_dataset_all['ref_type'].unique()) # Get the unique ref_types
1034
+ systems = list(df_per_dataset_all['system'].unique()) # Get the unique systems
1035
+ metrics = list(df_per_dataset_all.columns[7:]) # Get the unique metrics
1036
+
1037
+ # Select the system to display. More than 1 system can be selected.
1038
+ systems_selected = st.multiselect("Select ASR Systems", systems, key="systems_inspection", default=systems[:2])
1039
+
1040
+ # Select the metric to display
1041
+ metric = st.selectbox("Select Metric", metrics, index=metrics.index('WER'), key="metric_inspection")
1042
+
1043
+ # Select the normalization type
1044
+ norm_type = st.selectbox("Select Normalization Type", norm_types, index=norm_types.index('all'), key="norm_type_inspection")
1045
+ # Select the reference type
1046
+ ref_type = st.selectbox("Select Reference Type", ref_types, index=ref_types.index('orig'), key="ref_type_inspection")
1047
+
1048
+ num_of_samples = st.slider("Select number of samples to display", 1, 100, 10)
1049
+
1050
+ df_per_sample = df_per_sample_all[(df_per_sample_all["ref_type"] == ref_type) & (df_per_sample_all["norm_type"] == norm_type) & (df_per_sample_all["system"].isin(systems_selected))]
1051
+ # drop columns dataset
1052
+ #df_per_sample = df_per_sample.drop(columns=['dataset'])
1053
+
1054
+ # show the refs and hyps with the worst per-sample WER (top num_of_samples)
1055
+ st.subheader("Samples with the worst WER per sample")
1056
+ df_per_sample_worst_wer = df_per_sample.sort_values(by='WER', ascending=False).head(num_of_samples)
1057
+ # use full width of the screen to display dataframe
1058
+ st.dataframe(df_per_sample_worst_wer, use_container_width=True)
1059
+
1060
+
1061
+ # ALL as the concatenation
1062
+ # common functions, difference only in the input TSV
1063
+
app.py ADDED
@@ -0,0 +1,840 @@
1
+ import os
2
+ import streamlit as st
3
+ import pandas as pd
4
+ from constants import BIGOS_INFO, PELCRA_INFO, ANALYSIS_INFO, ABOUT_INFO, INSPECTION_INFO, COMPARISON_INFO
5
+ from utils import read_latest_results, basic_stats_per_dimension, retrieve_asr_systems_meta_from_the_catalog, box_plot_per_dimension, get_total_audio_duration, check_impact_of_normalization, calculate_wer_per_meta_category, calculate_wer_per_audio_feature
6
+ from app_utils import calculate_height_to_display, filter_dataframe
7
+ import matplotlib.pyplot as plt
8
+ import numpy as np
9
+
10
+ hf_token = os.getenv('HF_TOKEN')
11
+ if hf_token is None:
12
+ raise ValueError("HF_TOKEN environment variable is not set. Please check your secrets settings.")
13
+
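+ # The token is expected as a Space secret / environment variable, e.g. (hypothetical value):
+ # export HF_TOKEN=hf_xxxxxxxxxxxx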
14
+ # Tabs
15
+ # About (description of the benchmark) - methodology
16
+ # Leaderboards
17
+ # Interactive analysis
18
+ # Acknowledgements
19
+
20
+ # select the dataset to display results
21
+ datasets_secret = [
22
+ "amu-cai/pl-asr-bigos-v2-secret",
23
+ "pelcra/pl-asr-pelcra-for-bigos-secret"]
24
+
25
+ datasets_public = []
26
+ #["amu-cai/pl-asr-bigos-synth-med"]
27
+ #amu-cai/pl-asr-bigos-v2-diagnostic"
28
+
29
+ st.set_page_config(layout="wide")
30
+
31
+ about, lead_bigos, lead_pelcra, analysis, interactive_comparison = st.tabs(["About", "ASR Leaderboard - BIGOS corpora", "ASR Leaderboard - PELCRA corpora", "ASR evaluation scenarios", "Interactive comparison of ASR accuracy"])
32
+ # "Results inspection""Results inspection"
33
+ # inspection
34
+ # acknowledgements, changelog, faq, todos = st.columns(4)
35
+ #lead_bigos_diagnostic, lead_bigos_synth
36
+
37
+ cols_to_select_all = ["system", "subset", "ref_type", "norm_type", "SER", "MER", "WER", "CER"]
38
+
39
+ def plot_performance(systems_to_plot, df_per_system_with_type):
40
+ # Get unique subsets
41
+ subsets = df_per_system_with_type['subset'].unique()
42
+
43
+ # Create a color and label map
44
+ color_label_map = {
45
+ free_system_with_best_wer: ('blue', 'Best Free'),
46
+ free_system_with_worst_wer: ('red', 'Worst Free'),
47
+ commercial_system_with_best_wer: ('green', 'Best Paid'),
48
+ commercial_system_with_worst_wer: ('orange', 'Worst Paid')
49
+ }
50
+
51
+ # Plot the data
52
+ fig, ax = plt.subplots(figsize=(14, 7))
53
+
54
+ bar_width = 0.3
55
+ index = np.arange(len(subsets))
56
+
57
+ for i, system in enumerate(systems_to_plot):
58
+ subset_wer = df_per_system_with_type[df_per_system_with_type['system'] == system].set_index('subset')['WER']
59
+ color, label = color_label_map[system]
60
+ ax.bar(index + i * bar_width, subset_wer.loc[subsets], bar_width, label=label + ' - ' + system, color=color)
61
+
62
+ # Adding labels and title
63
+ ax.set_xlabel('Subset')
64
+ ax.set_ylabel('WER (%)')
65
+ ax.set_title('Comparison of ASR system performance')
66
+ ax.set_xticks(index + bar_width * 1.5)
67
+ ax.set_xticklabels(subsets, rotation=90, ha='right')
68
+ ax.legend()
69
+
70
+ st.pyplot(fig)
71
+
72
+ def round_to_nearest(value, multiple):
73
+ return multiple * round(value / multiple)
74
+
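+ # Usage sketch: round_to_nearest(37.4, 10) -> 40 and round_to_nearest(12.0, 10) -> 10;
+ # used below to snap axis limits to a tidy multiple of 10.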
75
+ def create_bar_chart(df, systems, metric, norm_type, ref_type='orig', orientation='vertical'):
76
+ df = df[df['norm_type'] == norm_type]
77
+ df = df[df['ref_type'] == ref_type]
78
+
79
+ # Prepare the data for the bar chart
80
+ subsets = df['subset'].unique()
81
+ num_vars = len(subsets)
82
+ bar_width = 0.2 # Width of the bars
83
+
84
+ fig, ax = plt.subplots(figsize=(10, 10))
85
+
86
+ max_value_all_systems = 0
87
+ for i, system in enumerate(systems):
88
+ system_data = df[df['system'] == system]
89
+ max_value_for_system = max(system_data[metric])
90
+ if max_value_for_system > max_value_all_systems:
91
+ max_value_all_systems = round_to_nearest(max_value_for_system + 2, 10)
92
+
93
+ # Ensure the system data is in the same order as subsets
94
+ values = []
95
+ for subset in subsets:
96
+ subset_value = system_data[system_data['subset'] == subset][metric].values
97
+ if len(subset_value) > 0:
98
+ values.append(subset_value[0])
99
+ else:
100
+ values.append(0) # Append 0 if the subset value is missing
101
+
102
+ if orientation == 'vertical':
103
+ # Plot each system's bars with an offset for vertical orientation
104
+ x_pos = np.arange(len(subsets)) + i * bar_width
105
+ ax.bar(x_pos, values, bar_width, label=system)
106
+ # Add value labels
107
+ for j, value in enumerate(values):
108
+ ax.text(x_pos[j], value + max(values) * 0.03, f'{value}', ha='center', va='bottom',fontsize=6)
109
+ else:
110
+ # Plot each system's bars with an offset for horizontal orientation
111
+ y_pos = np.arange(len(subsets)) + i * bar_width
112
+ ax.barh(y_pos, values, bar_width, label=system)
113
+ # Add value labels
114
+ for j, value in enumerate(values):
115
+ ax.text(value + max(values) * 0.03, y_pos[j], f'{value}', ha='left', va='center', fontsize=6)
116
+
117
+ if orientation == 'vertical':
118
+ ax.set_xticks(np.arange(len(subsets)) + bar_width * (len(systems) - 1) / 2)
119
+ ax.set_xticklabels(subsets, rotation=45, ha='right')
120
+ ax.set_ylabel(metric)
121
+ else:
122
+ ax.set_yticks(np.arange(len(subsets)) + bar_width * (len(systems) - 1) / 2)
123
+ ax.set_yticklabels(subsets)
124
+ ax.set_xlabel(metric)
125
+
126
+ # Add grid values for the vertical and horizontal bar plots
127
+ if orientation == 'vertical':
128
+ ax.set_yticks(np.linspace(0, max_value_all_systems, 5))
129
+ else:
130
+ ax.set_xticks(np.linspace(0, max_value_all_systems, 5))
131
+
132
+ # Put legend on the right side outside of the plot
133
+ plt.legend(loc='upper right', bbox_to_anchor=(1.2, 1), shadow=True, ncol=1)
134
+
135
+ st.pyplot(fig)
136
+
137
+ def create_radar_plot(df, enable_labels, systems, metric, norm_type, ref_type='orig'):
138
+
139
+ df = df[df['norm_type'] == norm_type]
140
+ df = df[df['ref_type'] == ref_type]
141
+
142
+ # Prepare the data for the radar plot
143
+ #systems = df['system'].unique()
144
+ subsets = df['subset'].unique()
145
+ num_vars = len(subsets)
146
+
147
+ angles = np.linspace(0, 2 * np.pi, num_vars, endpoint=False).tolist()
148
+ angles += angles[:1] # Complete the loop
149
+
150
+ fig, ax = plt.subplots(figsize=(10, 10), subplot_kw=dict(polar=True))
151
+
152
+ max_value_all_systems = 0
153
+ for system in systems:
154
+ system_data = df[df['system'] == system]
155
+ max_value_for_system = max(system_data[metric])
156
+ if max_value_for_system > max_value_all_systems:
157
+ max_value_all_systems = round_to_nearest(max_value_for_system + 2, 10)
158
+
159
+ # Ensure the system data is in the same order as subsets
160
+ values = []
161
+ for subset in subsets:
162
+ subset_value = system_data[system_data['subset'] == subset][metric].values
163
+ if len(subset_value) > 0:
164
+ values.append(subset_value[0])
165
+ else:
166
+ values.append(0) # Append 0 if the subset value is missing
167
+
168
+ values += values[:1] # Complete the loop
169
+
170
+ # Plot each system
171
+ ax.plot(angles, values, label=system)
172
+ ax.fill(angles, values, alpha=0.25)
173
+
174
+ # Add value labels
175
+ for angle, value in zip(angles, values):
176
+ ax.text(angle, value + max(values) * 0.01, f'{value}', ha='center', va='center', fontsize=6)
177
+
178
+ plt.xticks(angles[:-1], subsets)  # one tick per subset around the circle
179
+
180
+ ax.set_yticks(np.linspace(0, max_value_all_systems, 5))
181
+
182
+ # put legend outside the plot on the right
183
+ plt.legend(loc='upper right', bbox_to_anchor=(1.2, 1), shadow=True, ncol=1)
184
+
185
+ st.pyplot(fig)
186
+
187
+ with about:
188
+ st.title("About BIGOS benchmark")
189
+ st.markdown(ABOUT_INFO, unsafe_allow_html=True)
190
+ # TODO - load and display about BIGOS benchmark
191
+
192
+ # Table - evaluated systems # TODO - change to concatenated table
193
+ st.header("Evaluated ASR systems")
194
+ dataset = "amu-cai/pl-asr-bigos-v2-secret"
195
+ split = "test"
196
+ df_per_sample, df_per_dataset = read_latest_results(dataset, split, codename_to_shortname_mapping=None)
197
+ evaluated_systems_list = df_per_sample["system"].unique()
198
+ #print("ASR systems available in the eval results for dataset {}: ".format(dataset), evaluated_systems_list )
199
+
200
+ df_evaluated_systems = retrieve_asr_systems_meta_from_the_catalog(evaluated_systems_list)
201
+ codename_to_shortname_mapping = dict(zip(df_evaluated_systems["Codename"],df_evaluated_systems["Shortname"]))
202
+ #print(codename_to_shortname_mapping)
203
+
204
+ h_df_systems = calculate_height_to_display(df_evaluated_systems)
205
+
206
+ df_evaluated_systems_types_and_count = df_evaluated_systems["Type"].value_counts().reset_index()
207
+ df_evaluated_systems_types_and_count.columns = ["Type", "Count"]
208
+ st.write("Evaluated ASR systems types")
209
+
210
+ st.dataframe(df_evaluated_systems_types_and_count, hide_index=True, use_container_width=False)
211
+
212
+ st.write("Evaluated ASR systems details")
213
+
214
+ #TODO - add info who created the system (company, institution, team, etc.)
215
+ st.dataframe(df_evaluated_systems, hide_index=True, height = h_df_systems, use_container_width=True)
216
+
217
+ # Table - evaluation datasets
218
+ # Table - evaluation metrics
219
+ # Table - evaluation metadata
220
+ # List - references
221
+ # List - contact points
222
+ # List - acknowledgements
223
+ # List - changelog
224
+ # List - FAQ
225
+ # List - TODOs
226
+
227
+ with lead_bigos:
228
+
229
+ # configuration for tab
230
+ dataset = "amu-cai/pl-asr-bigos-v2-secret"
231
+ dataset_short_name = "BIGOS"
232
+ dataset_version = "V2"
233
+ eval_date = "March 2024"
234
+ split = "test"
235
+ norm_type = "all"
236
+ ref_type = "orig"
237
+
238
+ # common, reusable part for all tabs presenting leaderboards for specific datasets
239
+ #### DATA LOADING AND AUGMENTATION ####
240
+ df_per_sample_all, df_per_dataset_all = read_latest_results(dataset, split, codename_to_shortname_mapping)
241
+
242
+
243
+ # filter only the ref_type and norm_type we want to analyze
244
+ df_per_sample = df_per_sample_all[(df_per_sample_all["ref_type"] == ref_type) & (df_per_sample_all["norm_type"] == norm_type)]
245
+ # filter only the ref_type and norm_type we want to analyze
246
+ df_per_dataset = df_per_dataset_all[(df_per_dataset_all["ref_type"] == ref_type) & (df_per_dataset_all["norm_type"] == norm_type)]
247
+
248
+ ##### PARAMETERS CALCULATION ####
249
+ evaluated_systems_list = df_per_sample["system"].unique()
250
+ no_of_evaluated_systems = len(evaluated_systems_list)
251
+ no_of_eval_subsets = len(df_per_dataset["subset"].unique())
252
+ no_of_test_cases = len(df_per_sample)
253
+ no_of_unique_recordings = len(df_per_sample["id"].unique())
254
+ total_audio_duration_hours = get_total_audio_duration(df_per_sample)
255
+ no_of_unique_speakers = len(df_per_sample["speaker_id"].unique())
256
+
257
+ df_per_dataset_with_asr_systems_meta = pd.merge(df_per_dataset, df_evaluated_systems, how="left", left_on="system", right_on="Shortname")
258
+
259
+ ########### EVALUATION PARAMETERS PRESENTATION ################
260
+ st.title("Leaderboard for {} {}".format(dataset_short_name, dataset_version))
261
+ st.markdown(BIGOS_INFO, unsafe_allow_html=True)
262
+ st.markdown("**Evaluation date:** {}".format(eval_date))
263
+ st.markdown("**Number of evaluated system-model variants:** {}".format(no_of_evaluated_systems))
264
+ st.markdown("**Number of evaluated subsets:** {}".format(no_of_eval_subsets))
265
+ st.markdown("**Number of evaluated system-model-subsets combinations**: {}".format(len(df_per_dataset)))
266
+ st.markdown("**Number of unique speakers**: {}".format(no_of_unique_speakers))
267
+ st.markdown("**Number of unique recordings used for evaluation:** {}".format(no_of_unique_recordings))
268
+ st.markdown("**Total size of the dataset:** {:.2f} hours".format(total_audio_duration_hours))
269
+ st.markdown("**Total number of test cases (audio-hypothesis pairs):** {}".format(no_of_test_cases))
270
+ st.markdown("**Dataset:** {}".format(dataset))
271
+ st.markdown("**Dataset version:** {}".format(dataset_version))
272
+ st.markdown("**Split:** {}".format(split))
273
+ st.markdown("**Text reference type:** {}".format(ref_type))
274
+ st.markdown("**Normalization steps:** {}".format(norm_type))
275
+
276
+ ########### RESULTS ################
277
+ st.header("WER (Word Error Rate) analysis")
278
+ st.subheader("Average WER for the whole dataset")
279
+ df_wer_avg = basic_stats_per_dimension(df_per_dataset, "WER", "dataset")
280
+ st.dataframe(df_wer_avg)
281
+
282
+ st.subheader("Comparison of average WER for free and commercial systems")
283
+ df_wer_avg_free_commercial = basic_stats_per_dimension(df_per_dataset_with_asr_systems_meta, "WER", "Type")
284
+ st.dataframe(df_wer_avg_free_commercial)
285
+
286
+
287
+ ##################### PER SYSTEM ANALYSIS #########################
288
+ analysis_dim = "system"
289
+ metric = "WER"
290
+ st.subheader("Table showing {} per {} sorted by median values".format(metric, analysis_dim))
291
+ df_wer_per_system_from_per_dataset = basic_stats_per_dimension(df_per_dataset, metric, analysis_dim)
292
+ h_df_per_system_per_dataset = calculate_height_to_display(df_wer_per_system_from_per_dataset)
293
+ st.dataframe(df_wer_per_system_from_per_dataset, height = h_df_per_system_per_dataset )
294
+
295
+ st.subheader("Boxplot showing {} per {} sorted by median values".format(metric, analysis_dim))
296
+ fig = box_plot_per_dimension(df_per_dataset, metric, analysis_dim, "{} per {}".format(metric, analysis_dim), analysis_dim, metric + "[%]")
297
+ st.pyplot(fig, clear_figure=True, use_container_width=True)
298
+
299
+ ##################### PER SUBSET ANALYSIS #########################
300
+ analysis_dim = "subset"
301
+ metric = "WER"
302
+ st.subheader("Table showing {} per {} sorted by median values".format(metric, analysis_dim))
303
+ df_wer_per_system_from_per_dataset = basic_stats_per_dimension(df_per_dataset, metric, analysis_dim)
304
+ h_df_per_system_per_dataset = calculate_height_to_display(df_wer_per_system_from_per_dataset)
305
+ st.dataframe(df_wer_per_system_from_per_dataset, height = h_df_per_system_per_dataset )
306
+
307
+ st.subheader("Boxplot showing {} per {} sorted by median values".format(metric, analysis_dim))
308
+ fig = box_plot_per_dimension(df_per_dataset, metric, analysis_dim, "{} per {}".format(metric, analysis_dim), analysis_dim, metric + "[%]")
309
+ st.pyplot(fig, clear_figure=True, use_container_width=True)
310
+
311
+ ### IMPACT OF NORMALIZATION ON ERROR RATES #####
312
+ # Calculate the average impact of various norm_types for all datasets and systems
313
+ df_per_dataset_selected_cols = df_per_dataset_all[cols_to_select_all]
314
+ diff_in_metrics = check_impact_of_normalization(df_per_dataset_selected_cols)
315
+ st.subheader("Impact of normalization of references and hypothesis on evaluation metrics")
316
+ st.dataframe(diff_in_metrics, use_container_width=False)
317
+
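+ # diff_in_metrics is indexed by normalization type, with one column per metric (SER/WER/MER/CER) plus their Average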
318
+ # Visualizing the differences in metrics graphically with data labels
320
+ fig, axs = plt.subplots(3, 2, figsize=(12, 12))
321
+ fig.subplots_adjust(hspace=0.6, wspace=0.6)
322
+
323
+ #remove the sixth subplot
324
+ fig.delaxes(axs[2,1])
325
+
326
+ metrics = ['SER', 'WER', 'MER', 'CER', "Average"]
327
+ colors = ['blue', 'orange', 'green', 'red', 'purple']
328
+
329
+ for ax, metric, color in zip(axs.flatten(), metrics, colors):
330
+ bars = ax.bar(diff_in_metrics.index, diff_in_metrics[metric], color=color)
331
+ ax.set_title(f'Normalization impact on {metric}')
332
+ if metric == 'Average':
333
+ ax.set_title('Average normalization impact on all metrics')
334
+ ax.set_xlabel('Normalization Type')
335
+ ax.set_ylabel(f'Difference in {metric}')
336
+ ax.grid(True)
337
+ ax.set_xticklabels(diff_in_metrics.index, rotation=45, ha='right')
338
+ min_val = diff_in_metrics[metric].min()
339
+ ax.set_ylim([min_val * 1.1, diff_in_metrics[metric].max() * 1.1])
340
+
341
+ for bar in bars:
342
+ height = bar.get_height()
343
+ ax.annotate(f'{height:.2f}',
344
+ xy=(bar.get_x() + bar.get_width() / 2, height),
345
+ xytext=(0, -12), # 12 points vertical offset (label just below the bar top)
346
+ textcoords="offset points",
347
+ ha='center', va='bottom')
348
+
349
+
350
+ # Display the plot in Streamlit
351
+ st.pyplot(fig)
352
+
353
+ ##################### APPENDIX #########################
354
+ st.header("Appendix - Full evaluation results per subset for all evaluated systems")
355
+ # select only the columns we want to plot
356
+ st.dataframe(df_per_dataset_selected_cols, hide_index=True, use_container_width=False)
357
+
358
+ with lead_pelcra:
359
+ st.title("PELCRA Leaderboard")
360
+ st.markdown(PELCRA_INFO, unsafe_allow_html=True)
361
+
362
+ # configuration for tab
363
+ dataset = "pelcra/pl-asr-pelcra-for-bigos-secret"
364
+ dataset_short_name = "PELCRA"
365
+ dataset_version = "V1"
366
+ eval_date = "March 2024"
367
+ split = "test"
368
+ norm_type = "all"
369
+ ref_type = "orig"
370
+
371
+ # common, reusable part for all tabs presenting leaderboards for specific datasets
372
+ #### DATA LOADING AND AUGMENTATION ####
373
+ df_per_sample_all, df_per_dataset_all = read_latest_results(dataset, split, codename_to_shortname_mapping)
374
+
375
+
376
+ # filter only the ref_type and norm_type we want to analyze
377
+ df_per_sample = df_per_sample_all[(df_per_sample_all["ref_type"] == ref_type) & (df_per_sample_all["norm_type"] == norm_type)]
378
+ # same filtering for the per-dataset results
379
+ df_per_dataset = df_per_dataset_all[(df_per_dataset_all["ref_type"] == ref_type) & (df_per_dataset_all["norm_type"] == norm_type)]
380
+
381
+ ##### PARAMETERS CALCULATION ####
382
+ evaluated_systems_list = df_per_sample["system"].unique()
383
+ no_of_evaluated_systems = len(evaluated_systems_list)
384
+ no_of_eval_subsets = len(df_per_dataset["subset"].unique())
385
+ no_of_test_cases = len(df_per_sample)
386
+ no_of_unique_recordings = len(df_per_sample["id"].unique())
387
+ total_audio_duration_hours = get_total_audio_duration(df_per_sample)
388
+ no_of_unique_speakers = len(df_per_sample["speaker_id"].unique())
389
+
390
+ df_per_dataset_with_asr_systems_meta = pd.merge(df_per_dataset, df_evaluated_systems, how="left", left_on="system", right_on="Shortname")
391
+
392
+ ########### EVALUATION PARAMETERS PRESENTATION ################
393
+ st.title("Leaderboard for {} {}".format(dataset_short_name, dataset_version))
394
+ st.markdown(PELCRA_INFO, unsafe_allow_html=True)
395
+ st.markdown("**Evaluation date:** {}".format(eval_date))
396
+ st.markdown("**Number of evaluated system-model variants:** {}".format(no_of_evaluated_systems))
397
+ st.markdown("**Number of evaluated subsets:** {}".format(no_of_eval_subsets))
398
+ st.markdown("**Number of evaluated system-model-subsets combinations**: {}".format(len(df_per_dataset)))
399
+ st.markdown("**Number of unique speakers**: {}".format(no_of_unique_speakers))
400
+ st.markdown("**Number of unique recordings used for evaluation:** {}".format(no_of_unique_recordings))
401
+ st.markdown("**Total size of the dataset:** {:.2f} hours".format(total_audio_duration_hours))
402
+ st.markdown("**Total number of test cases (audio-hypothesis pairs):** {}".format(no_of_test_cases))
403
+ st.markdown("**Dataset:** {}".format(dataset))
404
+ st.markdown("**Dataset version:** {}".format(dataset_version))
405
+ st.markdown("**Split:** {}".format(split))
406
+ st.markdown("**Text reference type:** {}".format(ref_type))
407
+ st.markdown("**Normalization steps:** {}".format(norm_type))
408
+
409
+ ########### RESULTS ################
410
+ st.header("WER (Word Error Rate) analysis")
411
+ st.subheader("Average WER for the whole dataset")
412
+ df_wer_avg = basic_stats_per_dimension(df_per_dataset, "WER", "dataset")
413
+ st.dataframe(df_wer_avg)
414
+
415
+ st.subheader("Comparison of average WER for free and commercial systems")
416
+ df_wer_avg_free_commercial = basic_stats_per_dimension(df_per_dataset_with_asr_systems_meta, "WER", "Type")
417
+ st.dataframe(df_wer_avg_free_commercial)
418
+
419
+ ##################### PER SYSTEM ANALYSIS #########################
420
+ analysis_dim = "system"
421
+ metric = "WER"
422
+ st.subheader("Table showing {} per {} sorted by median values".format(metric, analysis_dim))
423
+ df_wer_per_system_from_per_dataset = basic_stats_per_dimension(df_per_dataset, metric, analysis_dim)
424
+ h_df_per_system_per_dataset = calculate_height_to_display(df_wer_per_system_from_per_dataset)
425
+ st.dataframe(df_wer_per_system_from_per_dataset, height = h_df_per_system_per_dataset )
426
+
427
+ st.subheader("Boxplot showing {} per {} sorted by median values".format(metric, analysis_dim))
428
+ fig = box_plot_per_dimension(df_per_dataset, metric, analysis_dim, "{} per {}".format(metric, analysis_dim), analysis_dim, metric + "[%]")
429
+ st.pyplot(fig, clear_figure=True, use_container_width=True)
430
+
431
+ ##################### PER SUBSET ANALYSIS #########################
432
+ analysis_dim = "subset"
433
+ metric = "WER"
434
+ st.subheader("Table showing {} per {} sorted by median values".format(metric, analysis_dim))
435
+ df_wer_per_system_from_per_dataset = basic_stats_per_dimension(df_per_dataset, metric, analysis_dim)
436
+ h_df_per_system_per_dataset = calculate_height_to_display(df_wer_per_system_from_per_dataset)
437
+ st.dataframe(df_wer_per_system_from_per_dataset, height = h_df_per_system_per_dataset )
438
+
439
+ st.subheader("Boxplot showing {} per {} sorted by median values".format(metric, analysis_dim))
440
+ fig = box_plot_per_dimension(df_per_dataset, metric, analysis_dim, "{} per {}".format(metric, analysis_dim), analysis_dim, metric + "[%]")
441
+ st.pyplot(fig, clear_figure=True, use_container_width=True)
442
+
443
+ ### IMPACT OF NORMALIZATION ON ERROR RATES #####
444
+ # Calculate the average impact of various norm_types for all datasets and systems
445
+ df_per_dataset_selected_cols = df_per_dataset_all[cols_to_select_all]
446
+ diff_in_metrics = check_impact_of_normalization(df_per_dataset_selected_cols)
447
+ st.subheader("Impact of normalization on WER")
448
+ st.dataframe(diff_in_metrics, use_container_width=False)
449
+
450
+ # Visualizing the differences in metrics graphically with data labels
452
+ fig, axs = plt.subplots(3, 2, figsize=(12, 12))
453
+ fig.subplots_adjust(hspace=0.6, wspace=0.6)
454
+
455
+ #remove the sixth subplot
456
+ fig.delaxes(axs[2,1])
457
+
458
+ metrics = ['SER', 'WER', 'MER', 'CER', "Average"]
459
+ colors = ['blue', 'orange', 'green', 'red', 'purple']
460
+
461
+ for ax, metric, color in zip(axs.flatten(), metrics, colors):
462
+ bars = ax.bar(diff_in_metrics.index, diff_in_metrics[metric], color=color)
463
+ ax.set_title(f'Normalization impact on {metric}')
464
+ if metric == 'Average':
465
+ ax.set_title('Average normalization impact on all metrics')
466
+ ax.set_xlabel('Normalization Type')
467
+ ax.set_ylabel(f'Difference in {metric}')
468
+ ax.grid(True)
469
+ ax.set_xticklabels(diff_in_metrics.index, rotation=45, ha='right')
470
+ min_val = diff_in_metrics[metric].min()
471
+ ax.set_ylim([min_val * 1.1, diff_in_metrics[metric].max() * 1.1])
472
+
473
+ for bar in bars:
474
+ height = bar.get_height()
475
+ ax.annotate(f'{height:.2f}',
476
+ xy=(bar.get_x() + bar.get_width() / 2, height),
477
+ xytext=(0, -12), # 12 points vertical offset (label just below the bar top)
478
+ textcoords="offset points",
479
+ ha='center', va='bottom')
480
+
481
+ # Display the plot in Streamlit
482
+ st.pyplot(fig)
483
+
484
+ ##################### APPENDIX #########################
485
+ st.header("Appendix - Full evaluation results per subset for all evaluated systems")
486
+ # select only the columns we want to plot
487
+ df_per_dataset_selected_cols = df_per_dataset_all[cols_to_select_all]
488
+ st.dataframe(df_per_dataset_selected_cols, hide_index=True, use_container_width=False)
489
+
490
+ with analysis:
491
+ datasets = datasets_secret + datasets_public
492
+
493
+ dataset = st.selectbox("Select Dataset", datasets, index=datasets.index('amu-cai/pl-asr-bigos-v2-secret'), key="select_dataset_scenarios")
494
+
495
+ # read the latest results for the selected dataset
496
+ print("Reading the latest results for dataset: ", dataset)
497
+ df_per_sample_all, df_per_dataset_all = read_latest_results(dataset, split, codename_to_shortname_mapping)
498
+ # filter only the ref_type and norm_type we want to analyze
499
+ df_per_sample = df_per_sample_all[(df_per_sample_all["ref_type"] == ref_type) & (df_per_sample_all["norm_type"] == norm_type)]
500
+ # same filtering for the per-dataset results
501
+ df_per_dataset = df_per_dataset_all[(df_per_dataset_all["ref_type"] == ref_type) & (df_per_dataset_all["norm_type"] == norm_type)]
502
+
503
+ evaluated_systems_list = df_per_sample["system"].unique()
504
+ print(evaluated_systems_list)
505
+ df_evaluated_systems = retrieve_asr_systems_meta_from_the_catalog(evaluated_systems_list)
506
+ print(df_evaluated_systems)
507
+
508
+
509
+ ##### ANALYSIS - COMMERCIAL VS FREE SYSTEMS #####
510
+ # Generate dataframe with columns as follows System Type Subset Avg_WER
511
+ df_per_dataset_with_asr_systems_meta = pd.merge(df_per_dataset, df_evaluated_systems, how="left", left_on="system", right_on="Shortname")
512
+
513
+ df_wer_avg_per_system_all_subsets_with_type = df_per_dataset_with_asr_systems_meta.groupby(['system', 'Type', 'subset'])['WER'].mean().reset_index()
514
+ print(df_wer_avg_per_system_all_subsets_with_type)
515
+
516
+ # Select the best and worse system for free and commercial systems
517
+ free_systems = df_wer_avg_per_system_all_subsets_with_type[df_wer_avg_per_system_all_subsets_with_type['Type'] == 'free']['system'].unique()
518
+ commercial_systems = df_wer_avg_per_system_all_subsets_with_type[df_wer_avg_per_system_all_subsets_with_type['Type'] == 'commercial']['system'].unique()
519
+ free_system_with_best_wer = df_wer_avg_per_system_all_subsets_with_type[df_wer_avg_per_system_all_subsets_with_type['system'].isin(free_systems)].groupby('system')['WER'].mean().idxmin()
520
+ free_system_with_worst_wer = df_wer_avg_per_system_all_subsets_with_type[df_wer_avg_per_system_all_subsets_with_type['system'].isin(free_systems)].groupby('system')['WER'].mean().idxmax()
521
+ commercial_system_with_best_wer = df_wer_avg_per_system_all_subsets_with_type[df_wer_avg_per_system_all_subsets_with_type['system'].isin(commercial_systems)].groupby('system')['WER'].mean().idxmin()
522
+ commercial_system_with_worst_wer = df_wer_avg_per_system_all_subsets_with_type[df_wer_avg_per_system_all_subsets_with_type['system'].isin(commercial_systems)].groupby('system')['WER'].mean().idxmax()
523
+
524
+ #print(f"Best free system: {free_system_with_best_wer}")
525
+ #print(f"Worst free system: {free_system_with_worst_wer}")
526
+ #print(f"Best commercial system: {commercial_system_with_best_wer}")
527
+ #print(f"Worst commercial system: {commercial_system_with_worst_wer}")
528
+
529
+ st.subheader("Comparison of WER for free and commercial systems")
530
+ # Best and worst system for free and commercial systems - print table
531
+ header = ["Type", "Best System", "Worst System"]
532
+ data = [
533
+ ["Free", free_system_with_best_wer, free_system_with_worst_wer],
534
+ ["Commercial", commercial_system_with_best_wer, commercial_system_with_worst_wer]
535
+ ]
536
+
537
+ st.subheader("Best and worst systems for dataset {}".format(dataset))
538
+ df_best_worse_systems = pd.DataFrame(data, columns=header)
539
+ # do not display index
540
+ st.dataframe(df_best_worse_systems)
541
+
542
+ st.subheader("Comparison of average WER for best systems")
543
+ df_per_dataset_best_systems = df_per_dataset_with_asr_systems_meta[df_per_dataset_with_asr_systems_meta['system'].isin([free_system_with_best_wer, commercial_system_with_best_wer])]
544
+ df_wer_avg_best_free_commercial = basic_stats_per_dimension(df_per_dataset_best_systems, "WER", "Type")
545
+ st.dataframe(df_wer_avg_best_free_commercial)
546
+
547
+ # Create lookup table to get system type based on its name
548
+ #system_type_lookup = dict(zip(df_wer_avg_per_system_all_subsets_with_type['system'], df_wer_avg_per_system_all_subsets_with_type['Type']))
549
+
550
+ systems_to_plot_best= [free_system_with_best_wer, commercial_system_with_best_wer]
551
+ plot_performance(systems_to_plot_best, df_wer_avg_per_system_all_subsets_with_type)
552
+
553
+ st.subheader("Comparison of average WER for the worst systems")
554
+ df_per_dataset_worst_systems = df_per_dataset_with_asr_systems_meta[df_per_dataset_with_asr_systems_meta['system'].isin([free_system_with_worst_wer, commercial_system_with_worst_wer])]
555
+ df_wer_avg_worst_free_commercial = basic_stats_per_dimension(df_per_dataset_worst_systems, "WER", "Type")
556
+ st.dataframe(df_wer_avg_worst_free_commercial)
557
+
558
+ systems_to_plot_worst=[free_system_with_worst_wer, commercial_system_with_worst_wer]
559
+ plot_performance(systems_to_plot_worst, df_wer_avg_per_system_all_subsets_with_type)
560
+
561
+ # WER in function of model size
562
+ st.subheader("WER in function of model size for dataset {}".format(dataset))
563
+
564
+ # average WER per system and model size for each subset, based on df_per_dataset_with_asr_systems_meta
565
+ free_systems_wer_per_subset = df_per_dataset_with_asr_systems_meta.groupby(['system', 'Parameters [M]', 'subset'])['WER'].mean().reset_index()
566
+ # sort by model size
567
+ # change column type Parameters [M] to integer
568
+ free_systems_wer_per_subset['Parameters [M]'] = free_systems_wer_per_subset['Parameters [M]'].astype(int)
569
+
570
+ free_systems_wer_per_subset = free_systems_wer_per_subset.sort_values(by='Parameters [M]')
571
+
572
+ free_systems_wer_average_across_all_subsets = free_systems_wer_per_subset.groupby(['system', 'Parameters [M]'])['WER'].mean().reset_index()
573
+ # change column type Parameters [M] to integer
574
+ free_systems_wer_average_across_all_subsets['Parameters [M]'] = free_systems_wer_average_across_all_subsets['Parameters [M]'].astype(int)
575
+
576
+ # sort by model size
577
+ free_systems_wer_average_across_all_subsets = free_systems_wer_average_across_all_subsets.sort_values(by='Parameters [M]')
578
+
579
+ free_systems_wer = free_systems_wer_average_across_all_subsets
580
+
581
+ # use system name as index
582
+ free_systems_wer_to_show = free_systems_wer.set_index('system')
583
+
584
+ # sort by WER and round WER by value to 2 decimal places
585
+ free_systems_wer_to_show = free_systems_wer_to_show.sort_values(by='WER').round({'WER': 2})
586
+
587
+ # print dataframe in streamlit with average WER, system name and model size
588
+ st.dataframe(free_systems_wer_to_show)
589
+
590
+ # plot scatter plot with values of WER
591
+ # X axis is the model size (parameters [M])
592
+ # Y is the average WER
593
+ # make each point a different color
594
+ # provide legend with system names
595
+ fig, ax = plt.subplots()
596
+ for system in free_systems_wer['system'].unique():
597
+ subset = free_systems_wer[free_systems_wer['system'] == system]
598
+ ax.scatter(subset['Parameters [M]'], subset['WER'], label=system)
599
+ # Add text annotation for each point
600
+ for i, point in subset.iterrows():
601
+ ax.annotate(point['system'], (point['Parameters [M]'], point['WER']), textcoords="offset points", xytext=(-10,-10), ha='left', rotation=-30, fontsize=5)
602
+ ax.set_xlabel('Model Size [M]')
603
+ ax.set_ylabel('WER (%)')
604
+ ax.set_title('WER as a function of model size')
605
+ # decrease font size of the legend and place it outside the plot
606
+ ax.legend(title='System', bbox_to_anchor=(1.05, 1), loc='upper left')
607
+
608
+ st.pyplot(fig)
609
+
610
+ ##################################################################################################################################################
611
+ # WER per audio duration
612
+
613
+ # calculate average WER per audio duration bucket for the best and worse commercial and free systems
614
+ selected_systems = [free_system_with_best_wer, commercial_system_with_best_wer]
615
+
616
+ # filter out results for selected systems
617
+ df_per_sample_selected_systems = df_per_sample[df_per_sample['system'].isin(selected_systems)].copy()  # copy to avoid SettingWithCopyWarning below
618
+
619
+ # calculate average WER per audio duration for the best system
620
+ # add a column mapping each audio duration to the nearest bucket (in seconds)
621
+ audio_duration_buckets = [1,2,3,4,5,10,15,20,30,40,50,60]
622
+ # map audio duration to the closest bucket
623
+ df_per_sample_selected_systems['audio_duration_buckets'] = df_per_sample_selected_systems['audio_duration'].apply(lambda x: min(audio_duration_buckets, key=lambda y: abs(x-y)))
624
+
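+ # e.g. a 7.2 s recording falls into the 5 s bucket, a 12.6 s recording into the 15 s bucket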
625
+
626
+ # calculate average WER per audio duration bucket
627
+ df_per_sample_wer_audio = df_per_sample_selected_systems.groupby(['system', 'audio_duration_buckets'])['WER'].mean().reset_index()
628
+ # add column with number of samples for specific audio bucket size
629
+ df_per_sample_wer_audio['number_of_samples'] = df_per_sample_selected_systems.groupby(['system', 'audio_duration_buckets'])['WER'].count().values
630
+
631
+ df_per_sample_wer_audio = df_per_sample_wer_audio.sort_values(by='audio_duration_buckets')
632
+ # round values in WER column in df_per_sample_wer to 2 decimal places
633
+ df_per_sample_wer_audio['WER'] = df_per_sample_wer_audio['WER'].round(2)
634
+ # transform df_per_sample_wer. Use system values as columns, while audio_duration_buckets as main index
635
+ df_per_sample_wer_audio_pivot = df_per_sample_wer_audio.pivot(index='audio_duration_buckets', columns='system', values='WER')
636
+ df_per_sample_wer_audio_pivot = df_per_sample_wer_audio_pivot.round(2)
637
+
638
+ df_per_sample_wer_audio_pivot['number_of_samples'] = df_per_sample_wer_audio[df_per_sample_wer_audio['system']==free_system_with_best_wer].groupby('audio_duration_buckets')['number_of_samples'].sum().values
639
+
640
+ # put number_of_samples as the first column after index
641
+ df_per_sample_wer_audio_pivot = df_per_sample_wer_audio_pivot[['number_of_samples'] + [col for col in df_per_sample_wer_audio_pivot.columns if col != 'number_of_samples']]
642
+
643
+ # print dataframe in streamlit
644
+ st.dataframe(df_per_sample_wer_audio_pivot)
645
+
646
+ # plot scatter plot with values from df_per_sample_wer_pivot.
647
+ # each system should have a different color
648
+ # the size of the point should be proportional to the number of samples in the bucket
649
+ # the x axis should be the audio duration bucket
650
+ # the y axis should be the average WER
651
+ fig, ax = plt.subplots()
652
+ for system in selected_systems:
653
+ subset = df_per_sample_wer_audio[df_per_sample_wer_audio['system'] == system]
654
+ ax.scatter(subset['audio_duration_buckets'], subset['WER'], label=system, s=subset['number_of_samples']*0.5)
655
+ ax.set_xlabel('Audio Duration [s]')
656
+ ax.set_ylabel('WER (%)')
657
+ ax.set_title('WER as a function of audio duration')
658
+
659
+ # place legend outside the plot on the right
660
+ ax.legend(title='System', bbox_to_anchor=(1.05, 1), loc='upper left')
661
+ st.pyplot(fig)
662
+
663
+ ##################################################################################################################################################
664
+ # WER per speech rate
665
+
666
+
667
+ # speech rate chars unique values
668
+ audio_feature_to_analyze = 'speech_rate_words'
669
+ audio_feature_unit = ' [words/s]'
670
+ metric = 'WER'
671
+ metric_unit = ' [%]'
672
+ no_of_buckets = 10
673
+ # calculate average WER per audio duration bucket for the best and worse commercial and free systems
674
+ selected_systems = [free_system_with_best_wer, commercial_system_with_best_wer]
675
+
676
+ df_per_sample_wer_feature_pivot, df_per_sample_wer_feature = calculate_wer_per_audio_feature(df_per_sample, selected_systems, audio_feature_to_analyze, metric, no_of_buckets)
677
+
678
+ # print dataframe in streamlit
679
+ st.dataframe(df_per_sample_wer_feature_pivot)
680
+
681
+ # plot scatter plot with values from df_per_sample_wer_pivot.
682
+ # each system should have a different color
683
+ # the size of the point should be proportional to the number of samples in the bucket
684
+ # the x axis should be the audio duration bucket
685
+ # the y axis should be the average WER
686
+ fig, ax = plt.subplots()
687
+ for system in selected_systems:
688
+ subset = df_per_sample_wer_feature[df_per_sample_wer_feature['system'] == system]
689
+ ax.scatter(subset[audio_feature_to_analyze], subset[metric], label=system, s=subset['number_of_samples']*0.5)
690
+ ax.set_xlabel(audio_feature_to_analyze.replace('_',' ').capitalize() + audio_feature_unit)
691
+ ax.set_ylabel(metric + metric_unit)
692
+ ax.set_title('WER as a function of speech rate')
693
+
694
+ # let matplotlib pick the best legend location
695
+ ax.legend(title='System', loc='best')
696
+ st.pyplot(fig)
697
+
698
+
699
+ ################################################################################################################################################
700
+ # WER PER GENDER
701
+
702
+ #selected_systems = [free_system_with_best_wer, commercial_system_with_best_wer, free_system_with_worst_wer, commercial_system_with_worst_wer]
703
+ selected_systems = df_per_sample['system'].unique()
704
+
705
+ df_per_sample_wer_gender_pivot, df_available_samples_per_category_per_system, no_samples_per_category = calculate_wer_per_meta_category(df_per_sample, selected_systems, 'WER', 'speaker_gender')
706
+ #print(df_per_sample_wer_gender_pivot)
707
+ #print(no_samples_per_category)
708
+
709
+ # print dataframe in streamlit
710
+ st.write("Number of samples per category")
711
+ for system in selected_systems:
712
+ st.write(f"System: {system}")
713
+ df_available_samples_per_category = df_available_samples_per_category_per_system[system]
714
+ st.dataframe(df_available_samples_per_category)
715
+
716
+ st.write("Number of samples analyzed per category - {}".format(no_samples_per_category))
717
+ st.dataframe(df_per_sample_wer_gender_pivot)
718
+
719
+
720
+ #print(difference_values)
721
+ #print(selected_systems)
722
+
723
+ # create the scatter plot
724
+ # the x axis shows the difference in WER values
725
+ # the y axis shows the systems from selected_systems
726
+ # each system should have a different color
727
+ fig, ax = plt.subplots()
728
+ difference_values = df_per_sample_wer_gender_pivot['Difference'][:-3]  # skip the 3 summary rows (median/average/std)
729
+ selected_systems = df_per_sample_wer_gender_pivot.index[:-3]
730
+ ax.scatter(difference_values, selected_systems, c=range(len(selected_systems)), cmap='viridis')
731
+ ax.set_ylabel('ASR System')
732
+ ax.set_xlabel('Difference in WER across speaker gender')
733
+ ax.set_title('ASR systems performance bias across genders')
734
+ # add labels with the difference in WER values
735
+ for i, txt in enumerate(difference_values):
736
+ ax.annotate(txt, (difference_values.iloc[i], selected_systems[i]), fontsize=5, ha='right')
737
+ st.pyplot(fig)
738
+
739
+ #####################################################################################################################################################################################
740
+ # WER per age
741
+ df_per_sample_wer_age_pivot, df_available_samples_per_category_per_system, no_samples_per_category = calculate_wer_per_meta_category(df_per_sample, selected_systems,'WER','speaker_age')
742
+ #print(df_per_sample_wer_age_pivot)
743
+ #print(no_samples_per_category)
744
+
745
+ # print dataframe in streamlit
746
+ st.write("Number of samples per category")
747
+ for system in selected_systems:
748
+ st.write(f"System: {system}")
749
+ df_available_samples_per_category = df_available_samples_per_category_per_system[system]
750
+ st.dataframe(df_available_samples_per_category)
751
+
752
+ st.write("Number of samples analyzed per category - {}".format(no_samples_per_category))
753
+
754
+ st.write("WER per age")
755
+ st.dataframe(df_per_sample_wer_age_pivot)
756
+
757
+ # extract columns from df_per_sample_wer_age_pivot for selected_systems (skip the last 3 values corresponding to median, average and std values)
758
+
759
+ #print(selected_systems)
760
+
761
+ # create the scatter plot
762
+ # the x axis shows the standard deviation of WER values
763
+ # the y axis shows the systems from selected_systems
764
+ # each system should have a different color
765
+ fig, ax = plt.subplots()
766
+ std_dev_values = df_per_sample_wer_age_pivot['Std Dev'][:-3]  # skip the 3 summary rows (median/average/std)
767
+ selected_systems = df_per_sample_wer_age_pivot.index[:-3]
768
+ ax.scatter(std_dev_values, selected_systems, c=range(len(selected_systems)), cmap='viridis')
769
+ ax.set_ylabel('ASR System')
770
+ ax.set_xlabel('Standard deviation of WER across speaker age groups')
771
+ ax.set_title('ASR systems performance bias across age groups')
772
+ # add labels with the standard deviation of WER values
773
+ for i, txt in enumerate(std_dev_values):
774
+ ax.annotate(txt, (std_dev_values.iloc[i], selected_systems[i]), fontsize=5, ha='right')
775
+ st.pyplot(fig)
776
+
777
+ # READ vs CONVERSATIONAL SPEECH AVERAGE WER
778
+
779
+ # Hallucinations rate per system
780
+
781
+ with interactive_comparison:
782
+
783
+
784
+
785
+ st.title("Interactive comparison of ASR Systems performance")
786
+ st.markdown(COMPARISON_INFO, unsafe_allow_html=True)
787
+
788
+ st.title("Plots for analyzing ASR Systems performance")
789
+
790
+ datasets = datasets_secret + datasets_public
791
+
792
+ dataset = st.selectbox("Select Dataset", datasets, index=datasets.index('amu-cai/pl-asr-bigos-v2-secret'), key="select_dataset_interactive_comparison")
793
+
794
+ # read the latest results for the selected dataset
795
+ print("Reading the latest results for dataset: ", dataset)
796
+ df_per_sample_all, df_per_dataset_all = read_latest_results(dataset, split, codename_to_shortname_mapping)
797
+ # filter only the ref_type and norm_type we want to analyze
798
+ df_per_sample = df_per_sample_all[(df_per_sample_all["ref_type"] == ref_type) & (df_per_sample_all["norm_type"] == norm_type)]
799
+ # filter only the ref_type and norm_type we want to analyze
800
+ df_per_dataset = df_per_dataset_all[(df_per_dataset_all["ref_type"] == ref_type) & (df_per_dataset_all["norm_type"] == norm_type)]
801
+
802
+ evaluated_systems_list = df_per_sample["system"].unique()
803
+ print(evaluated_systems_list)
804
+ df_evaluated_systems = retrieve_asr_systems_meta_from_the_catalog(evaluated_systems_list)
805
+ print(df_evaluated_systems)
806
+
807
+ # read available options to analyze for specific dataset
808
+ splits = list(df_per_dataset_all['subset'].unique()) # Get the unique subsets
809
+ norm_types = list(df_per_dataset_all['norm_type'].unique()) # Get the unique norm_types
810
+ ref_types = list(df_per_dataset_all['ref_type'].unique()) # Get the unique ref_types
811
+ systems = list(df_per_dataset_all['system'].unique()) # Get the unique systems
812
+ metrics = list(df_per_dataset_all.columns[7:]) # Get the unique metrics
813
+
814
+ # Select the system to display. More than 1 system can be selected.
815
+ systems_selected = st.multiselect("Select ASR Systems", systems)
816
+
817
+ # Select the metric to display
818
+ metric = st.selectbox("Select Metric", metrics, index=metrics.index('WER'))
819
+
820
+ # Select the normalization type
821
+ norm_type = st.selectbox("Select Normalization Type", norm_types, index=norm_types.index('all'))
822
+ # Select the reference type
823
+ ref_type = st.selectbox("Select Reference Type", ref_types, index=ref_types.index('orig'))
824
+
825
+ enable_labels = st.checkbox("Enable labels on radar plot", value=True)
826
+
827
+ enable_bar_chart = st.checkbox("Enable bar chart", value=True)
828
+ enable_polar_plot = st.checkbox("Enable radar plot", value=True)
829
+
830
+ orientation = st.selectbox("Select orientation", ["vertical", "horizontal"], index=0)
831
+
832
+ if enable_polar_plot:
833
+ if metric:
834
+ if systems_selected:
835
+ create_radar_plot(df_per_dataset_all, enable_labels, systems_selected, metric, norm_type, ref_type)
836
+
837
+ if enable_bar_chart:
838
+ if metric:
839
+ if systems_selected:
840
+ create_bar_chart(df_per_dataset_all, systems_selected , metric, norm_type, ref_type, orientation)
app_utils.py ADDED
@@ -0,0 +1,98 @@
1
+ import pandas as pd
2
+ import streamlit as st
3
+
4
+ from pandas.api.types import (
5
+ is_categorical_dtype,
6
+ is_datetime64_any_dtype,
7
+ is_numeric_dtype,
8
+ is_object_dtype,
9
+ )
10
+
11
+ def calculate_height_to_display(df):
12
+ # Calculate the height of the DataFrame display area
13
+ num_rows = df.shape[0]
14
+ row_height = 35 # Estimate of row height in pixels, adjust based on your layout/theme
15
+ header_height = 35 # Estimate of header height in pixels
16
+ calculated_height = num_rows * row_height + header_height
17
+
18
+ return calculated_height
19
+
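+ # e.g. a 10-row dataframe -> 10 * 35 + 35 = 385 px, suitable for st.dataframe(height=...)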
20
+ def filter_dataframe(df: pd.DataFrame, target) -> pd.DataFrame:
21
+ """
22
+ Adds a UI on top of a dataframe to let viewers filter columns
23
+
24
+ Args:
25
+ df (pd.DataFrame): Original dataframe
26
+
27
+ Returns:
28
+ pd.DataFrame: Filtered dataframe
29
+ """
30
+ if(target == "datasets"):
31
+ modify = st.checkbox("Enable filters to browse ASR speech data catalog")
32
+ elif(target == "benchmarks"):
33
+ modify = st.checkbox("Enable filters to browse ASR benchmarks catalog")
34
+ else:
35
+ print("Invalid target")
36
+
37
+ if not modify:
38
+ return df
39
+
40
+ df = df.copy()
41
+
42
+ # Try to convert datetimes into a standard format (datetime, no timezone)
43
+ for col in df.columns:
44
+ if is_object_dtype(df[col]):
45
+ try:
46
+ df[col] = pd.to_datetime(df[col])
47
+ except Exception:
48
+ pass
49
+
50
+ if is_datetime64_any_dtype(df[col]):
51
+ df[col] = df[col].dt.tz_localize(None)
52
+
53
+ modification_container = st.container()
54
+
55
+ with modification_container:
56
+ to_filter_columns = st.multiselect("Filter dataframe on", df.columns)
57
+ for column in to_filter_columns:
58
+ left, right = st.columns((1, 20))
59
+ # Treat columns with < 10 unique values as categorical
60
+ if is_categorical_dtype(df[column]) or df[column].nunique() < 10:
61
+ user_cat_input = right.multiselect(
62
+ f"Values for {column}",
63
+ df[column].unique(),
64
+ default=list(df[column].unique()),
65
+ )
66
+ df = df[df[column].isin(user_cat_input)]
67
+ elif is_numeric_dtype(df[column]):
68
+ _min = float(df[column].min())
69
+ _max = float(df[column].max())
70
+ step = (_max - _min) / 100
71
+ user_num_input = right.slider(
72
+ f"Values for {column}",
73
+ min_value=_min,
74
+ max_value=_max,
75
+ value=(_min, _max),
76
+ step=step,
77
+ )
78
+ df = df[df[column].between(*user_num_input)]
79
+ elif is_datetime64_any_dtype(df[column]):
80
+ user_date_input = right.date_input(
81
+ f"Values for {column}",
82
+ value=(
83
+ df[column].min(),
84
+ df[column].max(),
85
+ ),
86
+ )
87
+ if len(user_date_input) == 2:
88
+ user_date_input = tuple(map(pd.to_datetime, user_date_input))
89
+ start_date, end_date = user_date_input
90
+ df = df.loc[df[column].between(start_date, end_date)]
91
+ else:
92
+ user_text_input = right.text_input(
93
+ f"Substring or regex in {column}",
94
+ )
95
+ if user_text_input:
96
+ df = df[df[column].astype(str).str.contains(user_text_input)]
97
+
98
+ return df
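+ # Usage sketch (hypothetical dataframe): df_view = filter_dataframe(df_catalog, target="datasets")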
constants.py ADDED
@@ -0,0 +1,15 @@
1
+ ABOUT_INFO = "BIGOS (Benchmark Intended Grouping of Open Speech) represents the most extensive evaluation of Polish ASR (Automatic Speech Recognition) systems to date.<br> \
2
+ This benchmark is a collaborative effort by the [AMU-CAI team](https://huggingface.co/amu-cai), aimed at providing a thorough and comprehensive assessment of various Polish ASR systems."
3
+
4
+
5
+ BIGOS_INFO = "BIGOS (Benchmark Intended Grouping of Open Speech) is the collection of freely available speech datasets curated by the [AMU-CAI team](https://huggingface.co/amu-cai). \
6
+ More details can be found [here](https://huggingface.co/datasets/amu-cai/pl-asr-bigos-v2)"
7
+
8
+ PELCRA_INFO = "PELCRA for BIGOS is the subset of speech corpora created by the [PELCRA group](http://pelcra.pl/new/), curated for the BIGOS benchmark by the [AMU-CAI team](https://huggingface.co/amu-cai). \
9
+ More details can be found [here](https://huggingface.co/datasets/pelcra/pl-asr-pelcra-for-bigos)"
10
+
11
+ ANALYSIS_INFO = "Under construction"
12
+
13
+ INSPECTION_INFO = "Under construction"
14
+
15
+ COMPARISON_INFO = "Under construction"
playground-eval-dash.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,2 @@
1
+ seaborn
2
+
utils.py ADDED
@@ -0,0 +1,370 @@
1
+ import pandas as pd
2
+ import streamlit as st
3
+ import seaborn as sns
4
+ import matplotlib.pyplot as plt
5
+ import os
6
+ import requests
7
+ import numpy as np
8
+ from datasets import Dataset
9
+ from huggingface_hub import hf_hub_download
10
+
11
+ def download_tsv_from_google_sheet(sheet_url):
12
+ # Modify the Google Sheet URL to export it as TSV
13
+ tsv_url = sheet_url.replace('/edit#gid=', '/export?format=tsv&gid=')
14
+
15
+ # Send a GET request to download the TSV file
16
+ response = requests.get(tsv_url)
17
+ response.encoding = 'utf-8'
18
+
19
+ # Check if the request was successful
20
+ if response.status_code == 200:
21
+ # Read the TSV content into a pandas DataFrame
22
+ from io import StringIO
23
+ tsv_content = StringIO(response.text)
24
+ df = pd.read_csv(tsv_content, sep='\t', encoding='utf-8')
25
+ return df
26
+ else:
27
+ print("Failed to download the TSV file.")
28
+ return None
29
+
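+ # Usage sketch (hypothetical sheet id/gid):
+ # df = download_tsv_from_google_sheet("https://docs.google.com/spreadsheets/d/<ID>/edit#gid=<GID>")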
30
+ def generate_path_to_latest_tsv(dataset_name, split, type_of_result):
31
+ fn = os.path.join("./data", dataset_name, split, "eval_results-{}-latest.tsv".format(type_of_result))
32
+ #print(fn)
33
+ return(fn)
34
+
35
+ @st.cache_data
36
+ def read_latest_results(dataset_name, split, codename_to_shortname_mapping):
37
+
38
+ # Set your Hugging Face API token as an environment variable
39
+ # Define the path to your dataset directory
40
+ repo_id = os.getenv('HF_SECRET_REPO_ID')
41
+ #"michaljunczyk/bigos-eval-results-secret"
42
+
43
+ dataset = dataset_name
44
+
45
+ dataset_path = os.path.join("leaderboard_input", dataset, split)
46
+ print(dataset_path)
47
+
48
+ fn_results_per_dataset = 'eval_results-per_dataset-latest.tsv'
49
+ fn_results_per_sample = 'eval_results-per_sample-latest.tsv'
50
+
51
+ fp_results_per_dataset_repo = os.path.join(dataset_path, fn_results_per_dataset)
52
+ print(fp_results_per_dataset_repo)
53
+ fp_results_per_sample_repo = os.path.join(dataset_path, fn_results_per_sample)
54
+
55
+ # Download the file from the Hugging Face Hub
56
+ local_fp_per_dataset = hf_hub_download(repo_id=repo_id, repo_type='dataset', filename=fp_results_per_dataset_repo, use_auth_token=os.getenv('HF_TOKEN'))
57
+ local_fp_per_sample = hf_hub_download(repo_id=repo_id, repo_type='dataset', filename=fp_results_per_sample_repo, use_auth_token=os.getenv('HF_TOKEN'))
58
+
59
+ # Read the TSV file into a pandas DataFrame
60
+ df_per_dataset = pd.read_csv(local_fp_per_dataset, delimiter='\t')
61
+ df_per_sample = pd.read_csv(local_fp_per_sample, delimiter='\t')
62
+
63
+ # Print the DataFrame
64
+ print(df_per_dataset)
65
+ print(df_per_sample)
66
+
67
+ #replace column system with Shortname
68
+ if (codename_to_shortname_mapping):
69
+ df_per_sample['system'] = df_per_sample['system'].replace(codename_to_shortname_mapping)
70
+ df_per_dataset['system'] = df_per_dataset['system'].replace(codename_to_shortname_mapping)
71
+
72
+ return df_per_sample, df_per_dataset
73
+
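+ # Note: two environment variables must be set for the download to work:
+ #   HF_SECRET_REPO_ID - id of the private dataset repo with evaluation results
+ #   HF_TOKEN          - an access token with read permission for that repo
+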
+ @st.cache_data
+ def retrieve_asr_systems_meta_from_the_catalog(asr_systems_list):
+     # Read the catalog of ASR systems from a public Google Sheet
+     asr_systems_cat_url = "https://docs.google.com/spreadsheets/d/1fVsE98Ulmt-EIEe4wx8sUdo7RLigDdAVjQxNpAJIrH8/edit#gid=681521237"
+     catalog = download_tsv_from_google_sheet(asr_systems_cat_url)
+
+     # Keep only the systems we are interested in
+     catalog = catalog[(catalog["Codename"].isin(asr_systems_list)) | (catalog["Shortname"].isin(asr_systems_list))]
+
+     return catalog
+
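+ # The catalog is expected to provide at least the "Codename" and "Shortname"
+ # columns; a row is kept when either column matches an entry in asr_systems_list.
+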
+ def basic_stats_per_dimension(df_input, metric, dimension):
+     # Median value
+     df_median = df_input.groupby(dimension)[metric].median().round(2)
+
+     # Average value
+     df_avg = df_input.groupby(dimension)[metric].mean().round(2)
+
+     # Standard deviation
+     df_std = df_input.groupby(dimension)[metric].std().round(2)
+
+     # Min
+     df_min = df_input.groupby(dimension)[metric].min().round(2)
+
+     # Max
+     df_max = df_input.groupby(dimension)[metric].max().round(2)
+
+     # Concatenate all statistics; pd.concat aligns the series on the dimension index
+     df_stats = pd.concat([df_median, df_avg, df_std, df_min, df_max], axis=1)
+     df_stats.columns = ["med_{}".format(metric), "avg_{}".format(metric), "std_{}".format(metric), "min_{}".format(metric), "max_{}".format(metric)]
+
+     # Sort by median values
+     df_stats = df_stats.sort_values(by="med_{}".format(metric))
+
+     return df_stats
+
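+ # Usage sketch (metric and dimension names are illustrative):
+ #   stats = basic_stats_per_dimension(df_per_sample, "WER", "system")
+ #   -> columns med_WER, avg_WER, std_WER, min_WER, max_WER, sorted by med_WER
+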
+ def ser_from_per_sample_results(df_per_sample, dimension):
+     # Group by dimension (e.g. dataset or system) and calculate the Sentence
+     # Error Rate (SER): the percentage of samples with a non-zero WER
+     df_ser = df_per_sample.groupby(dimension)["WER"].apply(lambda x: (x != 0).mean() * 100).sort_values().round(2)
+     df_ser.name = "SER"
+     return df_ser
+
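+ # Worked example: for WER values [0, 12.5, 0, 40.0] within one group,
+ # (x != 0).mean() * 100 gives 50.0, i.e. half of the sentences contain errors.
+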
+ def get_total_audio_duration(df_per_sample):
+     # Keep only unique audio recordings (each recording appears once per evaluated system)
+     df_per_sample_unique_audio = df_per_sample.drop_duplicates(subset='id')
+     # Total size of the dataset in hours, based on per-recording durations in seconds
+     total_duration_hours = df_per_sample_unique_audio['audio_duration'].sum() / 3600
+     return total_duration_hours
+
+ def extend_meta_per_sample_words_chars(df_per_sample):
+     # Extend the results with the number of words in the reference and hypothesis
+     df_per_sample['ref_words'] = df_per_sample['ref'].apply(lambda x: len(x.split()))
+     df_per_sample['hyp_words'] = df_per_sample['hyp'].apply(lambda x: len(x.split()))
+
+     # Words per second (based on the audio duration) for reference and hypothesis
+     df_per_sample['ref_wps'] = (df_per_sample['ref_words'] / df_per_sample['audio_duration']).round(2)
+     df_per_sample['hyp_wps'] = (df_per_sample['hyp_words'] / df_per_sample['audio_duration']).round(2)
+
+     # Characters per second for reference and hypothesis
+     df_per_sample['ref_cps'] = (df_per_sample['ref'].str.len() / df_per_sample['audio_duration']).round(2)
+     df_per_sample['hyp_cps'] = (df_per_sample['hyp'].str.len() / df_per_sample['audio_duration']).round(2)
+
+     # Characters per word for reference and hypothesis
+     df_per_sample['ref_cpw'] = (df_per_sample['ref'].str.len() / df_per_sample['ref_words']).round(2)
+     df_per_sample['hyp_cpw'] = (df_per_sample['hyp'].str.len() / df_per_sample['hyp_words']).round(2)
+
+     return df_per_sample
+
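+ # Assumes the per-sample frame provides 'ref' and 'hyp' as strings and
+ # 'audio_duration' in seconds; adds *_words, *_wps, *_cps and *_cpw columns.
+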
+ def filter_top_outliers(df_input, metric, max_threshold):
+     # Filter out outliers exceeding max_threshold
+     df_filtered = df_input[df_input[metric] < max_threshold]
+     return df_filtered
+
+ def filter_bottom_outliers(df_input, metric, min_threshold):
+     # Filter out outliers below min_threshold
+     df_filtered = df_input[df_input[metric] > min_threshold]
+     return df_filtered
+
+ def box_plot_per_dimension(df_input, metric, dimension, title, xlabel, ylabel):
+     plt.figure(figsize=(20, 10))
+
+     # Box plot of the metric per dimension, ordered by median, without outliers
+     sns.boxplot(x=dimension, y=metric, data=df_input, order=df_input.groupby(dimension)[metric].median().sort_values().index, showfliers=False)
+
+     plt.title(title)
+     plt.xlabel(xlabel)
+     plt.ylabel(ylabel)
+     plt.xticks(rotation=90)
+     # Return the pyplot module so the caller can render the current figure
+     return plt
+
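+ # Usage sketch in the Streamlit app (arguments are illustrative):
+ #   fig = box_plot_per_dimension(df_per_sample, "WER", "system",
+ #                                "WER distribution per system", "system", "WER")
+ #   st.pyplot(fig)
+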
+ def check_impact_of_normalization(data_in, ref_type='orig'):
+     # Filter the data to include only the specific reference type
+     data_ref_type = data_in[data_in['ref_type'] == ref_type]
+     data = data_ref_type.drop(columns=['system', 'subset', 'ref_type'])
+
+     # Calculate the average impact of each normalization type on the metrics
+     average_impact = data.groupby('norm_type').mean()
+     baseline_metrics = average_impact.loc['none']
+
+     # Difference in metrics compared to the baseline (no normalization)
+     difference_metrics = average_impact.subtract(baseline_metrics)
+
+     # Remove the baseline row for clarity
+     difference_metrics = difference_metrics.drop(index='none')
+
+     # Round the results to 2 decimal places
+     difference_metrics_rounded = difference_metrics.round(2)
+
+     # Add a column with the average impact on error reduction across all metric types
+     difference_metrics_rounded['Average'] = difference_metrics_rounded.mean(axis=1).round(2)
+
+     # Sort the results by the absolute value of the average impact (ascending)
+     difference_metrics_sorted_abs = difference_metrics_rounded.sort_values(by='Average', key=abs)
+
+     return difference_metrics_sorted_abs
+
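+ # Expected input (inferred from the code): one row per evaluation run with the
+ # columns 'system', 'subset', 'ref_type', 'norm_type' plus numeric metric
+ # columns; 'norm_type' must include a 'none' row that serves as the baseline.
+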
+ def calculate_wer_per_meta_category(df_per_sample, selected_systems, metric, analysis_dimension='speaker_gender'):
+     # Filter out rows where the analysis dimension is null
+     df_per_sample_dimension = df_per_sample[df_per_sample[analysis_dimension].notnull()]
+
+     meta_values = df_per_sample_dimension[analysis_dimension].unique()
+
+     if analysis_dimension == 'speaker_age':
+         # Sort the age categories into their natural order (teens, twenties, ..., nineties)
+         meta_values = sort_age_categories(meta_values)
+
+     # Determine the smallest number of available samples per category across all systems
+     min_samples = 0
+     df_available_samples_per_category_per_system = {}
+     for system in selected_systems:
+         df_per_sample_single_system = df_per_sample_dimension[df_per_sample_dimension['system'] == system]
+         df_available_samples_per_category_per_system[system] = df_per_sample_single_system.groupby(analysis_dimension)[metric].count().reset_index()
+         df_available_samples_per_category_per_system[system] = df_available_samples_per_category_per_system[system].rename(columns={metric: 'available_samples'})
+         df_available_samples_per_category_per_system[system] = df_available_samples_per_category_per_system[system].set_index(analysis_dimension)
+
+         min_samples_system = df_available_samples_per_category_per_system[system]['available_samples'].min()
+         if (min_samples_system < min_samples) or (min_samples == 0):
+             min_samples = min_samples_system
+
+     # Keep only the results for the systems under analysis
+     df_per_sample_selected_systems = df_per_sample_dimension[df_per_sample_dimension['system'].isin(selected_systems)]
+
+     # Draw an equal number of samples (min_samples) for each system and category,
+     # so that the categories are directly comparable
+     df_per_sample_selected_systems = df_per_sample_selected_systems.groupby(['system', analysis_dimension]).apply(lambda x: x.sample(min_samples)).reset_index(drop=True)
+
+     # Average metric per system and category
+     df_per_sample_metric_dimension = df_per_sample_selected_systems.groupby(['system', analysis_dimension])[metric].mean().round(2).reset_index()
+
+     df_per_sample_metric_dimension_pivot = df_per_sample_metric_dimension.pivot(index=analysis_dimension, columns='system', values=metric)
+     df_per_sample_metric_dimension_pivot = df_per_sample_metric_dimension_pivot.round(2)
+
+     # If there are only two values in the analysis dimension, report the difference between them
+     if len(meta_values) == 2:
+         gap_metrics = ['Difference']
+         df_per_sample_metric_dimension_pivot.loc[gap_metrics[0]] = df_per_sample_metric_dimension_pivot.loc[meta_values[0]] - df_per_sample_metric_dimension_pivot.loc[meta_values[1]]
+
+     # If there are more than two values, report dispersion statistics instead
+     elif len(meta_values) > 2:
+         gap_metrics = ['Std Dev', 'MAD', 'Range']
+
+         metrics = pd.DataFrame([])
+         df = df_per_sample_metric_dimension_pivot
+
+         # Standard deviation of the metric values across categories
+         metrics[gap_metrics[0]] = df.std()
+         # Mean absolute deviation of the metric values
+         metrics[gap_metrics[1]] = df.apply(lambda x: np.mean(np.abs(x - np.mean(x))), axis=0)
+         # Range: difference between the largest and smallest metric values
+         metrics[gap_metrics[2]] = df.max() - df.min()
+
+         metrics_t = metrics.round(2).transpose()
+
+         # Concatenate the dispersion statistics to the pivot table
+         df_per_sample_metric_dimension_pivot = pd.concat([df_per_sample_metric_dimension_pivot, metrics_t], axis=0)
+
+     # Transpose so that systems become rows and sort by the first gap metric
+     df_per_sample_metric_dimension_pivot = df_per_sample_metric_dimension_pivot.transpose()
+     df_per_sample_metric_dimension_pivot = df_per_sample_metric_dimension_pivot.sort_values(by=gap_metrics[0], axis=0)
+
+     # Add median, average, and standard deviation across systems as the last 3 rows
+     avg_difference = df_per_sample_metric_dimension_pivot.mean().round(2)
+     median_difference = df_per_sample_metric_dimension_pivot.median().round(2)
+     std_difference = df_per_sample_metric_dimension_pivot.std().round(2)
+
+     df_per_sample_metric_dimension_pivot.loc['median'] = median_difference
+     df_per_sample_metric_dimension_pivot.loc['average'] = avg_difference
+     df_per_sample_metric_dimension_pivot.loc['std'] = std_difference
+
+     analyzed_samples_per_category = min_samples
+
+     # Round all values to 2 decimal places
+     df_per_sample_metric_dimension_pivot = df_per_sample_metric_dimension_pivot.round(2)
+
+     # Keep the column order as in the meta_values list, followed by the gap metrics
+     columns = list(meta_values) + gap_metrics
+     df_per_sample_metric_dimension_pivot = df_per_sample_metric_dimension_pivot[columns]
+
+     return df_per_sample_metric_dimension_pivot, df_available_samples_per_category_per_system, analyzed_samples_per_category
+
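+ # Note: the per-category scores are computed on a random, equally sized sample
+ # for every (system, category) pair, so repeated calls may return slightly
+ # different values; pass random_state to x.sample(...) for reproducible output.
+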
+ def sort_age_categories(meta_values):
+     # Age labels as they appear in the speaker metadata (note the 'fourties' spelling)
+     order = ["teens", "twenties", "thirties", "fourties", "fifties", "sixties", "seventies", "eighties", "nineties"]
+     order_dict = {age: index for index, age in enumerate(order)}
+
+     # Unknown labels are sorted to the end
+     sorted_values = sorted(meta_values, key=lambda x: order_dict.get(x, float('inf')))
+     return sorted_values
+
+ def calculate_wer_per_audio_feature(df_per_sample, selected_systems, audio_feature_to_analyze, metric, no_of_buckets):
+     feature_values_uniq = df_per_sample[audio_feature_to_analyze].unique()
+
+     # Keep only the results for the selected systems
+     df_per_sample_selected_systems = df_per_sample[df_per_sample['system'].isin(selected_systems)].copy()
+
+     # Create evenly spaced buckets spanning the observed feature values (min, max, step)
+     min_feature_value = round(min(feature_values_uniq), 1)
+     max_feature_value = round(max(feature_values_uniq), 1)
+     step = max_feature_value / no_of_buckets
+     audio_feature_buckets = [min_feature_value + i * step for i in range(no_of_buckets)]
+
+     # Map each feature value to the closest bucket
+     feature_bucket_col = audio_feature_to_analyze + '_bucket'
+     df_per_sample_selected_systems[feature_bucket_col] = df_per_sample_selected_systems[audio_feature_to_analyze].apply(
+         lambda x: min(audio_feature_buckets, key=lambda y: abs(x - y)))
+
+     # Average metric per system and feature bucket
+     df_per_sample_wer_feature = df_per_sample_selected_systems.groupby(['system', feature_bucket_col])[metric].mean().reset_index()
+     # Number of samples per system and feature bucket
+     df_per_sample_wer_feature['number_of_samples'] = df_per_sample_selected_systems.groupby(['system', feature_bucket_col])[metric].count().values
+
+     df_per_sample_wer_feature = df_per_sample_wer_feature.sort_values(by=feature_bucket_col)
+     # Round the metric values to 2 decimal places
+     df_per_sample_wer_feature[metric] = df_per_sample_wer_feature[metric].round(2)
+
+     # Pivot so that systems become columns and the feature buckets the index
+     df_per_sample_wer_feature_pivot = df_per_sample_wer_feature.pivot(index=feature_bucket_col, columns='system', values=metric)
+     df_per_sample_wer_feature_pivot = df_per_sample_wer_feature_pivot.round(2)
+
+     df_per_sample_wer_feature_pivot['number_of_samples'] = df_per_sample_wer_feature[
+         df_per_sample_wer_feature['system'] == selected_systems[0]].groupby(feature_bucket_col)[
+         'number_of_samples'].sum().values
+
+     # Put number_of_samples as the first column after the index
+     df_per_sample_wer_feature_pivot = df_per_sample_wer_feature_pivot[
+         ['number_of_samples'] + [col for col in df_per_sample_wer_feature_pivot.columns if col != 'number_of_samples']]
+
+     return df_per_sample_wer_feature_pivot, df_per_sample_wer_feature
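+
+ # Usage sketch (feature name and bucket count are illustrative):
+ #   pivot, detail = calculate_wer_per_audio_feature(
+ #       df_per_sample, selected_systems, 'audio_duration', 'WER', no_of_buckets=10)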