Ceyda Cinarel committed on
Commit
8af4bd8
•
1 Parent(s): 3eecce7

first commit

Browse files
Files changed (3) hide show
  1. README.md +18 -2
  2. common_voice.py +164 -0
  3. requirements.txt +6 -0
README.md CHANGED
@@ -1,2 +1,18 @@
1
- # common-voice-explorer
2
- Common Voice Dataset explorer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Common Voice Dataset Explorer
2
+ [Common Voice Dataset](https://commonvoice.mozilla.org/en/datasets) is by Mozilla
3
+
4
+
5
+ Made during [huggingface finetuning week](https://discuss.huggingface.co/t/open-to-the-community-xlsr-wav2vec2-fine-tuning-week-for-low-resource-languages/4467)
6
+
7
+ # Usage
8
+ `pip install -r requirements.txt`
9
+ `streamlit run common_voice.py`
10
+
11
+
12
+ # Details
13
+ - Made using streamlit
14
+ - Using https://github.com/PablocFonseca/streamlit-aggrid for interactivity, because you can't click plots yet.
15
+
16
+ I tried to put this together as quickly as I could, so it is not perfect.
17
+ Open a PR or issue~
18
+
common_voice.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ # os.environ['TRANSFORMERS_CACHE'] = '/mnt/hf_cache'
3
+ from datasets import load_dataset
4
+
5
+ import streamlit as st
6
+ import pandas as pd
7
+ import numpy as np
8
+
9
+ from st_aggrid import GridOptionsBuilder, AgGrid, GridUpdateMode, DataReturnMode, JsCode
10
+ import plotly.express as px
11
+
12
+ # pd.options.plotting.backend = "plotly"
13
+
14
+
15
+ # TODO show average sentence length
16
+ # TODO show audio stats
17
+ # TODO speed better caching
18
+
19
+ # hide_menu_style = """
20
+ # <style>
21
+ # #MainMenu {visibility: hidden;}
22
+ # </style>
23
+ # """
24
+ # st.markdown(hide_menu_style, unsafe_allow_html=True)
25
+
26
@st.cache(suppress_st_warning=True)
def cache_graph(dat, y, x, color=None):
    """Build a Plotly Express bar chart, wrapped in Streamlit's ``st.cache``.

    Args:
        dat: Data handed straight to ``px.bar`` (callers in this file pass
            pandas DataFrames).
        y: Column name used for the y axis.
        x: Column name used for the x axis.
        color: Optional column name used to colour the bars.

    Returns:
        The figure object produced by ``px.bar``.
    """
    # NOTE(original author): the caching here is suspected not to work
    # correctly; behaviour is left untouched.
    figure = px.bar(dat, y=y, x=x, color=color)
    return figure
30
+
31
@st.cache(suppress_st_warning=True)
def cache_dataset(language, split=None):
    """Load the ``common_voice`` dataset for ``language`` behind ``st.cache``.

    Args:
        language: Dataset configuration name (a Common Voice language code).
        split: Split specifier forwarded to ``load_dataset``. When falsy
            (``None``, empty string, empty list) no DataFrame conversion
            is performed.

    Returns:
        The raw ``load_dataset`` result when ``split`` is falsy (callers in
        this file use it as a mapping of split name to dataset); otherwise a
        ``pandas.DataFrame`` built from the loaded split.
    """
    loaded = load_dataset("common_voice", language, split=split)
    if not split:
        return loaded
    return pd.DataFrame(loaded)
38
+
39
+
40
# Common Voice language codes offered in the sidebar selector.
language_codes = [
    'ab', 'ar', 'as', 'br', 'ca', 'cnh', 'cs', 'cv', 'cy', 'de',
    'dv', 'el', 'en', 'eo', 'es', 'et', 'eu', 'fa', 'fi', 'fr',
    'fy-NL', 'ga-IE', 'hi', 'hsb', 'hu', 'ia', 'id', 'it', 'ja', 'ka',
    'kab', 'ky', 'lg', 'lt', 'lv', 'mn', 'mt', 'nl', 'or', 'pa-IN',
    'pl', 'pt', 'rm-sursilv', 'rm-vallader', 'ro', 'ru', 'rw', 'sah', 'sl', 'sv-SE',
    'ta', 'th', 'tr', 'tt', 'uk', 'vi', 'vot', 'zh-CN', 'zh-HK', 'zh-TW',
]

# Shared AgGrid settings used by both tables below: data is returned as it
# was passed in, and the app is notified when the row selection changes.
return_mode_value = DataReturnMode.AS_INPUT
update_mode_value = GridUpdateMode.SELECTION_CHANGED
45
+
46
def configure_grid_stat(df):
    """Return AgGrid options for the aggregated statistics table.

    Columns are read-only and several rows can be selected by plain clicks
    (no checkboxes); clicking a selected row again deselects it.

    Args:
        df: DataFrame the grid options are derived from.

    Returns:
        The grid-options object produced by ``GridOptionsBuilder.build()``.
    """
    builder = GridOptionsBuilder.from_dataframe(df)
    builder.configure_default_column(editable=False)
    builder.configure_selection(
        "multiple",
        use_checkbox=False,
        rowMultiSelectWithClick=True,
        suppressRowDeselection=False,
    )
    builder.configure_grid_options(domLayout='normal')
    return builder.build()
58
+
59
def configure_grid_detail(df):
    """Return AgGrid options for the per-clip detail table.

    Columns are read-only, the ``sentence`` column is initially pinned to
    the left, a single row can be selected, and pagination uses an
    automatically chosen page size.

    Args:
        df: DataFrame the grid options are derived from.

    Returns:
        The grid-options object produced by ``GridOptionsBuilder.build()``.
    """
    builder = GridOptionsBuilder.from_dataframe(df)
    builder.configure_default_column(editable=False)
    builder.configure_column("sentence", initialPinned="left")
    builder.configure_selection("single", use_checkbox=False)
    builder.configure_pagination(paginationAutoPageSize=True)
    builder.configure_grid_options(domLayout='normal')
    return builder.build()
74
+
75
# --- Sidebar: language / split selection and per-split row counts ---------
st.sidebar.markdown("# Common Voice Explorer")
st.sidebar.markdown('[Common Voice](https://commonvoice.mozilla.org/en/datasets) dataset by Mozilla')

language = st.sidebar.selectbox("Language code:", language_codes)

# The placeholder first shows a loading hint and is then replaced by the
# split selector once the dataset is available.
placeholder = st.sidebar.empty()
placeholder.markdown('Loading for the first time may take a while...downloading dataset :hourglass_flowing_sand:')
dat = cache_dataset(language, split=None)
split = placeholder.multiselect("Split:", list(dat.keys()), default="train")

# Bar chart of row counts per split. The items view is materialised into a
# list so the DataFrame constructor receives a plain sequence of pairs.
split_stat = pd.DataFrame(list(dat.num_rows.items()), columns=['split', 'num_rows'])
fig = cache_graph(split_stat, y='split', x='num_rows')
st.sidebar.plotly_chart(fig, use_container_width=True, config=dict(displayModeBar=False))
st.sidebar.markdown("Dataset Explorer by [Ceyda Cinarel](https://github.com/cceyda/common-voice-explorer)")

# With every split deselected, cache_dataset would return the all-splits
# object instead of a DataFrame and the statistics code further down would
# fail. Stop this script run with a hint instead.
if not split:
    st.warning("Select at least one split in the sidebar.")
    st.stop()

# One name is passed as-is, several are joined with "+" (joining a
# single-element list yields that element unchanged).
# NOTE(review): presumably the split-concatenation syntax of `datasets` - confirm.
split = "+".join(split)

chart_data = cache_dataset(language, split=split)
94
+
95
+
96
# --- Main panel: distribution statistics and per-clip details -------------
# Columns offered for the distribution breakdown.
cols = ["accent", "age", "down_votes", "gender", "locale", "segment", "up_votes"]
# Remaining per-clip columns, not offered for grouping.
cols_other = ["sentence", "path", "client_id"]

st.markdown("# Stats")
st.markdown("## Distribution")

attributes = st.multiselect("Columns:", cols)

if attributes:
    # Count rows for every combination of the selected attribute values.
    stats = chart_data.groupby(attributes).size().reset_index(name='counts')

    col1, col2 = st.beta_columns(2)

    # First selected attribute is the x axis; a second one (if any) colours the bars.
    color = attributes[1] if len(attributes) > 1 else None
    fig = cache_graph(stats, x=attributes[0], y='counts', color=color)
    col1.plotly_chart(fig, use_container_width=True)

    gridOptions = configure_grid_stat(stats)
    with col2:
        selection = AgGrid(
            stats,
            data_return_mode=return_mode_value,
            update_mode=update_mode_value,
            fit_columns_on_grid_load=True,
            gridOptions=gridOptions,
        )
        st.write(":point_up: Click on the table to see details")

    selected_stats = selection['selected_rows']
    if selected_stats:
        # Build a boolean mask that keeps every clip matching ANY selected
        # statistics row. Only the grouped attribute columns are compared, so
        # the "counts" column and any extra keys the grid component may add
        # to a row are ignored, and the returned rows are not mutated.
        condition = pd.Series(False, index=chart_data.index)
        for row in selected_stats:
            row_match = pd.Series(True, index=chart_data.index)
            for attribute in attributes:
                row_match &= chart_data[attribute] == row[attribute]
            condition |= row_match

        detail_frame = chart_data[condition]
        gridOptions = configure_grid_detail(detail_frame)

        detail_selection = AgGrid(
            detail_frame,
            data_return_mode=return_mode_value,
            update_mode=update_mode_value,
            gridOptions=gridOptions,
        )
        if detail_selection['selected_rows']:
            # Single-selection grid: play and print the chosen clip.
            example = detail_selection['selected_rows'][0]
            st.audio(example["path"])
            st.write(example["sentence"])
        else:
            st.write(":point_up: Click on the table to listen")
else:
    st.write(":point_up: Select a column or two")
163
+
164
+
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ datasets
2
+ pandas
3
+ numpy
4
+ streamlit
5
+ streamlit-aggrid
6
+ plotly