Spaces:
Runtime error
Runtime error
Ceyda Cinarel
committed on
Commit
•
8af4bd8
1
Parent(s):
3eecce7
first commit
Browse files- README.md +18 -2
- common_voice.py +164 -0
- requirements.txt +6 -0
README.md
CHANGED
@@ -1,2 +1,18 @@
|
|
1 |
-
#
|
2 |
-
Common Voice Dataset
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Common Voice Dataset Explorer
|
2 |
+
[Common Voice Dataset](https://commonvoice.mozilla.org/en/datasets) is by Mozilla
|
3 |
+
|
4 |
+
|
5 |
+
Made during [huggingface finetuning week](https://discuss.huggingface.co/t/open-to-the-community-xlsr-wav2vec2-fine-tuning-week-for-low-resource-languages/4467)
|
6 |
+
|
7 |
+
# Usage
|
8 |
+
`pip install -r requirements.txt`
|
9 |
+
`streamlit run common_voice.py`
|
10 |
+
|
11 |
+
|
12 |
+
# Details
|
13 |
+
- Made using streamlit
|
14 |
+
- Using https://github.com/PablocFonseca/streamlit-aggrid for interactivity, because you can't click plots yet.
|
15 |
+
|
16 |
+
I put this together as quickly as I could, so it is not perfect.
|
17 |
+
Open a PR or issue~
|
18 |
+
|
common_voice.py
ADDED
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
# os.environ['TRANSFORMERS_CACHE'] = '/mnt/hf_cache'
|
3 |
+
from datasets import load_dataset
|
4 |
+
|
5 |
+
import streamlit as st
|
6 |
+
import pandas as pd
|
7 |
+
import numpy as np
|
8 |
+
|
9 |
+
from st_aggrid import GridOptionsBuilder, AgGrid, GridUpdateMode, DataReturnMode, JsCode
|
10 |
+
import plotly.express as px
|
11 |
+
|
12 |
+
# pd.options.plotting.backend = "plotly"
|
13 |
+
|
14 |
+
|
15 |
+
# TODO show average sentence length
|
16 |
+
# TODO show audio stats
|
17 |
+
# TODO speed better caching
|
18 |
+
|
19 |
+
# hide_menu_style = """
|
20 |
+
# <style>
|
21 |
+
# #MainMenu {visibility: hidden;}
|
22 |
+
# </style>
|
23 |
+
# """
|
24 |
+
# st.markdown(hide_menu_style, unsafe_allow_html=True)
|
25 |
+
|
26 |
+
@st.cache(suppress_st_warning=True)
def cache_graph(dat, y, x, color=None):
    """Build a Plotly Express bar chart, memoised through ``st.cache``.

    Args:
        dat: Data handed to ``px.bar`` as its first argument.
        y: Value forwarded as ``px.bar``'s ``y`` argument (callers in this
            file pass column names).
        x: Value forwarded as ``px.bar``'s ``x`` argument.
        color: Optional value forwarded as ``px.bar``'s ``color`` argument.

    Returns:
        The figure returned by ``px.bar``.
    """
    # Original author's caveat: the caching here may not work correctly.
    figure = px.bar(dat, x=x, y=y, color=color)
    return figure
|
30 |
+
|
31 |
+
@st.cache(suppress_st_warning=True)
def cache_dataset(language, split=None):
    """Load the ``common_voice`` dataset for one language, memoised via ``st.cache``.

    Args:
        language: Configuration name passed to ``load_dataset``.
        split: Split specification forwarded to ``load_dataset``. A falsy
            value means no particular split is requested.

    Returns:
        The raw ``load_dataset`` result when ``split`` is falsy; otherwise
        that result wrapped in a ``pandas.DataFrame``.
    """
    loaded = load_dataset("common_voice", language, split=split)
    if not split:
        return loaded
    return pd.DataFrame(loaded)
|
38 |
+
|
39 |
+
|
40 |
+
# Language codes offered in the sidebar "Language code:" selector.
language_codes = [
    'ab', 'ar', 'as', 'br', 'ca', 'cnh', 'cs', 'cv', 'cy', 'de',
    'dv', 'el', 'en', 'eo', 'es', 'et', 'eu', 'fa', 'fi', 'fr',
    'fy-NL', 'ga-IE', 'hi', 'hsb', 'hu', 'ia', 'id', 'it', 'ja', 'ka',
    'kab', 'ky', 'lg', 'lt', 'lv', 'mn', 'mt', 'nl', 'or', 'pa-IN',
    'pl', 'pt', 'rm-sursilv', 'rm-vallader', 'ro', 'ru', 'rw', 'sah', 'sl', 'sv-SE',
    'ta', 'th', 'tr', 'tt', 'uk', 'vi', 'vot', 'zh-CN', 'zh-HK', 'zh-TW',
]

# Passed as data_return_mode / update_mode to the AgGrid calls in this file.
return_mode_value = DataReturnMode.AS_INPUT
update_mode_value = GridUpdateMode.SELECTION_CHANGED
|
45 |
+
|
46 |
+
def configure_grid_stat(df):
    """Build the AgGrid options used for the aggregated statistics table.

    Columns are configured as non-editable and the selection mode is
    ``"multiple"`` without checkboxes.

    Args:
        df: DataFrame the grid options are derived from.

    Returns:
        The options object produced by ``GridOptionsBuilder.build()``.
    """
    builder = GridOptionsBuilder.from_dataframe(df)
    builder.configure_default_column(editable=False)
    builder.configure_selection(
        "multiple",
        use_checkbox=False,
        rowMultiSelectWithClick=True,
        suppressRowDeselection=False,
    )
    # The author also tried domLayout='autoHeight' and left it disabled.
    builder.configure_grid_options(domLayout='normal')
    return builder.build()
|
58 |
+
|
59 |
+
def configure_grid_detail(df):
    """Build the AgGrid options used for the per-row detail table.

    Columns are non-editable, the ``sentence`` column is configured with
    ``initialPinned="left"``, selection mode is ``"single"`` without
    checkboxes, and pagination uses an automatic page size.

    Args:
        df: DataFrame the grid options are derived from.

    Returns:
        The options object produced by ``GridOptionsBuilder.build()``.
    """
    builder = GridOptionsBuilder.from_dataframe(df)
    builder.configure_default_column(editable=False)
    builder.configure_column("sentence", initialPinned="left")
    # Hiding the client_id column was tried by the author and left disabled.
    builder.configure_selection("single", use_checkbox=False)
    builder.configure_pagination(paginationAutoPageSize=True)
    builder.configure_grid_options(domLayout='normal')
    return builder.build()
|
74 |
+
|
75 |
+
# --- Sidebar: language and split selection, plus a per-split row-count chart ---
st.sidebar.markdown("# Common Voice Explorer")
st.sidebar.markdown('[Common Voice](https://commonvoice.mozilla.org/en/datasets) dataset by Mozilla')

language = st.sidebar.selectbox("Language code:", language_codes)

# The placeholder first shows a loading notice; once the dataset is available
# the same slot is reused for the split picker.
placeholder = st.sidebar.empty()
placeholder.markdown('Loading for the first time may take a while...downloading dataset :hourglass_flowing_sand:')
dat = cache_dataset(language, split=None)

split_names = list(dat.keys())
# Only pre-select "train" when that split actually exists; otherwise the
# widget would reject a default that is not among its options.
split = placeholder.multiselect(
    "Split:",
    split_names,
    default=["train"] if "train" in split_names else None,
)

# Materialise the items view so DataFrame construction does not depend on how
# pandas treats dict views.
split_stat = pd.DataFrame(list(dat.num_rows.items()), columns=['split', 'num_rows'])
fig = cache_graph(split_stat, y='split', x='num_rows')
st.sidebar.plotly_chart(fig, use_container_width=True, config=dict(displayModeBar=False))
st.sidebar.markdown("Dataset Explorer by [Ceyda Cinarel](https://github.com/cceyda/common-voice-explorer)")

# With no split selected there is nothing to load as a DataFrame; the code
# further down needs one (it calls groupby), so halt this run with a hint
# instead of failing later.
if not split:
    st.warning("Select at least one split in the sidebar.")
    st.stop()

# Several splits are combined with "+"; a single split joins to itself.
split = "+".join(split)

chart_data = cache_dataset(language, split=split)
|
94 |
+
|
95 |
+
|
96 |
+
def _selected_records(grid_response):
    """Return the rows selected in an AgGrid response as a list of dicts.

    NOTE(review): the original code treated ``selected_rows`` as a list of
    dicts. Newer streamlit-aggrid releases are reported to return a DataFrame
    or ``None`` instead -- confirm against the installed version. All three
    shapes are accepted here.
    """
    rows = grid_response['selected_rows']
    if rows is None:
        return []
    if hasattr(rows, "to_dict"):
        return rows.to_dict("records")
    return list(rows)


# Columns offered for grouping, and the remaining per-clip columns.
cols = ["accent", "age", "down_votes", "gender", "locale", "segment", "up_votes"]
cols_other = ["sentence", "path", "client_id"]

st.markdown("# Stats")
st.markdown("## Distribution")

attributes = st.multiselect("Colums:", cols)

if attributes:
    # One row per distinct combination of the chosen columns, with its size.
    stats = chart_data.groupby(attributes).size().reset_index(name='counts')

    # Prefer st.columns when the installed Streamlit provides it and fall
    # back to the beta name the original code used.
    make_columns = getattr(st, "columns", None) or st.beta_columns
    col1, col2 = make_columns(2)

    # First chosen column drives the x axis, the second (if any) the colour.
    color = attributes[1] if len(attributes) > 1 else None
    fig = cache_graph(stats, x=attributes[0], y='counts', color=color)
    col1.plotly_chart(fig, use_container_width=True)

    gridOptions = configure_grid_stat(stats)
    with col2:
        selection = AgGrid(
            stats,
            data_return_mode=return_mode_value,
            update_mode=update_mode_value,
            fit_columns_on_grid_load=True,
            gridOptions=gridOptions,
        )
        # NOTE(review): placing this hint inside col2 is an assumption; the
        # original nesting could not be confirmed.
        st.write(":point_up: Click on the table to see details")

    selected_groups = _selected_records(selection)
    if selected_groups:
        # OR together one mask per selected group; each mask ANDs equality on
        # every grouping column. Only the grouping columns are read, so the
        # selected row dicts are neither mutated nor required to be free of
        # extra keys.
        condition = pd.Series(False, index=chart_data.index)
        for r in selected_groups:
            sub_cond = pd.Series(True, index=chart_data.index)
            for a in attributes:
                sub_cond &= (chart_data[a] == r[a])
            condition |= sub_cond

        detail_frame = chart_data[condition]
        gridOptions = configure_grid_detail(detail_frame)

        detail_selection = AgGrid(
            detail_frame,
            data_return_mode=return_mode_value,
            update_mode=update_mode_value,
            gridOptions=gridOptions,
        )
        chosen_clips = _selected_records(detail_selection)
        if chosen_clips:
            example = chosen_clips[0]
            st.audio(example["path"])
            st.write(example["sentence"])
        else:
            st.write(":point_up: Click on the table to listen")
else:
    st.write(":point_up: Select a column or two")
|
163 |
+
|
164 |
+
|
requirements.txt
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
datasets
|
2 |
+
pandas
|
3 |
+
numpy
|
4 |
+
streamlit
|
5 |
+
streamlit-aggrid
|
6 |
+
plotly
|