Spaces:

ceyda
/

common-voice-explorer

Runtime error

App Files Files Community

Ceyda Cinarel committed on Mar 26, 2021

Commit

8af4bd8

•

1 Parent(s): 3eecce7

first commit

Browse files

Files changed (3) hide show

README.md +18 -2
common_voice.py +164 -0
requirements.txt +6 -0

README.md CHANGED Viewed

@@ -1,2 +1,18 @@
-# common-voice-explorer
-Common Voice Dataset explorer

+# Common Voice Dataset Explorer
+[Common Voice Dataset](https://commonvoice.mozilla.org/en/datasets) is by Mozilla
+Made during [huggingface finetuning week](https://discuss.huggingface.co/t/open-to-the-community-xlsr-wav2vec2-fine-tuning-week-for-low-resource-languages/4467)
+# Usage
+`pip install -r requirements.txt`
+`streamlit run common_voice.py`
+# Details
+- Made using streamlit
+- Using https://github.com/PablocFonseca/streamlit-aggrid for interactivity, because you can't click plots yet.
+I tried to put this together as quickly as I could, so it is not perfect.
+Open a PR or issue~

common_voice.py ADDED Viewed

	@@ -0,0 +1,164 @@

+import os
+# os.environ['TRANSFORMERS_CACHE'] = '/mnt/hf_cache'
+from datasets import load_dataset
+import streamlit as st
+import pandas as pd
+import numpy as np
+from st_aggrid import GridOptionsBuilder, AgGrid, GridUpdateMode, DataReturnMode, JsCode
+import plotly.express as px
+# pd.options.plotting.backend = "plotly"
+# TODO show average sentence length
+# TODO show audio stats
+# TODO speed things up with better caching
+# hide_menu_style = """
+#         <style>
+#         #MainMenu {visibility: hidden;}
+#         </style>
+#         """
+# st.markdown(hide_menu_style, unsafe_allow_html=True)
+@st.cache(suppress_st_warning=True)
+def cache_graph(dat,y,x,color=None):
+    """Return a Plotly Express bar chart of ``dat``, wrapped in ``st.cache``.
+
+    ``x``, ``y`` and ``color`` are forwarded unchanged to ``px.bar``.
+    """
+    # NOTE (from the original author): the caching here may not work correctly.
+    bar_figure = px.bar(dat, x=x, y=y, color=color)
+    return bar_figure
+@st.cache(suppress_st_warning=True)
+def cache_dataset(language,split=None):
+    """Load the "common_voice" dataset for ``language``, wrapped in ``st.cache``.
+
+    When ``split`` is falsy the object returned by ``load_dataset`` is handed
+    back as-is; otherwise it is converted to a ``pandas.DataFrame``.
+    """
+    loaded = load_dataset("common_voice", language, split=split)
+    if not split:
+        return loaded
+    return pd.DataFrame(loaded)
+# Language codes offered in the sidebar "Language code:" selector.
+language_codes=[
+    'ab', 'ar', 'as', 'br', 'ca', 'cnh', 'cs', 'cv', 'cy', 'de',
+    'dv', 'el', 'en', 'eo', 'es', 'et', 'eu', 'fa', 'fi', 'fr',
+    'fy-NL', 'ga-IE', 'hi', 'hsb', 'hu', 'ia', 'id', 'it', 'ja', 'ka',
+    'kab', 'ky', 'lg', 'lt', 'lv', 'mn', 'mt', 'nl', 'or', 'pa-IN',
+    'pl', 'pt', 'rm-sursilv', 'rm-vallader', 'ro', 'ru', 'rw', 'sah', 'sl', 'sv-SE',
+    'ta', 'th', 'tr', 'tt', 'uk', 'vi', 'vot', 'zh-CN', 'zh-HK', 'zh-TW',
+]
+# Passed as data_return_mode / update_mode to both AgGrid tables below.
+return_mode_value = DataReturnMode.AS_INPUT
+update_mode_value = GridUpdateMode.SELECTION_CHANGED
+def configure_grid_stat(df):
+    """Build the AgGrid options used for the statistics table of ``df``.
+
+    Columns are configured as non-editable, selection mode is "multiple"
+    without checkboxes (``rowMultiSelectWithClick=True``), and the grid
+    layout is 'normal'. Returns the object produced by ``build()``.
+    """
+    builder = GridOptionsBuilder.from_dataframe(df)
+    builder.configure_default_column(editable=False)
+    builder.configure_selection(
+        "multiple",
+        use_checkbox=False,
+        rowMultiSelectWithClick=True,
+        suppressRowDeselection=False,
+    )
+    # domLayout='autoHeight' was tried by the original author and left disabled.
+    builder.configure_grid_options(domLayout='normal')
+    return builder.build()
+def configure_grid_detail(df):
+    """Build the AgGrid options used for the per-row detail table of ``df``.
+
+    Columns are non-editable, the "sentence" column is configured with
+    ``initialPinned="left"``, selection mode is "single" without checkboxes,
+    pagination uses ``paginationAutoPageSize=True`` and the layout is 'normal'.
+    Returns the object produced by ``build()``.
+    """
+    builder = GridOptionsBuilder.from_dataframe(df)
+    builder.configure_default_column(editable=False)
+    builder.configure_column("sentence", initialPinned="left")
+    # Hiding "client_id" was tried by the original author and left disabled.
+    builder.configure_selection("single", use_checkbox=False)
+    builder.configure_pagination(paginationAutoPageSize=True)
+    builder.configure_grid_options(domLayout='normal')
+    detail_options = builder.build()
+    return detail_options
+# ---- Sidebar: language / split selection and per-split row counts ----
+st.sidebar.markdown("# Common Voice Explorer")
+st.sidebar.markdown('[Common Voice](https://commonvoice.mozilla.org/en/datasets) dataset by Mozilla')
+language=st.sidebar.selectbox("Language code:",language_codes)
+# The placeholder first shows a loading hint and is then reused for the split picker.
+placeholder = st.sidebar.empty()
+placeholder.markdown('Loading for the first time may take a while...downloading dataset :hourglass_flowing_sand:')
+# split=None returns the un-split dataset object; its keys are the available splits.
+dat = cache_dataset(language, split=None)
+split=placeholder.multiselect("Split:",list(dat.keys()),default="train")
+# Several selected splits are joined with "+" into one split string;
+# a single selection is unwrapped from its list.
+if len(split)>1:
+    split="+".join(split)
+elif split:
+    split=split[0]
+split_stat=pd.DataFrame(dat.num_rows.items(), columns=['split', 'num_rows'])
+fig =  cache_graph(split_stat,y='split',x='num_rows')
+st.sidebar.plotly_chart(fig, use_container_width=True,config=dict(displayModeBar=False))
+st.sidebar.markdown("Dataset Explorer by [Ceyda Cinarel](https://github.com/cceyda/common-voice-explorer)")
+# If every split was deselected, `split` is still an empty list. cache_dataset
+# would then return the un-split dataset object instead of a DataFrame and the
+# groupby below would fail, so stop this run with a hint instead.
+if not split:
+    st.warning("Select at least one split in the sidebar.")
+    st.stop()
+
+# ---- Main page: attribute distribution, then drill-down into matching rows ----
+chart_data = cache_dataset(language, split=split)
+cols=["accent", "age", "down_votes", "gender", "locale", "segment", "up_votes"]
+cols_other=["sentence","path","client_id"]
+st.markdown("# Stats")
+st.markdown("## Distribution")
+# st.markdown("x axis:first selection  color:second selection ")
+attributes=st.multiselect("Colums:",cols)
+if attributes:
+    # chart_data = chart_data.replace(r'^\s+$', "UNK", regex=True)
+    # One row per distinct combination of the chosen attributes, with its row count.
+    stats=chart_data.groupby(attributes).size().reset_index(name='counts')
+    col1, col2 = st.beta_columns(2)
+    # First attribute is the x axis; a second one (if any) colours the bars.
+    color=attributes[1] if len(attributes)>1 else None
+    fig = cache_graph(stats, x=attributes[0], y='counts',color=color)
+    col1.plotly_chart(fig, use_container_width=True)
+    gridOptions=configure_grid_stat(stats)
+    with col2:
+        selection=AgGrid(stats,
+                        data_return_mode=return_mode_value,
+                          update_mode=update_mode_value,
+                        fit_columns_on_grid_load=True,
+        #                  allow_unsafe_jscode=True,
+                         gridOptions=gridOptions
+                            )
+        st.write(":point_up: Click on the table to see details")
+    if selection['selected_rows']:
+        # Row mask: OR over the selected stats rows, each being the AND of
+        # "column == value" for every chosen attribute. Iterating `attributes`
+        # (rather than deleting "counts" from the row dict) leaves the rows
+        # returned by AgGrid unmodified.
+        condition=False
+        for r in selection['selected_rows']:
+            sub_cond=True
+            for a in attributes:
+                sub_cond&=(chart_data[a]==r[a])
+            condition|=sub_cond
+        detail_frame=chart_data[condition]
+        gridOptions=configure_grid_detail(detail_frame)
+        detail_selection=AgGrid(detail_frame,
+                            data_return_mode=return_mode_value,
+                              update_mode=update_mode_value,
+            #                  allow_unsafe_jscode=True,
+                             gridOptions=gridOptions
+                                )
+        if detail_selection['selected_rows']:
+            # Single-selection grid: play the audio and show the sentence of the chosen row.
+            example=detail_selection['selected_rows'][0]
+            st.audio(example["path"])
+            st.write(example["sentence"])
+        else:
+            st.write(":point_up: Click on the table to listen")
+else:
+    st.write(":point_up: Select a column or two")

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+datasets
+pandas
+numpy
+streamlit
+streamlit-aggrid
+plotly