Spaces:
Build error
Build error
Yacine Jernite
committed on
Commit
•
f4b8e6e
1
Parent(s):
c500e3c
can only select available splits
Browse files
cache_dir/has_cache.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9e7d89146f736ca9852dd82abaa7d29225499d53ca16f7714cfa576915e0a7d7
|
3 |
+
size 3584
|
data_measurements/streamlit_utils.py
CHANGED
@@ -14,6 +14,7 @@
|
|
14 |
|
15 |
import statistics
|
16 |
|
|
|
17 |
import pandas as pd
|
18 |
import seaborn as sns
|
19 |
import streamlit as st
|
@@ -22,6 +23,8 @@ from st_aggrid import AgGrid, GridOptionsBuilder
|
|
22 |
from .dataset_utils import HF_DESC_FIELD, HF_FEATURE_FIELD, HF_LABEL_FIELD
|
23 |
st.set_option('deprecation.showPyplotGlobalUse', False)
|
24 |
|
|
|
|
|
25 |
def sidebar_header():
|
26 |
st.sidebar.markdown(
|
27 |
"""
|
@@ -29,16 +32,17 @@ def sidebar_header():
|
|
29 |
Right now this has a few pre-loaded datasets for which you can:
|
30 |
- view some general statistics about the text vocabulary, lengths, labels
|
31 |
- explore some distributional statistics to assess properties of the language
|
32 |
-
- view some comparison statistics and overview of the text distribution
|
33 |
-
|
34 |
-
The tool is in development, and will keep growing in utility and functionality 🤗🚧
|
35 |
""",
|
36 |
unsafe_allow_html=True,
|
37 |
)
|
38 |
|
39 |
|
40 |
def sidebar_selection(ds_name_to_dict, column_id):
|
41 |
-
ds_names = list(ds_name_to_dict.keys())
|
|
|
42 |
with st.sidebar.expander(f"Choose dataset and field {column_id}", expanded=True):
|
43 |
# choose a dataset to analyze
|
44 |
ds_name = st.selectbox(
|
@@ -52,6 +56,7 @@ def sidebar_selection(ds_name_to_dict, column_id):
|
|
52 |
config_names = ['en','en.noblocklist','realnewslike']
|
53 |
else:
|
54 |
config_names = list(ds_configs.keys())
|
|
|
55 |
config_name = st.selectbox(
|
56 |
f"Choose configuration{column_id}:",
|
57 |
config_names,
|
@@ -60,7 +65,8 @@ def sidebar_selection(ds_name_to_dict, column_id):
|
|
60 |
# choose a subset of num_examples
|
61 |
# TODO: Handling for multiple text features
|
62 |
ds_config = ds_configs[config_name]
|
63 |
-
text_features = ds_config[HF_FEATURE_FIELD]["string"]
|
|
|
64 |
# TODO @yacine: Explain what this is doing and why eg tp[0] could = "id"
|
65 |
text_field = st.selectbox(
|
66 |
f"Which text feature from the{column_id} dataset would you like to analyze?",
|
@@ -69,7 +75,8 @@ def sidebar_selection(ds_name_to_dict, column_id):
|
|
69 |
else [tp for tp in text_features if tp[0] != "id"],
|
70 |
)
|
71 |
# Choose a split and dataset size
|
72 |
-
avail_splits = list(ds_config["splits"].keys())
|
|
|
73 |
# 12.Nov note: Removing "test" because those should not be examined
|
74 |
# without discussion of pros and cons, which we haven't done yet.
|
75 |
if "test" in avail_splits:
|
|
|
14 |
|
15 |
import statistics
|
16 |
|
17 |
+
import json
|
18 |
import pandas as pd
|
19 |
import seaborn as sns
|
20 |
import streamlit as st
|
|
|
23 |
from .dataset_utils import HF_DESC_FIELD, HF_FEATURE_FIELD, HF_LABEL_FIELD
|
24 |
st.set_option('deprecation.showPyplotGlobalUse', False)
|
25 |
|
26 |
+
_HAS_CACHE = json.load(open("cache_dir/has_cache.json"))
|
27 |
+
|
28 |
def sidebar_header():
|
29 |
st.sidebar.markdown(
|
30 |
"""
|
|
|
32 |
Right now this has a few pre-loaded datasets for which you can:
|
33 |
- view some general statistics about the text vocabulary, lengths, labels
|
34 |
- explore some distributional statistics to assess properties of the language
|
35 |
+
- view some comparison statistics and overview of the text distribution
|
36 |
+
|
37 |
+
The tool is in development, and will keep growing in utility and functionality 🤗🚧
|
38 |
""",
|
39 |
unsafe_allow_html=True,
|
40 |
)
|
41 |
|
42 |
|
43 |
def sidebar_selection(ds_name_to_dict, column_id):
|
44 |
+
# ds_names = list(ds_name_to_dict.keys())
|
45 |
+
ds_names = list(_HAS_CACHE.keys())
|
46 |
with st.sidebar.expander(f"Choose dataset and field {column_id}", expanded=True):
|
47 |
# choose a dataset to analyze
|
48 |
ds_name = st.selectbox(
|
|
|
56 |
config_names = ['en','en.noblocklist','realnewslike']
|
57 |
else:
|
58 |
config_names = list(ds_configs.keys())
|
59 |
+
config_names = list(_HAS_CACHE[ds_name].keys())
|
60 |
config_name = st.selectbox(
|
61 |
f"Choose configuration{column_id}:",
|
62 |
config_names,
|
|
|
65 |
# choose a subset of num_examples
|
66 |
# TODO: Handling for multiple text features
|
67 |
ds_config = ds_configs[config_name]
|
68 |
+
# text_features = ds_config[HF_FEATURE_FIELD]["string"]
|
69 |
+
text_features = [tuple(text_field.split('-')) for text_field in _HAS_CACHE[ds_name][config_name]]
|
70 |
# TODO @yacine: Explain what this is doing and why eg tp[0] could = "id"
|
71 |
text_field = st.selectbox(
|
72 |
f"Which text feature from the{column_id} dataset would you like to analyze?",
|
|
|
75 |
else [tp for tp in text_features if tp[0] != "id"],
|
76 |
)
|
77 |
# Choose a split and dataset size
|
78 |
+
# avail_splits = list(ds_config["splits"].keys())
|
79 |
+
avail_splits = list(_HAS_CACHE[ds_name][config_name]['-'.join(text_field)].keys())
|
80 |
# 12.Nov note: Removing "test" because those should not be examined
|
81 |
# without discussion of pros and cons, which we haven't done yet.
|
82 |
if "test" in avail_splits:
|