Yacine Jernite committed on
Commit
f4b8e6e
•
1 Parent(s): c500e3c

can only select available splits

Browse files
cache_dir/has_cache.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e7d89146f736ca9852dd82abaa7d29225499d53ca16f7714cfa576915e0a7d7
3
+ size 3584
data_measurements/streamlit_utils.py CHANGED
@@ -14,6 +14,7 @@
14
 
15
  import statistics
16
 
 
17
  import pandas as pd
18
  import seaborn as sns
19
  import streamlit as st
@@ -22,6 +23,8 @@ from st_aggrid import AgGrid, GridOptionsBuilder
22
  from .dataset_utils import HF_DESC_FIELD, HF_FEATURE_FIELD, HF_LABEL_FIELD
23
  st.set_option('deprecation.showPyplotGlobalUse', False)
24
 
 
 
25
  def sidebar_header():
26
  st.sidebar.markdown(
27
  """
@@ -29,16 +32,17 @@ def sidebar_header():
29
  Right now this has a few pre-loaded datasets for which you can:
30
  - view some general statistics about the text vocabulary, lengths, labels
31
  - explore some distributional statistics to assess properties of the language
32
- - view some comparison statistics and overview of the text distribution
33
-
34
- The tool is in development, and will keep growing in utility and functionality 🤗🚧
35
  """,
36
  unsafe_allow_html=True,
37
  )
38
 
39
 
40
  def sidebar_selection(ds_name_to_dict, column_id):
41
- ds_names = list(ds_name_to_dict.keys())
 
42
  with st.sidebar.expander(f"Choose dataset and field {column_id}", expanded=True):
43
  # choose a dataset to analyze
44
  ds_name = st.selectbox(
@@ -52,6 +56,7 @@ def sidebar_selection(ds_name_to_dict, column_id):
52
  config_names = ['en','en.noblocklist','realnewslike']
53
  else:
54
  config_names = list(ds_configs.keys())
 
55
  config_name = st.selectbox(
56
  f"Choose configuration{column_id}:",
57
  config_names,
@@ -60,7 +65,8 @@ def sidebar_selection(ds_name_to_dict, column_id):
60
  # choose a subset of num_examples
61
  # TODO: Handling for multiple text features
62
  ds_config = ds_configs[config_name]
63
- text_features = ds_config[HF_FEATURE_FIELD]["string"]
 
64
  # TODO @yacine: Explain what this is doing and why eg tp[0] could = "id"
65
  text_field = st.selectbox(
66
  f"Which text feature from the{column_id} dataset would you like to analyze?",
@@ -69,7 +75,8 @@ def sidebar_selection(ds_name_to_dict, column_id):
69
  else [tp for tp in text_features if tp[0] != "id"],
70
  )
71
  # Choose a split and dataset size
72
- avail_splits = list(ds_config["splits"].keys())
 
73
  # 12.Nov note: Removing "test" because those should not be examined
74
  # without discussion of pros and cons, which we haven't done yet.
75
  if "test" in avail_splits:
 
14
 
15
  import statistics
16
 
17
+ import json
18
  import pandas as pd
19
  import seaborn as sns
20
  import streamlit as st
 
23
  from .dataset_utils import HF_DESC_FIELD, HF_FEATURE_FIELD, HF_LABEL_FIELD
24
  st.set_option('deprecation.showPyplotGlobalUse', False)
25
 
26
+ _HAS_CACHE = json.load(open("cache_dir/has_cache.json"))
27
+
28
  def sidebar_header():
29
  st.sidebar.markdown(
30
  """
 
32
  Right now this has a few pre-loaded datasets for which you can:
33
  - view some general statistics about the text vocabulary, lengths, labels
34
  - explore some distributional statistics to assess properties of the language
35
+ - view some comparison statistics and overview of the text distribution
36
+
37
+ The tool is in development, and will keep growing in utility and functionality 🤗🚧
38
  """,
39
  unsafe_allow_html=True,
40
  )
41
 
42
 
43
  def sidebar_selection(ds_name_to_dict, column_id):
44
+ # ds_names = list(ds_name_to_dict.keys())
45
+ ds_names = list(_HAS_CACHE.keys())
46
  with st.sidebar.expander(f"Choose dataset and field {column_id}", expanded=True):
47
  # choose a dataset to analyze
48
  ds_name = st.selectbox(
 
56
  config_names = ['en','en.noblocklist','realnewslike']
57
  else:
58
  config_names = list(ds_configs.keys())
59
+ config_names = list(_HAS_CACHE[ds_name].keys())
60
  config_name = st.selectbox(
61
  f"Choose configuration{column_id}:",
62
  config_names,
 
65
  # choose a subset of num_examples
66
  # TODO: Handling for multiple text features
67
  ds_config = ds_configs[config_name]
68
+ # text_features = ds_config[HF_FEATURE_FIELD]["string"]
69
+ text_features = [tuple(text_field.split('-')) for text_field in _HAS_CACHE[ds_name][config_name]]
70
  # TODO @yacine: Explain what this is doing and why eg tp[0] could = "id"
71
  text_field = st.selectbox(
72
  f"Which text feature from the{column_id} dataset would you like to analyze?",
 
75
  else [tp for tp in text_features if tp[0] != "id"],
76
  )
77
  # Choose a split and dataset size
78
+ # avail_splits = list(ds_config["splits"].keys())
79
+ avail_splits = list(_HAS_CACHE[ds_name][config_name]['-'.join(text_field)].keys())
80
  # 12.Nov note: Removing "test" because those should not be examined
81
  # without discussion of pros and cons, which we haven't done yet.
82
  if "test" in avail_splits: