meg-huggingface committed
Commit 66693d5 • 1 Parent(s): e1f2cc3
Removing need to keep around base dset for the header widget; now just saving what is shown -- the first n lines of the base dataset -- as a json, and loading if it's cached.
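The change described above replaces an on-demand slice of the in-memory dataset with a small JSON cache: the first n rows are written out once and re-read on later runs, so the full base dataset no longer has to stay loaded just to render the header preview. A minimal standalone sketch of that cache-or-compute pattern (the function and key names here are illustrative, not the repo's API):

import json
from os.path import exists

def load_or_prepare_peek(dset, peek_path, n=100, save=True):
    # Reuse the cached preview if present; otherwise slice the dataset and persist it.
    if exists(peek_path):
        with open(peek_path, "r") as f:
            return json.load(f)["dset_peek"]
    peek = dset[:n]  # slicing a HF Dataset returns a dict of column name -> list of values
    if save:
        # Assumes the peeked columns are JSON-serializable (strings, numbers, lists).
        with open(peek_path, "w") as f:
            json.dump({"dset_peek": peek}, f)
    return peek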
data_measurements/dataset_statistics.py
CHANGED
@@ -185,6 +185,7 @@ class DatasetStatisticsCacheClass:
         self.dset = None  # original dataset
         # HF dataset with all of the self.text_field instances in self.dset
         self.text_dset = None
+        self.dset_peek = None
         # HF dataset with text embeddings in the same order as self.text_dset
         self.embeddings_dset = None
         # HF dataset with all of the self.label_field instances in self.dset
@@ -254,6 +255,7 @@ class DatasetStatisticsCacheClass:
             logs.warning("Creating cache directory %s." % self.cache_path)
             mkdir(self.cache_path)
         self.dset_fid = pjoin(self.cache_path, "base_dset")
+        self.dset_peek_fid = pjoin(self.cache_path, "dset_peek.json")
         self.text_dset_fid = pjoin(self.cache_path, "text_dset")
         self.tokenized_df_fid = pjoin(self.cache_path, "tokenized_df.feather")
         self.label_dset_fid = pjoin(self.cache_path, "label_dset")
@@ -283,10 +285,6 @@ class DatasetStatisticsCacheClass:
             use_streaming=True,
         )
 
-    def get_dataset_peek(self):
-        self.get_base_dataset()
-        return self.dset[:100]
-
     def load_or_prepare_general_stats(self, use_cache=False, save=True):
         """
         Content for expander_general_stats widget.
@@ -462,7 +460,19 @@ class DatasetStatisticsCacheClass:
         self.load_or_prepare_text_dset(use_cache, save)
         logs.info("Doing tokenized dataframe")
        self.load_or_prepare_tokenized_df(use_cache, save)
+        logs.info("Doing dataset peek")
+        self.load_or_prepare_dset_peek(save, use_cache)
 
+    def load_or_prepare_dset_peek(self, save, use_cache):
+        if use_cache and exists(self.dset_peek_fid):
+            with open(self.dset_peek_fid, "r") as f:
+                self.dset_peek = json.load(f)["dset peek"]
+        else:
+            if self.dset is None:
+                self.get_base_dataset()
+            self.dset_peek = self.dset[:100]
+            if save:
+                write_json({"dset_peek": self.dset_peek}, self.dset_peek_fid)
 
     def load_or_prepare_tokenized_df(self, use_cache, save):
         if (use_cache and exists(self.tokenized_df_fid)):
@@ -483,20 +493,23 @@ class DatasetStatisticsCacheClass:
             logs.info(self.text_dset)
         # ...Or load it from the server and store it anew
         else:
-            self.get_base_dataset()
-            # extract all text instances
-            self.text_dset = self.dset.map(
-                lambda examples: extract_field(
-                    examples, self.text_field, OUR_TEXT_FIELD
-                ),
-                batched=True,
-                remove_columns=list(self.dset.features),
-            )
+            self.prepare_text_dset()
             if save:
                 # save extracted text instances
                 logs.warning("Saving dataset to disk")
                 self.text_dset.save_to_disk(self.text_dset_fid)
 
+    def prepare_text_dset(self):
+        self.get_base_dataset()
+        # extract all text instances
+        self.text_dset = self.dset.map(
+            lambda examples: extract_field(
+                examples, self.text_field, OUR_TEXT_FIELD
+            ),
+            batched=True,
+            remove_columns=list(self.dset.features),
+        )
+
     def do_tokenization(self):
         """
         Tokenizes the dataset
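One detail worth noting when reusing this pattern: the cached file only round-trips if the key used on save matches the key used on load (the hunk above reads "dset peek" but writes "dset_peek"). A hedged sketch of a matching read/write pair, where write_json is assumed to be a small helper that dumps a dict to the given path:

import json

DSET_PEEK_KEY = "dset_peek"

def write_json(payload, path):
    # Assumed helper: serialize a dict to a JSON file at `path`.
    with open(path, "w") as f:
        json.dump(payload, f)

def read_dset_peek(path):
    # Mirrors the cache read in load_or_prepare_dset_peek, using the same key as the write.
    with open(path, "r") as f:
        return json.load(f)[DSET_PEEK_KEY]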
data_measurements/streamlit_utils.py
CHANGED
@@ -99,7 +99,7 @@ def expander_header(dstats, ds_name_to_dict, column_id):
     st.markdown(
         ds_name_to_dict[dstats.dset_name][dstats.dset_config][HF_DESC_FIELD]
     )
-    st.dataframe(dstats.get_dataset_peek())
+    st.dataframe(dstats.dset_peek)
 
 
 def expander_general_stats(dstats, column_id):
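With the peek cached, the header widget can render it directly without touching the base dataset. Since the peek is a plain dict of column name to list of values, st.dataframe can display it as-is or via a DataFrame; a small illustrative sketch (the example rows are made up):

import pandas as pd
import streamlit as st

# dstats.dset_peek holds the first rows of the base dataset as {column: [values, ...]}.
dset_peek = {"text": ["first example", "second example"], "label": [0, 1]}
st.dataframe(pd.DataFrame(dset_peek))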