Spaces:
Build error
Build error
meg-huggingface
committed on
Commit
•
e1f2cc3
1
Parent(s):
6af9ef6
Removing any need for a dataframe in expander_general_stats; instead making sure to cache and load the small amount of details needed for this widget. Note I also moved around a couple functions -- same content, just moved -- so that it was easier for me to navigate through the code. I also pulled out a couple of sub-functions from larger functions, again to make the code easier to work with/understand, as well as helping to further modularize so we can limit what needs to be cached.
Browse files
- app.py +2 -3
- data_measurements/dataset_statistics.py +120 -83
- data_measurements/dataset_utils.py +2 -0
- data_measurements/streamlit_utils.py +17 -15
app.py
CHANGED
@@ -143,7 +143,7 @@ def show_column(dstats, ds_name_to_dict, show_embeddings, column_id, use_cache=T
|
|
143 |
logs.info("showing header")
|
144 |
st_utils.expander_header(dstats, ds_name_to_dict, column_id)
|
145 |
logs.info("showing general stats")
|
146 |
-
st_utils.expander_general_stats(dstats,
|
147 |
st_utils.expander_label_distribution(dstats.label_df, dstats.fig_labels, column_id)
|
148 |
st_utils.expander_text_lengths(
|
149 |
dstats.tokenized_df,
|
@@ -154,7 +154,7 @@ def show_column(dstats, ds_name_to_dict, show_embeddings, column_id, use_cache=T
|
|
154 |
LENGTH_FIELD,
|
155 |
column_id,
|
156 |
)
|
157 |
-
st_utils.expander_text_duplicates(dstats
|
158 |
|
159 |
# We do the loading of these after the others in order to have some time
|
160 |
# to compute while the user works with the details above.
|
@@ -191,7 +191,6 @@ def main():
|
|
191 |
|
192 |
# When not doing new development, use the cache.
|
193 |
use_cache = True
|
194 |
-
# TODO: Better handling of this eg, st.sidebar.checkbox("Show clustering")=
|
195 |
show_embeddings = st.sidebar.checkbox("Show embeddings")
|
196 |
# List of datasets for which embeddings are hard to compute:
|
197 |
|
|
|
143 |
logs.info("showing header")
|
144 |
st_utils.expander_header(dstats, ds_name_to_dict, column_id)
|
145 |
logs.info("showing general stats")
|
146 |
+
st_utils.expander_general_stats(dstats, column_id)
|
147 |
st_utils.expander_label_distribution(dstats.label_df, dstats.fig_labels, column_id)
|
148 |
st_utils.expander_text_lengths(
|
149 |
dstats.tokenized_df,
|
|
|
154 |
LENGTH_FIELD,
|
155 |
column_id,
|
156 |
)
|
157 |
+
st_utils.expander_text_duplicates(dstats, column_id)
|
158 |
|
159 |
# We do the loading of these after the others in order to have some time
|
160 |
# to compute while the user works with the details above.
|
|
|
191 |
|
192 |
# When not doing new development, use the cache.
|
193 |
use_cache = True
|
|
|
194 |
show_embeddings = st.sidebar.checkbox("Show embeddings")
|
195 |
# List of datasets for which embeddings are hard to compute:
|
196 |
|
data_measurements/dataset_statistics.py
CHANGED
@@ -33,6 +33,8 @@ from nltk.corpus import stopwords
|
|
33 |
from sklearn.feature_extraction.text import CountVectorizer
|
34 |
|
35 |
from .dataset_utils import (
|
|
|
|
|
36 |
CNT,
|
37 |
DEDUP_TOT,
|
38 |
EMBEDDING_FIELD,
|
@@ -143,13 +145,9 @@ _TREE_MIN_NODES = 250
|
|
143 |
# as long as we're using sklearn - already pushing the resources
|
144 |
_MAX_CLUSTER_EXAMPLES = 5000
|
145 |
_NUM_VOCAB_BATCHES = 2000
|
146 |
-
|
147 |
-
|
148 |
_CVEC = CountVectorizer(token_pattern="(?u)\\b\\w+\\b", lowercase=True)
|
149 |
|
150 |
-
num_rows = 200000
|
151 |
-
|
152 |
-
|
153 |
class DatasetStatisticsCacheClass:
|
154 |
def __init__(
|
155 |
self,
|
@@ -193,7 +191,7 @@ class DatasetStatisticsCacheClass:
|
|
193 |
self.label_dset = None
|
194 |
## Data frames
|
195 |
# Tokenized text
|
196 |
-
self.tokenized_df =
|
197 |
# save sentence length histogram in the class so it doesn't get re-computed
|
198 |
self.fig_tok_length = None
|
199 |
# Data Frame version of self.label_dset
|
@@ -205,12 +203,14 @@ class DatasetStatisticsCacheClass:
|
|
205 |
# Vocabulary filtered to remove stopwords
|
206 |
self.vocab_counts_filtered_df = None
|
207 |
## General statistics and duplicates
|
|
|
|
|
208 |
# Number of NaN values (NOT empty strings)
|
209 |
self.text_nan_count = 0
|
210 |
# Number of text items that appear more than once in the dataset
|
211 |
self.dedup_total = 0
|
212 |
# Duplicated text items along with their number of occurrences ("count")
|
213 |
-
self.
|
214 |
self.avg_length = None
|
215 |
self.std_length = None
|
216 |
self.general_stats_dict = None
|
@@ -258,10 +258,12 @@ class DatasetStatisticsCacheClass:
|
|
258 |
self.tokenized_df_fid = pjoin(self.cache_path, "tokenized_df.feather")
|
259 |
self.label_dset_fid = pjoin(self.cache_path, "label_dset")
|
260 |
self.vocab_counts_df_fid = pjoin(self.cache_path, "vocab_counts.feather")
|
261 |
-
self.general_stats_fid = pjoin(self.cache_path, "
|
262 |
-
self.
|
263 |
-
self.cache_path, "
|
264 |
)
|
|
|
|
|
265 |
self.fig_tok_length_fid = pjoin(self.cache_path, "fig_tok_length.json")
|
266 |
self.fig_labels_fid = pjoin(self.cache_path, "fig_labels.json")
|
267 |
self.node_list_fid = pjoin(self.cache_path, "node_list.th")
|
@@ -285,38 +287,47 @@ class DatasetStatisticsCacheClass:
|
|
285 |
self.get_base_dataset()
|
286 |
return self.dset[:100]
|
287 |
|
288 |
-
def load_or_prepare_general_stats(self, use_cache=False):
|
289 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
290 |
|
291 |
-
|
|
|
|
|
292 |
# General statistics
|
293 |
if (
|
294 |
use_cache
|
295 |
and exists(self.general_stats_fid)
|
296 |
-
and exists(self.
|
|
|
297 |
):
|
298 |
-
|
299 |
-
|
300 |
-
)
|
301 |
else:
|
302 |
-
(
|
303 |
-
|
304 |
-
|
305 |
-
self.
|
306 |
-
|
307 |
-
|
308 |
-
|
309 |
-
|
310 |
-
|
311 |
-
|
312 |
-
|
|
|
|
|
313 |
|
314 |
def load_or_prepare_text_lengths(self, use_cache=False, save=True):
|
315 |
# TODO: Everything here can be read from cache; it's in a transitory
|
316 |
# state atm where just the fig is cached. Clean up.
|
317 |
if use_cache and exists(self.fig_tok_length_fid):
|
318 |
self.fig_tok_length = read_plotly(self.fig_tok_length_fid)
|
319 |
-
if
|
320 |
self.tokenized_df = self.do_tokenization()
|
321 |
self.tokenized_df[LENGTH_FIELD] = self.tokenized_df[TOKENIZED_FIELD].apply(len)
|
322 |
self.avg_length = round(
|
@@ -385,56 +396,54 @@ class DatasetStatisticsCacheClass:
|
|
385 |
logs.info("filtered vocab")
|
386 |
logs.info(self.vocab_counts_filtered_df)
|
387 |
|
388 |
-
def
|
389 |
-
|
390 |
-
|
391 |
-
|
392 |
-
|
393 |
-
# TODO: Current UI only uses the fig, meaning the self.z here is irrelevant
|
394 |
-
# when only reading from cache. Either the UI should use it, or it should
|
395 |
-
# be removed when reading in cache
|
396 |
-
if use_cache and exists(self.zipf_fig_fid) and exists(self.zipf_fid):
|
397 |
-
with open(self.zipf_fid, "r") as f:
|
398 |
-
zipf_dict = json.load(f)
|
399 |
-
self.z = Zipf()
|
400 |
-
self.z.load(zipf_dict)
|
401 |
-
self.zipf_fig = read_plotly(self.zipf_fig_fid)
|
402 |
-
elif use_cache and exists(self.zipf_fid):
|
403 |
-
# TODO: Read zipf data so that the vocab is there.
|
404 |
-
with open(self.zipf_fid, "r") as f:
|
405 |
-
zipf_dict = json.load(f)
|
406 |
-
self.z = Zipf()
|
407 |
-
self.z.load(zipf_dict)
|
408 |
-
self.zipf_fig = make_zipf_fig(self.vocab_counts_df, self.z)
|
409 |
-
if save:
|
410 |
-
write_plotly(self.zipf_fig, self.zipf_fig_fid)
|
411 |
-
else:
|
412 |
-
self.z = Zipf(self.vocab_counts_df)
|
413 |
-
self.zipf_fig = make_zipf_fig(self.vocab_counts_df, self.z)
|
414 |
-
if save:
|
415 |
-
write_zipf_data(self.z, self.zipf_fid)
|
416 |
-
write_plotly(self.zipf_fig, self.zipf_fig_fid)
|
417 |
|
418 |
-
def
|
419 |
-
|
420 |
-
|
421 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
422 |
dup_df.pivot_table(
|
423 |
-
columns=[
|
424 |
).sort_values(ascending=False),
|
425 |
columns=[CNT],
|
426 |
)
|
427 |
-
|
428 |
-
|
429 |
-
dedup_total = sum(
|
430 |
-
|
431 |
-
|
432 |
-
|
433 |
-
|
434 |
-
|
435 |
-
|
436 |
-
with open(text_duplicate_counts_df_fid, "rb") as f:
|
437 |
-
self.text_dup_counts_df = feather.read_feather(f)
|
438 |
|
439 |
def load_or_prepare_dataset(self, use_cache=True, save=True):
|
440 |
"""
|
@@ -449,20 +458,24 @@ class DatasetStatisticsCacheClass:
|
|
449 |
Returns:
|
450 |
|
451 |
"""
|
452 |
-
|
453 |
-
self.
|
|
|
|
|
454 |
|
455 |
-
|
|
|
456 |
if (use_cache and exists(self.tokenized_df_fid)):
|
457 |
self.tokenized_df = feather.read_feather(self.tokenized_df_fid)
|
458 |
else:
|
459 |
# tokenize all text instances
|
460 |
self.tokenized_df = self.do_tokenization()
|
461 |
if save:
|
|
|
462 |
# save tokenized text
|
463 |
write_df(self.tokenized_df, self.tokenized_df_fid)
|
464 |
|
465 |
-
def load_or_prepare_text_dset(self,
|
466 |
if (use_cache and exists(self.text_dset_fid)):
|
467 |
# load extracted text
|
468 |
self.text_dset = load_from_disk(self.text_dset_fid)
|
@@ -557,11 +570,35 @@ class DatasetStatisticsCacheClass:
|
|
557 |
self.label_dset.save_to_disk(self.label_dset_fid)
|
558 |
write_plotly(self.fig_labels, self.fig_labels_fid)
|
559 |
|
560 |
-
def
|
561 |
-
|
562 |
-
|
563 |
-
|
564 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
565 |
|
566 |
def _set_idx_col_names(self, input_vocab_df):
|
567 |
if input_vocab_df.index.name != VOCAB and VOCAB in input_vocab_df.columns:
|
|
|
33 |
from sklearn.feature_extraction.text import CountVectorizer
|
34 |
|
35 |
from .dataset_utils import (
|
36 |
+
TOT_WORDS,
|
37 |
+
TOT_OPEN_WORDS,
|
38 |
CNT,
|
39 |
DEDUP_TOT,
|
40 |
EMBEDDING_FIELD,
|
|
|
145 |
# as long as we're using sklearn - already pushing the resources
|
146 |
_MAX_CLUSTER_EXAMPLES = 5000
|
147 |
_NUM_VOCAB_BATCHES = 2000
|
148 |
+
_TOP_N = 100
|
|
|
149 |
_CVEC = CountVectorizer(token_pattern="(?u)\\b\\w+\\b", lowercase=True)
|
150 |
|
|
|
|
|
|
|
151 |
class DatasetStatisticsCacheClass:
|
152 |
def __init__(
|
153 |
self,
|
|
|
191 |
self.label_dset = None
|
192 |
## Data frames
|
193 |
# Tokenized text
|
194 |
+
self.tokenized_df = None
|
195 |
# save sentence length histogram in the class so it doesn't get re-computed
|
196 |
self.fig_tok_length = None
|
197 |
# Data Frame version of self.label_dset
|
|
|
203 |
# Vocabulary filtered to remove stopwords
|
204 |
self.vocab_counts_filtered_df = None
|
205 |
## General statistics and duplicates
|
206 |
+
self.total_words = 0
|
207 |
+
self.total_open_words = 0
|
208 |
# Number of NaN values (NOT empty strings)
|
209 |
self.text_nan_count = 0
|
210 |
# Number of text items that appear more than once in the dataset
|
211 |
self.dedup_total = 0
|
212 |
# Duplicated text items along with their number of occurrences ("count")
|
213 |
+
self.dup_counts_df = None
|
214 |
self.avg_length = None
|
215 |
self.std_length = None
|
216 |
self.general_stats_dict = None
|
|
|
258 |
self.tokenized_df_fid = pjoin(self.cache_path, "tokenized_df.feather")
|
259 |
self.label_dset_fid = pjoin(self.cache_path, "label_dset")
|
260 |
self.vocab_counts_df_fid = pjoin(self.cache_path, "vocab_counts.feather")
|
261 |
+
self.general_stats_fid = pjoin(self.cache_path, "general_stats_dict.json")
|
262 |
+
self.dup_counts_df_fid = pjoin(
|
263 |
+
self.cache_path, "dup_counts_df.feather"
|
264 |
)
|
265 |
+
self.sorted_top_vocab_df_fid = pjoin(self.cache_path,
|
266 |
+
"sorted_top_vocab.feather")
|
267 |
self.fig_tok_length_fid = pjoin(self.cache_path, "fig_tok_length.json")
|
268 |
self.fig_labels_fid = pjoin(self.cache_path, "fig_labels.json")
|
269 |
self.node_list_fid = pjoin(self.cache_path, "node_list.th")
|
|
|
287 |
self.get_base_dataset()
|
288 |
return self.dset[:100]
|
289 |
|
290 |
+
def load_or_prepare_general_stats(self, use_cache=False, save=True):
|
291 |
+
"""
|
292 |
+
Content for expander_general_stats widget.
|
293 |
+
Provides statistics for total words, total open words,
|
294 |
+
the sorted top vocab, the NaN count, and the duplicate count.
|
295 |
+
Args:
|
296 |
+
use_cache:
|
297 |
|
298 |
+
Returns:
|
299 |
+
|
300 |
+
"""
|
301 |
# General statistics
|
302 |
if (
|
303 |
use_cache
|
304 |
and exists(self.general_stats_fid)
|
305 |
+
and exists(self.dup_counts_df_fid)
|
306 |
+
and exists(self.sorted_top_vocab_df_fid)
|
307 |
):
|
308 |
+
print('Loading cached general stats')
|
309 |
+
self.load_general_stats()
|
|
|
310 |
else:
|
311 |
+
print('Preparing general stats')
|
312 |
+
self.prepare_general_stats()
|
313 |
+
if save:
|
314 |
+
print(self.sorted_top_vocab_df)
|
315 |
+
print(self.sorted_top_vocab_df_fid)
|
316 |
+
write_df(self.sorted_top_vocab_df, self.sorted_top_vocab_df_fid)
|
317 |
+
print(self.dup_counts_df)
|
318 |
+
print(self.dup_counts_df_fid)
|
319 |
+
write_df(self.dup_counts_df, self.dup_counts_df_fid)
|
320 |
+
print(self.general_stats_dict)
|
321 |
+
print(self.general_stats_fid)
|
322 |
+
write_json(self.general_stats_dict, self.general_stats_fid)
|
323 |
+
|
324 |
|
325 |
def load_or_prepare_text_lengths(self, use_cache=False, save=True):
|
326 |
# TODO: Everything here can be read from cache; it's in a transitory
|
327 |
# state atm where just the fig is cached. Clean up.
|
328 |
if use_cache and exists(self.fig_tok_length_fid):
|
329 |
self.fig_tok_length = read_plotly(self.fig_tok_length_fid)
|
330 |
+
if self.tokenized_df is None:
|
331 |
self.tokenized_df = self.do_tokenization()
|
332 |
self.tokenized_df[LENGTH_FIELD] = self.tokenized_df[TOKENIZED_FIELD].apply(len)
|
333 |
self.avg_length = round(
|
|
|
396 |
logs.info("filtered vocab")
|
397 |
logs.info(self.vocab_counts_filtered_df)
|
398 |
|
399 |
+
def load_vocab(self):
|
400 |
+
with open(self.vocab_counts_df_fid, "rb") as f:
|
401 |
+
self.vocab_counts_df = feather.read_feather(f)
|
402 |
+
# Handling for changes in how the index is saved.
|
403 |
+
self.vocab_counts_df = self._set_idx_col_names(self.vocab_counts_df)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
404 |
|
405 |
+
def load_general_stats(self):
|
406 |
+
self.general_stats_dict = json.load(open(self.general_stats_fid, encoding="utf-8"))
|
407 |
+
with open(self.dup_counts_df_fid, "rb") as f:
|
408 |
+
self.dup_counts_df = feather.read_feather(f)
|
409 |
+
with open(self.sorted_top_vocab_df_fid, "rb") as f:
|
410 |
+
self.sorted_top_vocab_df = feather.read_feather(f)
|
411 |
+
self.text_nan_count = self.general_stats_dict[TEXT_NAN_CNT]
|
412 |
+
self.dedup_total = self.general_stats_dict[DEDUP_TOT]
|
413 |
+
self.total_words = self.general_stats_dict[TOT_WORDS]
|
414 |
+
self.total_open_words = self.general_stats_dict[TOT_OPEN_WORDS]
|
415 |
+
|
416 |
+
def prepare_general_stats(self):
|
417 |
+
if self.tokenized_df is None:
|
418 |
+
logs.warning("Tokenized dataset not yet loaded; doing so.")
|
419 |
+
self.load_or_prepare_dataset()
|
420 |
+
if self.vocab_counts_df is None:
|
421 |
+
logs.warning("Vocab not yet loaded; doing so.")
|
422 |
+
self.load_or_prepare_vocab()
|
423 |
+
self.sorted_top_vocab_df = self.vocab_counts_filtered_df.sort_values(
|
424 |
+
"count", ascending=False
|
425 |
+
).head(_TOP_N)
|
426 |
+
print('basics')
|
427 |
+
self.total_words = len(self.vocab_counts_df)
|
428 |
+
self.total_open_words = len(self.vocab_counts_filtered_df)
|
429 |
+
self.text_nan_count = int(self.tokenized_df.isnull().sum().sum())
|
430 |
+
dup_df = self.tokenized_df[self.tokenized_df.duplicated([OUR_TEXT_FIELD])]
|
431 |
+
print('dup df')
|
432 |
+
self.dup_counts_df = pd.DataFrame(
|
433 |
dup_df.pivot_table(
|
434 |
+
columns=[OUR_TEXT_FIELD], aggfunc="size"
|
435 |
).sort_values(ascending=False),
|
436 |
columns=[CNT],
|
437 |
)
|
438 |
+
print('deddup df')
|
439 |
+
self.dup_counts_df[OUR_TEXT_FIELD] = self.dup_counts_df.index.copy()
|
440 |
+
self.dedup_total = sum(self.dup_counts_df[CNT])
|
441 |
+
self.general_stats_dict = {
|
442 |
+
TOT_WORDS: self.total_words,
|
443 |
+
TOT_OPEN_WORDS: self.total_open_words,
|
444 |
+
TEXT_NAN_CNT: self.text_nan_count,
|
445 |
+
DEDUP_TOT: self.dedup_total,
|
446 |
+
}
|
|
|
|
|
447 |
|
448 |
def load_or_prepare_dataset(self, use_cache=True, save=True):
|
449 |
"""
|
|
|
458 |
Returns:
|
459 |
|
460 |
"""
|
461 |
+
logs.info("Doing text dset.")
|
462 |
+
self.load_or_prepare_text_dset(use_cache, save)
|
463 |
+
logs.info("Doing tokenized dataframe")
|
464 |
+
self.load_or_prepare_tokenized_df(use_cache, save)
|
465 |
|
466 |
+
|
467 |
+
def load_or_prepare_tokenized_df(self, use_cache, save):
|
468 |
if (use_cache and exists(self.tokenized_df_fid)):
|
469 |
self.tokenized_df = feather.read_feather(self.tokenized_df_fid)
|
470 |
else:
|
471 |
# tokenize all text instances
|
472 |
self.tokenized_df = self.do_tokenization()
|
473 |
if save:
|
474 |
+
logs.warning("Saving tokenized dataset to disk")
|
475 |
# save tokenized text
|
476 |
write_df(self.tokenized_df, self.tokenized_df_fid)
|
477 |
|
478 |
+
def load_or_prepare_text_dset(self, use_cache, save):
|
479 |
if (use_cache and exists(self.text_dset_fid)):
|
480 |
# load extracted text
|
481 |
self.text_dset = load_from_disk(self.text_dset_fid)
|
|
|
570 |
self.label_dset.save_to_disk(self.label_dset_fid)
|
571 |
write_plotly(self.fig_labels, self.fig_labels_fid)
|
572 |
|
573 |
+
def load_or_prepare_npmi_terms(self, use_cache=False):
|
574 |
+
self.npmi_stats = nPMIStatisticsCacheClass(self, use_cache=use_cache)
|
575 |
+
self.npmi_stats.load_or_prepare_npmi_terms()
|
576 |
+
|
577 |
+
def load_or_prepare_zipf(self, use_cache=False, save=True):
|
578 |
+
# TODO: Current UI only uses the fig, meaning the self.z here is irrelevant
|
579 |
+
# when only reading from cache. Either the UI should use it, or it should
|
580 |
+
# be removed when reading in cache
|
581 |
+
if use_cache and exists(self.zipf_fig_fid) and exists(self.zipf_fid):
|
582 |
+
with open(self.zipf_fid, "r") as f:
|
583 |
+
zipf_dict = json.load(f)
|
584 |
+
self.z = Zipf()
|
585 |
+
self.z.load(zipf_dict)
|
586 |
+
self.zipf_fig = read_plotly(self.zipf_fig_fid)
|
587 |
+
elif use_cache and exists(self.zipf_fid):
|
588 |
+
# TODO: Read zipf data so that the vocab is there.
|
589 |
+
with open(self.zipf_fid, "r") as f:
|
590 |
+
zipf_dict = json.load(f)
|
591 |
+
self.z = Zipf()
|
592 |
+
self.z.load(zipf_dict)
|
593 |
+
self.zipf_fig = make_zipf_fig(self.vocab_counts_df, self.z)
|
594 |
+
if save:
|
595 |
+
write_plotly(self.zipf_fig, self.zipf_fig_fid)
|
596 |
+
else:
|
597 |
+
self.z = Zipf(self.vocab_counts_df)
|
598 |
+
self.zipf_fig = make_zipf_fig(self.vocab_counts_df, self.z)
|
599 |
+
if save:
|
600 |
+
write_zipf_data(self.z, self.zipf_fid)
|
601 |
+
write_plotly(self.zipf_fig, self.zipf_fig_fid)
|
602 |
|
603 |
def _set_idx_col_names(self, input_vocab_df):
|
604 |
if input_vocab_df.index.name != VOCAB and VOCAB in input_vocab_df.columns:
|
data_measurements/dataset_utils.py
CHANGED
@@ -43,6 +43,8 @@ PROP = "proportion"
|
|
43 |
TEXT_NAN_CNT = "text_nan_count"
|
44 |
TXT_LEN = "text lengths"
|
45 |
DEDUP_TOT = "dedup_total"
|
|
|
|
|
46 |
|
47 |
_DATASET_LIST = [
|
48 |
"c4",
|
|
|
43 |
TEXT_NAN_CNT = "text_nan_count"
|
44 |
TXT_LEN = "text lengths"
|
45 |
DEDUP_TOT = "dedup_total"
|
46 |
+
TOT_WORDS = "total words"
|
47 |
+
TOT_OPEN_WORDS = "total open words"
|
48 |
|
49 |
_DATASET_LIST = [
|
50 |
"c4",
|
data_measurements/streamlit_utils.py
CHANGED
@@ -102,32 +102,34 @@ def expander_header(dstats, ds_name_to_dict, column_id):
|
|
102 |
st.dataframe(dstats.get_dataset_peek())
|
103 |
|
104 |
|
105 |
-
def expander_general_stats(dstats,
|
106 |
with st.expander(f"General Text Statistics{column_id}"):
|
107 |
st.caption(
|
108 |
-
"Use this widget to check whether the terms you see most represented
|
|
|
109 |
)
|
110 |
st.markdown(
|
111 |
-
"There are {0} total words".format(str(
|
112 |
)
|
113 |
st.markdown(
|
114 |
"There are {0} words after removing closed "
|
115 |
-
"class words".format(str(
|
116 |
)
|
117 |
-
sorted_top_vocab_df = dstats.vocab_counts_filtered_df.sort_values(
|
118 |
-
"count", ascending=False
|
119 |
-
).head(top_n)
|
120 |
st.markdown(
|
121 |
-
"The most common
|
|
|
|
|
122 |
)
|
123 |
-
st.dataframe(sorted_top_vocab_df)
|
124 |
st.markdown(
|
125 |
"There are {0} missing values in the dataset.".format(
|
126 |
str(dstats.text_nan_count)
|
127 |
)
|
128 |
)
|
129 |
st.markdown(
|
130 |
-
"There are {0} duplicate items in the dataset.
|
|
|
|
|
131 |
str(dstats.dedup_total)
|
132 |
)
|
133 |
)
|
@@ -269,7 +271,8 @@ def expander_text_embeddings(
|
|
269 |
|
270 |
|
271 |
### Then, show duplicates
|
272 |
-
def expander_text_duplicates(
|
|
|
273 |
with st.expander(f"Text Duplicates{column_id}", expanded=False):
|
274 |
st.caption(
|
275 |
"Use this widget to identify text strings that appear more than once."
|
@@ -277,16 +280,15 @@ def expander_text_duplicates(dedup_df, column_id):
|
|
277 |
st.markdown(
|
278 |
"A model's training and testing may be negatively affected by unwarranted duplicates ([Lee et al., 2021](https://arxiv.org/abs/2107.06499))."
|
279 |
)
|
280 |
-
dedup_df["count"] = dedup_df["count"] + 1
|
281 |
st.markdown("------")
|
282 |
st.write(
|
283 |
"### Here is the list of all the duplicated items and their counts in your dataset:"
|
284 |
)
|
285 |
# Eh...adding 1 because otherwise it looks too weird for duplicate counts when the value is just 1.
|
286 |
-
if len(
|
287 |
st.write("There are no duplicates in this dataset! 🥳")
|
288 |
else:
|
289 |
-
gb = GridOptionsBuilder.from_dataframe(
|
290 |
gb.configure_column(
|
291 |
f"text{column_id}",
|
292 |
wrapText=True,
|
@@ -296,7 +298,7 @@ def expander_text_duplicates(dedup_df, column_id):
|
|
296 |
use_container_width=True,
|
297 |
)
|
298 |
go = gb.build()
|
299 |
-
AgGrid(
|
300 |
|
301 |
|
302 |
def expander_npmi_description(min_vocab):
|
|
|
102 |
st.dataframe(dstats.get_dataset_peek())
|
103 |
|
104 |
|
105 |
+
def expander_general_stats(dstats, column_id):
|
106 |
with st.expander(f"General Text Statistics{column_id}"):
|
107 |
st.caption(
|
108 |
+
"Use this widget to check whether the terms you see most represented"
|
109 |
+
" in the dataset make sense for the goals of the dataset."
|
110 |
)
|
111 |
st.markdown(
|
112 |
+
"There are {0} total words".format(str(dstats.total_words))
|
113 |
)
|
114 |
st.markdown(
|
115 |
"There are {0} words after removing closed "
|
116 |
+
"class words".format(str(dstats.total_open_words))
|
117 |
)
|
|
|
|
|
|
|
118 |
st.markdown(
|
119 |
+
"The most common "
|
120 |
+
"[open class words](https://dictionary.apa.org/open-class-words) "
|
121 |
+
"and their counts are: "
|
122 |
)
|
123 |
+
st.dataframe(dstats.sorted_top_vocab_df)
|
124 |
st.markdown(
|
125 |
"There are {0} missing values in the dataset.".format(
|
126 |
str(dstats.text_nan_count)
|
127 |
)
|
128 |
)
|
129 |
st.markdown(
|
130 |
+
"There are {0} duplicate items in the dataset. "
|
131 |
+
"For more information about the duplicates, "
|
132 |
+
"click the 'Duplicates' tab below.".format(
|
133 |
str(dstats.dedup_total)
|
134 |
)
|
135 |
)
|
|
|
271 |
|
272 |
|
273 |
### Then, show duplicates
|
274 |
+
def expander_text_duplicates(dstats, column_id):
|
275 |
+
# TODO: Saving/loading figure
|
276 |
with st.expander(f"Text Duplicates{column_id}", expanded=False):
|
277 |
st.caption(
|
278 |
"Use this widget to identify text strings that appear more than once."
|
|
|
280 |
st.markdown(
|
281 |
"A model's training and testing may be negatively affected by unwarranted duplicates ([Lee et al., 2021](https://arxiv.org/abs/2107.06499))."
|
282 |
)
|
|
|
283 |
st.markdown("------")
|
284 |
st.write(
|
285 |
"### Here is the list of all the duplicated items and their counts in your dataset:"
|
286 |
)
|
287 |
# Eh...adding 1 because otherwise it looks too weird for duplicate counts when the value is just 1.
|
288 |
+
if len(dstats.dup_counts_df) == 0:
|
289 |
st.write("There are no duplicates in this dataset! 🥳")
|
290 |
else:
|
291 |
+
gb = GridOptionsBuilder.from_dataframe(dstats.dup_counts_df)
|
292 |
gb.configure_column(
|
293 |
f"text{column_id}",
|
294 |
wrapText=True,
|
|
|
298 |
use_container_width=True,
|
299 |
)
|
300 |
go = gb.build()
|
301 |
+
AgGrid(dstats.dup_counts_df, gridOptions=go)
|
302 |
|
303 |
|
304 |
def expander_npmi_description(min_vocab):
|