Spaces:
Build error
Build error
meg-huggingface
committed on
Commit
•
335424f
1
Parent(s):
85cf91c
Some additional modularizing and caching of the text lengths widget
Browse files
data_measurements/dataset_statistics.py
CHANGED
@@ -219,6 +219,7 @@ class DatasetStatisticsCacheClass:
|
|
219 |
self.avg_length = None
|
220 |
self.std_length = None
|
221 |
self.general_stats_dict = None
|
|
|
222 |
# clustering text by embeddings
|
223 |
# the hierarchical clustering tree is represented as a list of nodes,
|
224 |
# the first is the root
|
@@ -351,6 +352,7 @@ class DatasetStatisticsCacheClass:
|
|
351 |
self.length_stats_dict = json.load(f)
|
352 |
self.avg_length = self.length_stats_dict["avg length"]
|
353 |
self.std_length = self.length_stats_dict["std length"]
|
|
|
354 |
else:
|
355 |
self.prepare_text_length_stats()
|
356 |
if save:
|
@@ -367,14 +369,16 @@ class DatasetStatisticsCacheClass:
|
|
367 |
)
|
368 |
|
369 |
def prepare_text_length_stats(self):
|
370 |
-
if self.tokenized_df is None or LENGTH_FIELD not in self.tokenized_df.columns:
|
371 |
self.prepare_length_df()
|
372 |
avg_length = sum(self.tokenized_df[LENGTH_FIELD])/len(self.tokenized_df[LENGTH_FIELD])
|
373 |
self.avg_length = round(avg_length, 1)
|
374 |
std_length = statistics.stdev(self.tokenized_df[LENGTH_FIELD])
|
375 |
self.std_length = round(std_length, 1)
|
|
|
376 |
self.length_stats_dict = {"avg length": self.avg_length,
|
377 |
-
"std length": self.std_length
|
|
|
378 |
|
379 |
def prepare_fig_text_lengths(self):
|
380 |
if self.tokenized_df is None or LENGTH_FIELD not in self.tokenized_df.columns:
|
|
|
219 |
self.avg_length = None
|
220 |
self.std_length = None
|
221 |
self.general_stats_dict = None
|
222 |
+
self.num_uniq_lengths = 0
|
223 |
# clustering text by embeddings
|
224 |
# the hierarchical clustering tree is represented as a list of nodes,
|
225 |
# the first is the root
|
|
|
352 |
self.length_stats_dict = json.load(f)
|
353 |
self.avg_length = self.length_stats_dict["avg length"]
|
354 |
self.std_length = self.length_stats_dict["std length"]
|
355 |
+
self.num_uniq_lengths = self.length_stats_dict["num lengths"]
|
356 |
else:
|
357 |
self.prepare_text_length_stats()
|
358 |
if save:
|
|
|
369 |
)
|
370 |
|
371 |
def prepare_text_length_stats(self):
|
372 |
+
if self.tokenized_df is None or LENGTH_FIELD not in self.tokenized_df.columns or self.length_df is None:
|
373 |
self.prepare_length_df()
|
374 |
avg_length = sum(self.tokenized_df[LENGTH_FIELD])/len(self.tokenized_df[LENGTH_FIELD])
|
375 |
self.avg_length = round(avg_length, 1)
|
376 |
std_length = statistics.stdev(self.tokenized_df[LENGTH_FIELD])
|
377 |
self.std_length = round(std_length, 1)
|
378 |
+
self.num_uniq_lengths = len(self.length_df["length"].unique())
|
379 |
self.length_stats_dict = {"avg length": self.avg_length,
|
380 |
+
"std length": self.std_length,
|
381 |
+
"num lengths": self.num_uniq_lengths}
|
382 |
|
383 |
def prepare_fig_text_lengths(self):
|
384 |
if self.tokenized_df is None or LENGTH_FIELD not in self.tokenized_df.columns:
|
data_measurements/streamlit_utils.py
CHANGED
@@ -147,9 +147,7 @@ def expander_label_distribution(fig_labels, column_id):
|
|
147 |
st.markdown("No labels were found in the dataset")
|
148 |
|
149 |
|
150 |
-
def expander_text_lengths(dstats,
|
151 |
-
column_id,
|
152 |
-
):
|
153 |
_TEXT_LENGTH_CAPTION = (
|
154 |
"Use this widget to identify outliers, particularly suspiciously long outliers."
|
155 |
)
|
@@ -176,7 +174,7 @@ def expander_text_lengths(dstats,
|
|
176 |
start_id_show_lengths = st.slider(
|
177 |
f"Show the shortest sentences{column_id} starting at:",
|
178 |
0,
|
179 |
-
|
180 |
value=0,
|
181 |
step=1,
|
182 |
)
|
|
|
147 |
st.markdown("No labels were found in the dataset")
|
148 |
|
149 |
|
150 |
+
def expander_text_lengths(dstats, column_id):
|
|
|
|
|
151 |
_TEXT_LENGTH_CAPTION = (
|
152 |
"Use this widget to identify outliers, particularly suspiciously long outliers."
|
153 |
)
|
|
|
174 |
start_id_show_lengths = st.slider(
|
175 |
f"Show the shortest sentences{column_id} starting at:",
|
176 |
0,
|
177 |
+
dstats.num_uniq_lengths,
|
178 |
value=0,
|
179 |
step=1,
|
180 |
)
|