Spaces:
Build error
Build error
Sasha
committed on
Commit
•
abff13d
1
Parent(s):
64ba64c
Changing text lengths plot to a static one, saving to .png
Browse files
data_measurements/dataset_statistics.py
CHANGED
@@ -28,6 +28,8 @@ import plotly.express as px
|
|
28 |
import plotly.figure_factory as ff
|
29 |
import plotly.graph_objects as go
|
30 |
import pyarrow.feather as feather
|
|
|
|
|
31 |
from datasets import load_from_disk
|
32 |
from nltk.corpus import stopwords
|
33 |
from sklearn.feature_extraction.text import CountVectorizer
|
@@ -281,7 +283,7 @@ class DatasetStatisticsCacheClass:
|
|
281 |
# Needed for UI
|
282 |
self.dup_counts_df_fid = pjoin(self.cache_path, "dup_counts_df.feather")
|
283 |
# Needed for UI
|
284 |
-
self.fig_tok_length_fid = pjoin(self.cache_path, "fig_tok_length.
|
285 |
|
286 |
## General text stats
|
287 |
# Needed for UI
|
@@ -363,13 +365,12 @@ class DatasetStatisticsCacheClass:
|
|
363 |
"""
|
364 |
# Text length figure
|
365 |
if (self.use_cache and exists(self.fig_tok_length_fid)):
|
366 |
-
self.fig_tok_length =
|
367 |
else:
|
368 |
if not self.live:
|
369 |
self.prepare_fig_text_lengths()
|
370 |
if save:
|
371 |
-
|
372 |
-
|
373 |
# Text length dataframe
|
374 |
if self.use_cache and exists(self.length_df_fid):
|
375 |
self.length_df = feather.read_feather(self.length_df_fid)
|
@@ -1037,9 +1038,9 @@ def read_plotly(fid):
|
|
1037 |
return fig
|
1038 |
|
1039 |
def make_fig_lengths(tokenized_df, length_field):
|
1040 |
-
fig_tok_length =
|
1041 |
-
|
1042 |
-
)
|
1043 |
return fig_tok_length
|
1044 |
|
1045 |
def make_fig_labels(label_df, label_names, label_field):
|
|
|
28 |
import plotly.figure_factory as ff
|
29 |
import plotly.graph_objects as go
|
30 |
import pyarrow.feather as feather
|
31 |
+
import matplotlib.pyplot as plt
|
32 |
+
import seaborn as sns
|
33 |
from datasets import load_from_disk
|
34 |
from nltk.corpus import stopwords
|
35 |
from sklearn.feature_extraction.text import CountVectorizer
|
|
|
283 |
# Needed for UI
|
284 |
self.dup_counts_df_fid = pjoin(self.cache_path, "dup_counts_df.feather")
|
285 |
# Needed for UI
|
286 |
+
self.fig_tok_length_fid = pjoin(self.cache_path, "fig_tok_length.png")
|
287 |
|
288 |
## General text stats
|
289 |
# Needed for UI
|
|
|
365 |
"""
|
366 |
# Text length figure
|
367 |
if (self.use_cache and exists(self.fig_tok_length_fid)):
|
368 |
+
self.fig_tok_length = plt.imread(self.fig_tok_length_fid)
|
369 |
else:
|
370 |
if not self.live:
|
371 |
self.prepare_fig_text_lengths()
|
372 |
if save:
|
373 |
+
self.fig_tok_length.savefig(self.fig_tok_length_fid)
|
|
|
374 |
# Text length dataframe
|
375 |
if self.use_cache and exists(self.length_df_fid):
|
376 |
self.length_df = feather.read_feather(self.length_df_fid)
|
|
|
1038 |
return fig
|
1039 |
|
1040 |
def make_fig_lengths(tokenized_df, length_field):
|
1041 |
+
fig_tok_length, axs = plt.subplots(figsize=(15, 6), dpi=150)
|
1042 |
+
sns.histplot(data=tokenized_df[length_field], kde=True, bins=100, ax=axs)
|
1043 |
+
sns.rugplot(data=tokenized_df[length_field], ax=axs)
|
1044 |
return fig_tok_length
|
1045 |
|
1046 |
def make_fig_labels(label_df, label_names, label_field):
|
data_measurements/streamlit_utils.py
CHANGED
@@ -167,7 +167,7 @@ def expander_text_lengths(dstats, column_id):
|
|
167 |
st.markdown(
|
168 |
"### Here is the relative frequency of different text lengths in your dataset:"
|
169 |
)
|
170 |
-
st.
|
171 |
st.markdown(
|
172 |
"The average length of text instances is **"
|
173 |
+ str(dstats.avg_length)
|
@@ -187,7 +187,7 @@ def expander_text_lengths(dstats, column_id):
|
|
187 |
# This is quite a large file and is breaking our ability to navigate the app development.
|
188 |
# Just passing if it's not already there for launch v0
|
189 |
if dstats.length_df is not None:
|
190 |
-
st.
|
191 |
|
192 |
|
193 |
### Third, use a sentence embedding model
|
|
|
167 |
st.markdown(
|
168 |
"### Here is the relative frequency of different text lengths in your dataset:"
|
169 |
)
|
170 |
+
st.pyplot(dstats.fig_tok_length, use_container_width=True)
|
171 |
st.markdown(
|
172 |
"The average length of text instances is **"
|
173 |
+ str(dstats.avg_length)
|
|
|
187 |
# This is quite a large file and is breaking our ability to navigate the app development.
|
188 |
# Just passing if it's not already there for launch v0
|
189 |
if dstats.length_df is not None:
|
190 |
+
st.table(dstats.length_df[dstats.length_df["length"] == start_id_show_lengths].set_index("length"))
|
191 |
|
192 |
|
193 |
### Third, use a sentence embedding model
|