Sasha committed on
Commit
abff13d
1 Parent(s): 64ba64c

Changing text lengths plot to a static one, saving to .png

Browse files
data_measurements/dataset_statistics.py CHANGED
@@ -28,6 +28,8 @@ import plotly.express as px
28
  import plotly.figure_factory as ff
29
  import plotly.graph_objects as go
30
  import pyarrow.feather as feather
 
 
31
  from datasets import load_from_disk
32
  from nltk.corpus import stopwords
33
  from sklearn.feature_extraction.text import CountVectorizer
@@ -281,7 +283,7 @@ class DatasetStatisticsCacheClass:
281
  # Needed for UI
282
  self.dup_counts_df_fid = pjoin(self.cache_path, "dup_counts_df.feather")
283
  # Needed for UI
284
- self.fig_tok_length_fid = pjoin(self.cache_path, "fig_tok_length.json")
285
 
286
  ## General text stats
287
  # Needed for UI
@@ -363,13 +365,12 @@ class DatasetStatisticsCacheClass:
363
  """
364
  # Text length figure
365
  if (self.use_cache and exists(self.fig_tok_length_fid)):
366
- self.fig_tok_length = read_plotly(self.fig_tok_length_fid)
367
  else:
368
  if not self.live:
369
  self.prepare_fig_text_lengths()
370
  if save:
371
- write_plotly(self.fig_tok_length, self.fig_tok_length_fid)
372
-
373
  # Text length dataframe
374
  if self.use_cache and exists(self.length_df_fid):
375
  self.length_df = feather.read_feather(self.length_df_fid)
@@ -1037,9 +1038,9 @@ def read_plotly(fid):
1037
  return fig
1038
 
1039
  def make_fig_lengths(tokenized_df, length_field):
1040
- fig_tok_length = px.histogram(
1041
- tokenized_df, x=length_field, marginal="rug", hover_data=[length_field]
1042
- )
1043
  return fig_tok_length
1044
 
1045
  def make_fig_labels(label_df, label_names, label_field):
 
28
  import plotly.figure_factory as ff
29
  import plotly.graph_objects as go
30
  import pyarrow.feather as feather
31
+ import matplotlib.pyplot as plt
32
+ import seaborn as sns
33
  from datasets import load_from_disk
34
  from nltk.corpus import stopwords
35
  from sklearn.feature_extraction.text import CountVectorizer
 
283
  # Needed for UI
284
  self.dup_counts_df_fid = pjoin(self.cache_path, "dup_counts_df.feather")
285
  # Needed for UI
286
+ self.fig_tok_length_fid = pjoin(self.cache_path, "fig_tok_length.png")
287
 
288
  ## General text stats
289
  # Needed for UI
 
365
  """
366
  # Text length figure
367
  if (self.use_cache and exists(self.fig_tok_length_fid)):
368
+ self.fig_tok_length = plt.imread(self.fig_tok_length_fid)
369
  else:
370
  if not self.live:
371
  self.prepare_fig_text_lengths()
372
  if save:
373
+ self.fig_tok_length.savefig(self.fig_tok_length_fid)
 
374
  # Text length dataframe
375
  if self.use_cache and exists(self.length_df_fid):
376
  self.length_df = feather.read_feather(self.length_df_fid)
 
1038
  return fig
1039
 
1040
  def make_fig_lengths(tokenized_df, length_field):
1041
+ fig_tok_length, axs = plt.subplots(figsize=(15, 6), dpi=150)
1042
+ sns.histplot(data=tokenized_df[length_field], kde=True, bins=100, ax=axs)
1043
+ sns.rugplot(data=tokenized_df[length_field], ax=axs)
1044
  return fig_tok_length
1045
 
1046
  def make_fig_labels(label_df, label_names, label_field):
data_measurements/streamlit_utils.py CHANGED
@@ -167,7 +167,7 @@ def expander_text_lengths(dstats, column_id):
167
  st.markdown(
168
  "### Here is the relative frequency of different text lengths in your dataset:"
169
  )
170
- st.plotly_chart(dstats.fig_tok_length, use_container_width=True)
171
  st.markdown(
172
  "The average length of text instances is **"
173
  + str(dstats.avg_length)
@@ -187,7 +187,7 @@ def expander_text_lengths(dstats, column_id):
187
  # This is quite a large file and is breaking our ability to navigate the app development.
188
  # Just passing if it's not already there for launch v0
189
  if dstats.length_df is not None:
190
- st.dataframe(dstats.length_df[dstats.length_df["length"] == start_id_show_lengths].set_index("length"))
191
 
192
 
193
  ### Third, use a sentence embedding model
 
167
  st.markdown(
168
  "### Here is the relative frequency of different text lengths in your dataset:"
169
  )
170
+ st.pyplot(dstats.fig_tok_length, use_container_width=True)
171
  st.markdown(
172
  "The average length of text instances is **"
173
  + str(dstats.avg_length)
 
187
  # This is quite a large file and is breaking our ability to navigate the app development.
188
  # Just passing if it's not already there for launch v0
189
  if dstats.length_df is not None:
190
+ st.table(dstats.length_df[dstats.length_df["length"] == start_id_show_lengths].set_index("length"))
191
 
192
 
193
  ### Third, use a sentence embedding model