Spaces:
Build error
Build error
meg-huggingface
committed on
Commit
•
14e5c2a
1
Parent(s):
1a4c18a
Try..except catching for errors
Browse files
- app.py +48 -18
- data_measurements/streamlit_utils.py +67 -64
app.py
CHANGED
@@ -150,25 +150,55 @@ def load_or_prepare_widgets(ds_args, show_embeddings, use_cache=False):
|
|
150 |
mkdir(CACHE_DIR)
|
151 |
if use_cache:
|
152 |
logs.warning("Using cache")
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
166 |
if show_embeddings:
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
172 |
return dstats
|
173 |
|
174 |
def show_column(dstats, ds_name_to_dict, show_embeddings, column_id):
|
|
|
150 |
mkdir(CACHE_DIR)
|
151 |
if use_cache:
|
152 |
logs.warning("Using cache")
|
153 |
+
try:
|
154 |
+
dstats = dataset_statistics.DatasetStatisticsCacheClass(CACHE_DIR, **ds_args, use_cache=use_cache)
|
155 |
+
# Don't recalculate; we're live
|
156 |
+
dstats.set_deployment(True)
|
157 |
+
except:
|
158 |
+
logs.warning("We're screwed")
|
159 |
+
try:
|
160 |
+
# We need to have the text_dset loaded for further load_or_prepare
|
161 |
+
dstats.load_or_prepare_dataset()
|
162 |
+
except:
|
163 |
+
logs.warning("Missing a cache for load or prepare dataset")
|
164 |
+
try:
|
165 |
+
# Header widget
|
166 |
+
dstats.load_or_prepare_dset_peek()
|
167 |
+
except:
|
168 |
+
logs.warning("Missing a cache for dset peek")
|
169 |
+
try:
|
170 |
+
# General stats widget
|
171 |
+
dstats.load_or_prepare_general_stats()
|
172 |
+
except:
|
173 |
+
logs.warning("Missing a cache for general stats")
|
174 |
+
try:
|
175 |
+
# Labels widget
|
176 |
+
dstats.load_or_prepare_labels()
|
177 |
+
except:
|
178 |
+
logs.warning("Missing a cache for prepare labels")
|
179 |
+
try:
|
180 |
+
# Text lengths widget
|
181 |
+
dstats.load_or_prepare_text_lengths()
|
182 |
+
except:
|
183 |
+
logs.warning("Missing a cache for text lengths")
|
184 |
if show_embeddings:
|
185 |
+
try:
|
186 |
+
# Embeddings widget
|
187 |
+
dstats.load_or_prepare_embeddings()
|
188 |
+
except:
|
189 |
+
logs.warning("Missing a cache for embeddings")
|
190 |
+
try:
|
191 |
+
dstats.load_or_prepare_text_duplicates()
|
192 |
+
except:
|
193 |
+
logs.warning("Missing a cache for text duplicates")
|
194 |
+
try:
|
195 |
+
dstats.load_or_prepare_npmi()
|
196 |
+
except:
|
197 |
+
logs.warning("Missing a cache for npmi")
|
198 |
+
try:
|
199 |
+
dstats.load_or_prepare_zipf()
|
200 |
+
except:
|
201 |
+
logs.warning("Missing a cache for zipf")
|
202 |
return dstats
|
203 |
|
204 |
def show_column(dstats, ds_name_to_dict, show_embeddings, column_id):
|
data_measurements/streamlit_utils.py
CHANGED
@@ -319,72 +319,75 @@ def expander_npmi_description(min_vocab):
|
|
319 |
|
320 |
### Finally, show Zipf stuff
|
321 |
def expander_zipf(z, zipf_fig, column_id):
|
322 |
-
_ZIPF_CAPTION = """This shows how close the observed language is to an ideal
|
323 |
-
natural language distribution following [Zipf's law](https://en.wikipedia.org/wiki/Zipf%27s_law),
|
324 |
-
calculated by minimizing the [Kolmogorov-Smirnov (KS) statistic](https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test)."""
|
325 |
-
|
326 |
-
powerlaw_eq = r"""p(x) \propto x^{- \alpha}"""
|
327 |
-
zipf_summary = (
|
328 |
-
"The optimal alpha based on this dataset is: **"
|
329 |
-
+ str(round(z.alpha, 2))
|
330 |
-
+ "**, with a KS distance of: **"
|
331 |
-
+ str(round(z.distance, 2))
|
332 |
-
)
|
333 |
-
zipf_summary += (
|
334 |
-
"**. This was fit with a minimum rank value of: **"
|
335 |
-
+ str(int(z.xmin))
|
336 |
-
+ "**, which is the optimal rank *beyond which* the scaling regime of the power law fits best."
|
337 |
-
)
|
338 |
-
|
339 |
-
alpha_warning = "Your alpha value is a bit on the high side, which means that the distribution over words in this dataset is a bit unnatural. This could be due to non-language items throughout the dataset."
|
340 |
-
xmin_warning = "The minimum rank for this fit is a bit on the high side, which means that the frequencies of your most common words aren't distributed as would be expected by Zipf's law."
|
341 |
-
fit_results_table = pd.DataFrame.from_dict(
|
342 |
-
{
|
343 |
-
r"Alpha:": [str("%.2f" % z.alpha)],
|
344 |
-
"KS distance:": [str("%.2f" % z.distance)],
|
345 |
-
"Min rank:": [str("%s" % int(z.xmin))],
|
346 |
-
},
|
347 |
-
columns=["Results"],
|
348 |
-
orient="index",
|
349 |
-
)
|
350 |
-
fit_results_table.index.name = column_id
|
351 |
with st.expander(
|
352 |
f"Vocabulary Distribution{column_id}: Zipf's Law Fit", expanded=False
|
353 |
):
|
354 |
-
|
355 |
-
|
356 |
-
|
357 |
-
|
358 |
-
|
359 |
-
"""
|
360 |
-
|
361 |
-
|
362 |
-
|
363 |
-
|
364 |
-
|
365 |
-
|
366 |
-
|
367 |
-
|
368 |
-
|
369 |
-
|
370 |
-
|
371 |
-
|
372 |
-
|
373 |
-
|
374 |
-
|
375 |
-
|
376 |
-
|
377 |
-
|
378 |
-
|
379 |
-
|
380 |
-
|
381 |
-
|
382 |
-
|
383 |
-
|
384 |
-
|
385 |
-
|
386 |
-
|
387 |
-
st.markdown(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
388 |
|
389 |
|
390 |
### Finally finally finally, show nPMI stuff.
|
@@ -427,7 +430,7 @@ def npmi_widget(npmi_stats, min_vocab, column_id):
|
|
427 |
|
428 |
def npmi_show(paired_results):
|
429 |
if paired_results.empty:
|
430 |
-
st.markdown("No words that co-occur enough times for results! Or there's a 🐛.")
|
431 |
else:
|
432 |
s = pd.DataFrame(paired_results.sort_values(by="npmi-bias", ascending=True))
|
433 |
# s.columns=pd.MultiIndex.from_arrays([['npmi','npmi','npmi','count', 'count'],['bias','man','straight','man','straight']])
|
|
|
319 |
|
320 |
### Finally, show Zipf stuff
|
321 |
def expander_zipf(z, zipf_fig, column_id):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
322 |
with st.expander(
|
323 |
f"Vocabulary Distribution{column_id}: Zipf's Law Fit", expanded=False
|
324 |
):
|
325 |
+
try:
|
326 |
+
_ZIPF_CAPTION = """This shows how close the observed language is to an ideal
|
327 |
+
natural language distribution following [Zipf's law](https://en.wikipedia.org/wiki/Zipf%27s_law),
|
328 |
+
calculated by minimizing the [Kolmogorov-Smirnov (KS) statistic](https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test)."""
|
329 |
+
|
330 |
+
powerlaw_eq = r"""p(x) \propto x^{- \alpha}"""
|
331 |
+
zipf_summary = (
|
332 |
+
"The optimal alpha based on this dataset is: **"
|
333 |
+
+ str(round(z.alpha, 2))
|
334 |
+
+ "**, with a KS distance of: **"
|
335 |
+
+ str(round(z.distance, 2))
|
336 |
+
)
|
337 |
+
zipf_summary += (
|
338 |
+
"**. This was fit with a minimum rank value of: **"
|
339 |
+
+ str(int(z.xmin))
|
340 |
+
+ "**, which is the optimal rank *beyond which* the scaling regime of the power law fits best."
|
341 |
+
)
|
342 |
+
|
343 |
+
alpha_warning = "Your alpha value is a bit on the high side, which means that the distribution over words in this dataset is a bit unnatural. This could be due to non-language items throughout the dataset."
|
344 |
+
xmin_warning = "The minimum rank for this fit is a bit on the high side, which means that the frequencies of your most common words aren't distributed as would be expected by Zipf's law."
|
345 |
+
fit_results_table = pd.DataFrame.from_dict(
|
346 |
+
{
|
347 |
+
r"Alpha:": [str("%.2f" % z.alpha)],
|
348 |
+
"KS distance:": [str("%.2f" % z.distance)],
|
349 |
+
"Min rank:": [str("%s" % int(z.xmin))],
|
350 |
+
},
|
351 |
+
columns=["Results"],
|
352 |
+
orient="index",
|
353 |
+
)
|
354 |
+
fit_results_table.index.name = column_id
|
355 |
+
st.caption(
|
356 |
+
"Use this widget for the counts of different words in your dataset, measuring the difference between the observed count and the expected count under Zipf's law."
|
357 |
+
)
|
358 |
+
st.markdown(_ZIPF_CAPTION)
|
359 |
+
st.write(
|
360 |
+
"""
|
361 |
+
A Zipfian distribution follows the power law: $p(x) \propto x^{-α}$
|
362 |
+
with an ideal α value of 1."""
|
363 |
+
)
|
364 |
+
st.markdown(
|
365 |
+
"In general, an alpha greater than 2 or a minimum rank greater than 10 (take with a grain of salt) means that your distribution is relativaly _unnatural_ for natural language. This can be a sign of mixed artefacts in the dataset, such as HTML markup."
|
366 |
+
)
|
367 |
+
st.markdown(
|
368 |
+
"Below, you can see the counts of each word in your dataset vs. the expected number of counts following a Zipfian distribution."
|
369 |
+
)
|
370 |
+
st.markdown("-----")
|
371 |
+
st.write("### Here is your dataset's Zipf results:")
|
372 |
+
st.dataframe(fit_results_table)
|
373 |
+
st.write(zipf_summary)
|
374 |
+
# TODO: Nice UI version of the content in the comments.
|
375 |
+
# st.markdown("\nThe KS test p-value is < %.2f" % z.ks_test.pvalue)
|
376 |
+
# if z.ks_test.pvalue < 0.01:
|
377 |
+
# st.markdown(
|
378 |
+
# "\n Great news! Your data fits a powerlaw with a minimum KS " "distance of %.4f" % z.distance)
|
379 |
+
# else:
|
380 |
+
# st.markdown("\n Sadly, your data does not fit a powerlaw. =(")
|
381 |
+
# st.markdown("Checking the goodness of fit of our observed distribution")
|
382 |
+
# st.markdown("to the hypothesized power law distribution")
|
383 |
+
# st.markdown("using a Kolmogorov–Smirnov (KS) test.")
|
384 |
+
st.plotly_chart(zipf_fig, use_container_width=True)
|
385 |
+
if z.alpha > 2:
|
386 |
+
st.markdown(alpha_warning)
|
387 |
+
if z.xmin > 5:
|
388 |
+
st.markdown(xmin_warning)
|
389 |
+
except:
|
390 |
+
st.write("Under construction!")
|
391 |
|
392 |
|
393 |
### Finally finally finally, show nPMI stuff.
|
|
|
430 |
|
431 |
def npmi_show(paired_results):
|
432 |
if paired_results.empty:
|
433 |
+
st.markdown("No words that co-occur enough times for results! Or there's a 🐛. Or we're still computing this one. 🤷")
|
434 |
else:
|
435 |
s = pd.DataFrame(paired_results.sort_values(by="npmi-bias", ascending=True))
|
436 |
# s.columns=pd.MultiIndex.from_arrays([['npmi','npmi','npmi','count', 'count'],['bias','man','straight','man','straight']])
|