Merge branch 'main' of https://huggingface.co/spaces/huggingface/data-measurements-tool-2 into main
Changed files:
- app.py +52 -19
- cache_dir/c4_en.noblocklist_train_text/fig_tok_length.png +3 -0
- cache_dir/c4_en_train_text/fig_tok_length.png +3 -0
- cache_dir/c4_realnewslike_train_text/fig_tok_length.png +3 -0
- cache_dir/c4_realnewslike_train_text/text_dset/dataset.arrow +3 -0
- cache_dir/c4_realnewslike_train_text/text_dset/dataset_info.json +3 -0
- cache_dir/c4_realnewslike_train_text/text_dset/state.json +3 -0
- cache_dir/squad_plain_text_train_context/fig_tok_length.png +2 -2
- cache_dir/squad_plain_text_train_question/fig_tok_length.png +2 -2
- cache_dir/squad_plain_text_train_title/fig_tok_length.png +2 -2
- cache_dir/squad_plain_text_validation_context/fig_tok_length.png +3 -0
- cache_dir/squad_plain_text_validation_question/fig_tok_length.png +3 -0
- cache_dir/squad_plain_text_validation_title/fig_tok_length.png +3 -0
- cache_dir/squad_v2_squad_v2_train_context/fig_tok_length.png +3 -0
- cache_dir/squad_v2_squad_v2_train_question/fig_tok_length.png +3 -0
- cache_dir/squad_v2_squad_v2_train_title/fig_tok_length.png +3 -0
- cache_dir/squad_v2_squad_v2_validation_context/fig_tok_length.png +3 -0
- cache_dir/squad_v2_squad_v2_validation_question/fig_tok_length.png +3 -0
- cache_dir/squad_v2_squad_v2_validation_title/fig_tok_length.png +3 -0
- cache_dir/super_glue_boolq_test_passage/fig_tok_length.png +3 -0
- cache_dir/super_glue_boolq_test_question/fig_tok_length.png +3 -0
- cache_dir/super_glue_cb_test_hypothesis/fig_tok_length.png +3 -0
- cache_dir/super_glue_cb_test_premise/fig_tok_length.png +3 -0
- cache_dir/super_glue_copa_test_choice1/fig_tok_length.png +3 -0
- cache_dir/super_glue_copa_test_choice2/fig_tok_length.png +3 -0
- cache_dir/super_glue_copa_test_premise/fig_tok_length.png +3 -0
- cache_dir/super_glue_copa_test_question/fig_tok_length.png +3 -0
- cache_dir/wikitext_wikitext-103-raw-v1_test_text/fig_tok_length.png +3 -0
- cache_dir/wikitext_wikitext-103-v1_test_text/fig_tok_length.png +3 -0
- cache_dir/wikitext_wikitext-2-raw-v1_test_text/fig_tok_length.png +3 -0
- cache_dir/wikitext_wikitext-2-v1_test_text/fig_tok_length.png +3 -0
- data_measurements/dataset_statistics.py +9 -8
- data_measurements/streamlit_utils.py +79 -67
- requirements.txt +2 -2
app.py
CHANGED

@@ -117,7 +117,10 @@ def load_or_prepare(ds_args, show_embeddings, use_cache=False):
     logs.warning("Loading Embeddings")
     dstats.load_or_prepare_embeddings()
     logs.warning("Loading nPMI")
-    dstats.load_or_prepare_npmi()
+    try:
+        dstats.load_or_prepare_npmi()
+    except:
+        logs.warning("Missing a cache for npmi")
     logs.warning("Loading Zipf")
     dstats.load_or_prepare_zipf()
     return dstats

@@ -147,25 +150,55 @@ def load_or_prepare_widgets(ds_args, show_embeddings, use_cache=False):
     mkdir(CACHE_DIR)
     if use_cache:
         logs.warning("Using cache")
+    try:
+        dstats = dataset_statistics.DatasetStatisticsCacheClass(CACHE_DIR, **ds_args, use_cache=use_cache)
+        # Don't recalculate; we're live
+        dstats.set_deployment(True)
+    except:
+        logs.warning("We're screwed")
+    try:
+        # We need to have the text_dset loaded for further load_or_prepare
+        dstats.load_or_prepare_dataset()
+    except:
+        logs.warning("Missing a cache for load or prepare dataset")
+    try:
+        # Header widget
+        dstats.load_or_prepare_dset_peek()
+    except:
+        logs.warning("Missing a cache for dset peek")
+    try:
+        # General stats widget
+        dstats.load_or_prepare_general_stats()
+    except:
+        logs.warning("Missing a cache for general stats")
+    try:
+        # Labels widget
+        dstats.load_or_prepare_labels()
+    except:
+        logs.warning("Missing a cache for prepare labels")
+    try:
+        # Text lengths widget
+        dstats.load_or_prepare_text_lengths()
+    except:
+        logs.warning("Missing a cache for text lengths")
     if show_embeddings:
+        try:
+            # Embeddings widget
+            dstats.load_or_prepare_embeddings()
+        except:
+            logs.warning("Missing a cache for embeddings")
+    try:
+        dstats.load_or_prepare_text_duplicates()
+    except:
+        logs.warning("Missing a cache for text duplicates")
+    try:
+        dstats.load_or_prepare_npmi()
+    except:
+        logs.warning("Missing a cache for npmi")
+    try:
+        dstats.load_or_prepare_zipf()
+    except:
+        logs.warning("Missing a cache for zipf")
     return dstats

 def show_column(dstats, ds_name_to_dict, show_embeddings, column_id):
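Every step in load_or_prepare_widgets is now wrapped in its own try/except, so one missing cache file degrades a single widget instead of crashing the whole Space. The repeated blocks could be collapsed into a small helper; the sketch below is illustrative only (try_load is hypothetical and not part of this commit):

```python
import logging

logs = logging.getLogger(__name__)

def try_load(step, name):
    """Run one load_or_prepare step; log a warning instead of
    crashing when its cache is missing."""
    try:
        step()
    except Exception:
        logs.warning("Missing a cache for %s", name)

# Hypothetical usage, mirroring the commit's pattern:
#   try_load(dstats.load_or_prepare_npmi, "npmi")
#   try_load(dstats.load_or_prepare_zipf, "zipf")
```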
- cache_dir/c4_en.noblocklist_train_text/fig_tok_length.png (ADDED; binary image tracked with Git LFS)
- cache_dir/c4_en_train_text/fig_tok_length.png (ADDED; binary image tracked with Git LFS)
- cache_dir/c4_realnewslike_train_text/fig_tok_length.png (ADDED; binary image tracked with Git LFS)
cache_dir/c4_realnewslike_train_text/text_dset/dataset.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9813f70c9be641905ca737aa8f16e29d6aa17155a76cd830e7a627aed91431f4
+size 529606944

cache_dir/c4_realnewslike_train_text/text_dset/dataset_info.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ff9f59542efc98b40f23b64408e3fbaed544ad8f0d1fb1e7126ead5af52844ac
+size 945

cache_dir/c4_realnewslike_train_text/text_dset/state.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c2f6884f5ee381e5df2d267dae699aaf4792ba06c8f16830c9c19c144b4b3003
+size 256
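The three text_dset entries above are Git LFS pointer files: the repository checks in a small text stub recording the spec version, the SHA-256 object id, and the payload size in bytes, while the actual file lives in LFS storage. As a rough illustration (this parser is not part of the repo), such a pointer can be read like this:

```python
def parse_lfs_pointer(path):
    """Parse a Git LFS pointer stub into its version, oid, and size fields."""
    fields = {}
    with open(path) as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields["version"], fields["oid"], int(fields["size"])

# For dataset.arrow above, this would yield:
# ("https://git-lfs.github.com/spec/v1",
#  "sha256:9813f70c9be641905ca737aa8f16e29d6aa17155a76cd830e7a627aed91431f4",
#  529606944)
```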
- cache_dir/squad_plain_text_train_context/fig_tok_length.png (CHANGED; binary image tracked with Git LFS)
- cache_dir/squad_plain_text_train_question/fig_tok_length.png (CHANGED; binary image tracked with Git LFS)
- cache_dir/squad_plain_text_train_title/fig_tok_length.png (CHANGED; binary image tracked with Git LFS)
- cache_dir/squad_plain_text_validation_context/fig_tok_length.png (ADDED; binary image tracked with Git LFS)
- cache_dir/squad_plain_text_validation_question/fig_tok_length.png (ADDED; binary image tracked with Git LFS)
- cache_dir/squad_plain_text_validation_title/fig_tok_length.png (ADDED; binary image tracked with Git LFS)
- cache_dir/squad_v2_squad_v2_train_context/fig_tok_length.png (ADDED; binary image tracked with Git LFS)
- cache_dir/squad_v2_squad_v2_train_question/fig_tok_length.png (ADDED; binary image tracked with Git LFS)
- cache_dir/squad_v2_squad_v2_train_title/fig_tok_length.png (ADDED; binary image tracked with Git LFS)
- cache_dir/squad_v2_squad_v2_validation_context/fig_tok_length.png (ADDED; binary image tracked with Git LFS)
- cache_dir/squad_v2_squad_v2_validation_question/fig_tok_length.png (ADDED; binary image tracked with Git LFS)
- cache_dir/squad_v2_squad_v2_validation_title/fig_tok_length.png (ADDED; binary image tracked with Git LFS)
- cache_dir/super_glue_boolq_test_passage/fig_tok_length.png (ADDED; binary image tracked with Git LFS)
- cache_dir/super_glue_boolq_test_question/fig_tok_length.png (ADDED; binary image tracked with Git LFS)
- cache_dir/super_glue_cb_test_hypothesis/fig_tok_length.png (ADDED; binary image tracked with Git LFS)
- cache_dir/super_glue_cb_test_premise/fig_tok_length.png (ADDED; binary image tracked with Git LFS)
- cache_dir/super_glue_copa_test_choice1/fig_tok_length.png (ADDED; binary image tracked with Git LFS)
- cache_dir/super_glue_copa_test_choice2/fig_tok_length.png (ADDED; binary image tracked with Git LFS)
- cache_dir/super_glue_copa_test_premise/fig_tok_length.png (ADDED; binary image tracked with Git LFS)
- cache_dir/super_glue_copa_test_question/fig_tok_length.png (ADDED; binary image tracked with Git LFS)
- cache_dir/wikitext_wikitext-103-raw-v1_test_text/fig_tok_length.png (ADDED; binary image tracked with Git LFS)
- cache_dir/wikitext_wikitext-103-v1_test_text/fig_tok_length.png (ADDED; binary image tracked with Git LFS)
- cache_dir/wikitext_wikitext-2-raw-v1_test_text/fig_tok_length.png (ADDED; binary image tracked with Git LFS)
- cache_dir/wikitext_wikitext-2-v1_test_text/fig_tok_length.png (ADDED; binary image tracked with Git LFS)
data_measurements/dataset_statistics.py
CHANGED

@@ -498,7 +498,7 @@ class DatasetStatisticsCacheClass:
         if not self.live:
             if self.tokenized_df is None:
                 logs.warning("Tokenized dataset not yet loaded; doing so.")
+                self.load_or_prepare_tokenized_df()
             if self.vocab_counts_df is None:
                 logs.warning("Vocab not yet loaded; doing so.")
                 self.load_or_prepare_vocab()

@@ -544,8 +544,8 @@ class DatasetStatisticsCacheClass:
         """
         logs.info("Doing text dset.")
         self.load_or_prepare_text_dset(save)
-        logs.info("Doing tokenized dataframe")
-        self.load_or_prepare_tokenized_df(save)
+        #logs.info("Doing tokenized dataframe")
+        #self.load_or_prepare_tokenized_df(save)
         logs.info("Doing dataset peek")
         self.load_or_prepare_dset_peek(save)

@@ -554,11 +554,12 @@ class DatasetStatisticsCacheClass:
             with open(self.dset_peek_json_fid, "r") as f:
                 self.dset_peek = json.load(f)["dset peek"]
         else:
+            if not self.live:
+                if self.dset is None:
+                    self.get_base_dataset()
+                self.dset_peek = self.dset[:100]
+                if save:
+                    write_json({"dset peek": self.dset_peek}, self.dset_peek_json_fid)

     def load_or_prepare_tokenized_df(self, save=True):
         if self.use_cache and exists(self.tokenized_df_fid):
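The new dset_peek branch follows the tool's usual load-or-prepare idiom: read the cached JSON when it exists, otherwise compute the value and optionally persist it for next time. A self-contained sketch of that idiom (names like write_json and load_or_prepare_peek stand in for the project's own helpers):

```python
import json
from os.path import exists

def write_json(obj, fid):
    # Stand-in for the project's helper of the same name.
    with open(fid, "w") as f:
        json.dump(obj, f)

def load_or_prepare_peek(fid, compute_peek, use_cache=True, save=True):
    """Return the cached peek if available; otherwise compute and optionally cache it."""
    if use_cache and exists(fid):
        with open(fid, "r") as f:
            return json.load(f)["dset peek"]
    peek = compute_peek()  # e.g. lambda: dset[:100], as in load_or_prepare_dset_peek
    if save:
        write_json({"dset peek": peek}, fid)
    return peek
```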
data_measurements/streamlit_utils.py
CHANGED

@@ -20,7 +20,7 @@ import streamlit as st
 from st_aggrid import AgGrid, GridOptionsBuilder

 from .dataset_utils import HF_DESC_FIELD, HF_FEATURE_FIELD, HF_LABEL_FIELD
+st.set_option('deprecation.showPyplotGlobalUse', False)

 def sidebar_header():
     st.sidebar.markdown(

@@ -48,7 +48,10 @@ def sidebar_selection(ds_name_to_dict, column_id):
     )
     # choose a config to analyze
     ds_configs = ds_name_to_dict[ds_name]
+    if ds_name == "c4":
+        config_names = ['en','en.noblocklist','realnewslike']
+    else:
+        config_names = list(ds_configs.keys())
     config_name = st.selectbox(
         f"Choose configuration{column_id}:",
         config_names,

@@ -319,72 +322,75 @@ def expander_npmi_description(min_vocab):

 ### Finally, show Zipf stuff
 def expander_zipf(z, zipf_fig, column_id):
-    _ZIPF_CAPTION = """This shows how close the observed language is to an ideal
-    natural language distribution following [Zipf's law](https://en.wikipedia.org/wiki/Zipf%27s_law),
-    calculated by minimizing the [Kolmogorov-Smirnov (KS) statistic](https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test)."""
-
-    powerlaw_eq = r"""p(x) \propto x^{- \alpha}"""
-    zipf_summary = (
-        "The optimal alpha based on this dataset is: **"
-        + str(round(z.alpha, 2))
-        + "**, with a KS distance of: **"
-        + str(round(z.distance, 2))
-    )
-    zipf_summary += (
-        "**. This was fit with a minimum rank value of: **"
-        + str(int(z.xmin))
-        + "**, which is the optimal rank *beyond which* the scaling regime of the power law fits best."
-    )
-
-    alpha_warning = "Your alpha value is a bit on the high side, which means that the distribution over words in this dataset is a bit unnatural. This could be due to non-language items throughout the dataset."
-    xmin_warning = "The minimum rank for this fit is a bit on the high side, which means that the frequencies of your most common words aren't distributed as would be expected by Zipf's law."
-    fit_results_table = pd.DataFrame.from_dict(
-        {
-            r"Alpha:": [str("%.2f" % z.alpha)],
-            "KS distance:": [str("%.2f" % z.distance)],
-            "Min rank:": [str("%s" % int(z.xmin))],
-        },
-        columns=["Results"],
-        orient="index",
-    )
-    fit_results_table.index.name = column_id
     with st.expander(
         f"Vocabulary Distribution{column_id}: Zipf's Law Fit", expanded=False
     ):
+        try:
+            _ZIPF_CAPTION = """This shows how close the observed language is to an ideal
+            natural language distribution following [Zipf's law](https://en.wikipedia.org/wiki/Zipf%27s_law),
+            calculated by minimizing the [Kolmogorov-Smirnov (KS) statistic](https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test)."""
+
+            powerlaw_eq = r"""p(x) \propto x^{- \alpha}"""
+            zipf_summary = (
+                "The optimal alpha based on this dataset is: **"
+                + str(round(z.alpha, 2))
+                + "**, with a KS distance of: **"
+                + str(round(z.distance, 2))
+            )
+            zipf_summary += (
+                "**. This was fit with a minimum rank value of: **"
+                + str(int(z.xmin))
+                + "**, which is the optimal rank *beyond which* the scaling regime of the power law fits best."
+            )
+
+            alpha_warning = "Your alpha value is a bit on the high side, which means that the distribution over words in this dataset is a bit unnatural. This could be due to non-language items throughout the dataset."
+            xmin_warning = "The minimum rank for this fit is a bit on the high side, which means that the frequencies of your most common words aren't distributed as would be expected by Zipf's law."
+            fit_results_table = pd.DataFrame.from_dict(
+                {
+                    r"Alpha:": [str("%.2f" % z.alpha)],
+                    "KS distance:": [str("%.2f" % z.distance)],
+                    "Min rank:": [str("%s" % int(z.xmin))],
+                },
+                columns=["Results"],
+                orient="index",
+            )
+            fit_results_table.index.name = column_id
+            st.caption(
+                "Use this widget for the counts of different words in your dataset, measuring the difference between the observed count and the expected count under Zipf's law."
+            )
+            st.markdown(_ZIPF_CAPTION)
+            st.write(
+                """
+            A Zipfian distribution follows the power law: $p(x) \propto x^{-α}$
+            with an ideal α value of 1."""
+            )
+            st.markdown(
+                "In general, an alpha greater than 2 or a minimum rank greater than 10 (take with a grain of salt) means that your distribution is relativaly _unnatural_ for natural language. This can be a sign of mixed artefacts in the dataset, such as HTML markup."
+            )
+            st.markdown(
+                "Below, you can see the counts of each word in your dataset vs. the expected number of counts following a Zipfian distribution."
+            )
+            st.markdown("-----")
+            st.write("### Here is your dataset's Zipf results:")
+            st.dataframe(fit_results_table)
+            st.write(zipf_summary)
+            # TODO: Nice UI version of the content in the comments.
+            # st.markdown("\nThe KS test p-value is < %.2f" % z.ks_test.pvalue)
+            # if z.ks_test.pvalue < 0.01:
+            #     st.markdown(
+            #         "\n Great news! Your data fits a powerlaw with a minimum KS " "distance of %.4f" % z.distance)
+            # else:
+            #     st.markdown("\n Sadly, your data does not fit a powerlaw. =(")
+            # st.markdown("Checking the goodness of fit of our observed distribution")
+            # st.markdown("to the hypothesized power law distribution")
+            # st.markdown("using a Kolmogorov–Smirnov (KS) test.")
+            st.plotly_chart(zipf_fig, use_container_width=True)
+            if z.alpha > 2:
+                st.markdown(alpha_warning)
+            if z.xmin > 5:
+                st.markdown(xmin_warning)
+        except:
+            st.write("Under construction!")

 ### Finally finally finally, show nPMI stuff.

@@ -427,17 +433,23 @@ def npmi_widget(npmi_stats, min_vocab, column_id):

 def npmi_show(paired_results):
     if paired_results.empty:
-        st.markdown("No words that co-occur enough times for results! Or there's a 🐛.")
+        st.markdown("No words that co-occur enough times for results! Or there's a 🐛. Or we're still computing this one. 🤷")
     else:
         s = pd.DataFrame(paired_results.sort_values(by="npmi-bias", ascending=True))
         # s.columns=pd.MultiIndex.from_arrays([['npmi','npmi','npmi','count', 'count'],['bias','man','straight','man','straight']])
         s.index.name = "word"
         npmi_cols = s.filter(like="npmi").columns
         count_cols = s.filter(like="count").columns
+        if s.shape[0] > 10000:
+            bias_thres = max(abs(s["npmi-bias"][5000]), abs(s["npmi-bias"][-5000]))
+            print(f"filtering with bias threshold: {bias_thres}")
+            s_filtered = s[s["npmi-bias"].abs() > bias_thres]
+        else:
+            s_filtered = s
         # TODO: This is very different look than the duplicates table above. Should probably standardize.
         cm = sns.palplot(sns.diverging_palette(270, 36, s=99, l=48, n=16))
         out_df = (
+            s_filtered.style.background_gradient(subset=npmi_cols, cmap=cm)
             .format(subset=npmi_cols, formatter="{:,.3f}")
             .format(subset=count_cols, formatter=int)
             .set_properties(
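The z object consumed by expander_zipf carries alpha, distance, and xmin from a power-law fit over word frequencies. With the powerlaw==1.5 package pinned in requirements.txt, a comparable fit might look like the sketch below; the attribute names follow the powerlaw API as commonly documented, and the tool's own Zipf wrapper may differ:

```python
import numpy as np
import powerlaw  # pinned as powerlaw==1.5 in requirements.txt

# Hypothetical vocabulary counts, sorted by frequency.
counts = np.array([1000, 600, 400, 250, 180, 120, 90, 60,
                   40, 25, 15, 10, 8, 5, 3, 2, 2, 1])

# Fit a discrete power law; the optimal xmin is chosen by
# minimizing the Kolmogorov-Smirnov distance.
fit = powerlaw.Fit(counts, discrete=True)
alpha = fit.power_law.alpha      # scaling exponent
xmin = fit.power_law.xmin        # start of the power-law regime
distance = fit.power_law.KS()    # KS distance of the fit

print(f"alpha={alpha:.2f}, xmin={xmin}, KS={distance:.3f}")
```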
requirements.txt
CHANGED

@@ -10,7 +10,7 @@ iso_639==0.4.5
 datasets==1.15.1
 powerlaw==1.5
 numpy==1.19.5
-pandas==1.
+pandas==1.0.0
 dataclasses==0.6
 iso639==0.1.4
 python_igraph==0.9.6

@@ -23,4 +23,4 @@ numexpr==2.7.3
 scikit-learn~=0.24.2
 scipy~=1.7.3
 tqdm~=4.62.3
-pyarrow~=6.0.1
+pyarrow~=6.0.1