Merge branch 'main' of https://huggingface.co/spaces/huggingface/data-measurements-tool-2 into main
Changed files:
- app.py +52 -19
- cache_dir/c4_en.noblocklist_train_text/fig_tok_length.png +3 -0
- cache_dir/c4_en_train_text/fig_tok_length.png +3 -0
- cache_dir/c4_realnewslike_train_text/fig_tok_length.png +3 -0
- cache_dir/c4_realnewslike_train_text/text_dset/dataset.arrow +3 -0
- cache_dir/c4_realnewslike_train_text/text_dset/dataset_info.json +3 -0
- cache_dir/c4_realnewslike_train_text/text_dset/state.json +3 -0
- cache_dir/squad_plain_text_train_context/fig_tok_length.png +2 -2
- cache_dir/squad_plain_text_train_question/fig_tok_length.png +2 -2
- cache_dir/squad_plain_text_train_title/fig_tok_length.png +2 -2
- cache_dir/squad_plain_text_validation_context/fig_tok_length.png +3 -0
- cache_dir/squad_plain_text_validation_question/fig_tok_length.png +3 -0
- cache_dir/squad_plain_text_validation_title/fig_tok_length.png +3 -0
- cache_dir/squad_v2_squad_v2_train_context/fig_tok_length.png +3 -0
- cache_dir/squad_v2_squad_v2_train_question/fig_tok_length.png +3 -0
- cache_dir/squad_v2_squad_v2_train_title/fig_tok_length.png +3 -0
- cache_dir/squad_v2_squad_v2_validation_context/fig_tok_length.png +3 -0
- cache_dir/squad_v2_squad_v2_validation_question/fig_tok_length.png +3 -0
- cache_dir/squad_v2_squad_v2_validation_title/fig_tok_length.png +3 -0
- cache_dir/super_glue_boolq_test_passage/fig_tok_length.png +3 -0
- cache_dir/super_glue_boolq_test_question/fig_tok_length.png +3 -0
- cache_dir/super_glue_cb_test_hypothesis/fig_tok_length.png +3 -0
- cache_dir/super_glue_cb_test_premise/fig_tok_length.png +3 -0
- cache_dir/super_glue_copa_test_choice1/fig_tok_length.png +3 -0
- cache_dir/super_glue_copa_test_choice2/fig_tok_length.png +3 -0
- cache_dir/super_glue_copa_test_premise/fig_tok_length.png +3 -0
- cache_dir/super_glue_copa_test_question/fig_tok_length.png +3 -0
- cache_dir/wikitext_wikitext-103-raw-v1_test_text/fig_tok_length.png +3 -0
- cache_dir/wikitext_wikitext-103-v1_test_text/fig_tok_length.png +3 -0
- cache_dir/wikitext_wikitext-2-raw-v1_test_text/fig_tok_length.png +3 -0
- cache_dir/wikitext_wikitext-2-v1_test_text/fig_tok_length.png +3 -0
- data_measurements/dataset_statistics.py +9 -8
- data_measurements/streamlit_utils.py +79 -67
- requirements.txt +2 -2
app.py
CHANGED

@@ -117,7 +117,10 @@ def load_or_prepare(ds_args, show_embeddings, use_cache=False):
     logs.warning("Loading Embeddings")
     dstats.load_or_prepare_embeddings()
     logs.warning("Loading nPMI")
-    dstats.load_or_prepare_npmi()
+    try:
+        dstats.load_or_prepare_npmi()
+    except:
+        logs.warning("Missing a cache for npmi")
     logs.warning("Loading Zipf")
     dstats.load_or_prepare_zipf()
     return dstats

@@ -147,25 +150,55 @@ def load_or_prepare_widgets(ds_args, show_embeddings, use_cache=False):
     mkdir(CACHE_DIR)
     if use_cache:
         logs.warning("Using cache")
+    try:
+        dstats = dataset_statistics.DatasetStatisticsCacheClass(CACHE_DIR, **ds_args, use_cache=use_cache)
+        # Don't recalculate; we're live
+        dstats.set_deployment(True)
+    except:
+        logs.warning("We're screwed")
+    try:
+        # We need to have the text_dset loaded for further load_or_prepare
+        dstats.load_or_prepare_dataset()
+    except:
+        logs.warning("Missing a cache for load or prepare dataset")
+    try:
+        # Header widget
+        dstats.load_or_prepare_dset_peek()
+    except:
+        logs.warning("Missing a cache for dset peek")
+    try:
+        # General stats widget
+        dstats.load_or_prepare_general_stats()
+    except:
+        logs.warning("Missing a cache for general stats")
+    try:
+        # Labels widget
+        dstats.load_or_prepare_labels()
+    except:
+        logs.warning("Missing a cache for prepare labels")
+    try:
+        # Text lengths widget
+        dstats.load_or_prepare_text_lengths()
+    except:
+        logs.warning("Missing a cache for text lengths")
     if show_embeddings:
+        try:
+            # Embeddings widget
+            dstats.load_or_prepare_embeddings()
+        except:
+            logs.warning("Missing a cache for embeddings")
+    try:
+        dstats.load_or_prepare_text_duplicates()
+    except:
+        logs.warning("Missing a cache for text duplicates")
+    try:
+        dstats.load_or_prepare_npmi()
+    except:
+        logs.warning("Missing a cache for npmi")
+    try:
+        dstats.load_or_prepare_zipf()
+    except:
+        logs.warning("Missing a cache for zipf")
     return dstats

 def show_column(dstats, ds_name_to_dict, show_embeddings, column_id):
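Every step in load_or_prepare_widgets is now wrapped in its own try/except, so one missing cache file degrades a single widget instead of crashing the whole Space. The repeated blocks could be collapsed into a small helper; the sketch below is illustrative only (try_load is hypothetical and not part of this commit):

```python
import logging

logs = logging.getLogger(__name__)

def try_load(step, name):
    """Run one load_or_prepare step; log a warning instead of
    crashing when its cache is missing."""
    try:
        step()
    except Exception:
        logs.warning("Missing a cache for %s", name)

# Hypothetical usage, mirroring the commit's pattern:
#   try_load(dstats.load_or_prepare_npmi, "npmi")
#   try_load(dstats.load_or_prepare_zipf, "zipf")
```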
- cache_dir/c4_en.noblocklist_train_text/fig_tok_length.png (ADDED; binary image tracked with Git LFS)
- cache_dir/c4_en_train_text/fig_tok_length.png (ADDED; binary image tracked with Git LFS)
- cache_dir/c4_realnewslike_train_text/fig_tok_length.png (ADDED; binary image tracked with Git LFS)
cache_dir/c4_realnewslike_train_text/text_dset/dataset.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9813f70c9be641905ca737aa8f16e29d6aa17155a76cd830e7a627aed91431f4
+size 529606944

cache_dir/c4_realnewslike_train_text/text_dset/dataset_info.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ff9f59542efc98b40f23b64408e3fbaed544ad8f0d1fb1e7126ead5af52844ac
+size 945

cache_dir/c4_realnewslike_train_text/text_dset/state.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c2f6884f5ee381e5df2d267dae699aaf4792ba06c8f16830c9c19c144b4b3003
+size 256
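The three text_dset entries above are Git LFS pointer files: the repository checks in a small text stub recording the spec version, the SHA-256 object id, and the payload size in bytes, while the actual file lives in LFS storage. As a rough illustration (this parser is not part of the repo), such a pointer can be read like this:

```python
def parse_lfs_pointer(path):
    """Parse a Git LFS pointer stub into its version, oid, and size fields."""
    fields = {}
    with open(path) as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields["version"], fields["oid"], int(fields["size"])

# For dataset.arrow above, this would yield:
# ("https://git-lfs.github.com/spec/v1",
#  "sha256:9813f70c9be641905ca737aa8f16e29d6aa17155a76cd830e7a627aed91431f4",
#  529606944)
```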
- cache_dir/squad_plain_text_train_context/fig_tok_length.png (CHANGED; binary image tracked with Git LFS)
- cache_dir/squad_plain_text_train_question/fig_tok_length.png (CHANGED; binary image tracked with Git LFS)
- cache_dir/squad_plain_text_train_title/fig_tok_length.png (CHANGED; binary image tracked with Git LFS)
- cache_dir/squad_plain_text_validation_context/fig_tok_length.png (ADDED; binary image tracked with Git LFS)
- cache_dir/squad_plain_text_validation_question/fig_tok_length.png (ADDED; binary image tracked with Git LFS)
- cache_dir/squad_plain_text_validation_title/fig_tok_length.png (ADDED; binary image tracked with Git LFS)
- cache_dir/squad_v2_squad_v2_train_context/fig_tok_length.png (ADDED; binary image tracked with Git LFS)
- cache_dir/squad_v2_squad_v2_train_question/fig_tok_length.png (ADDED; binary image tracked with Git LFS)
- cache_dir/squad_v2_squad_v2_train_title/fig_tok_length.png (ADDED; binary image tracked with Git LFS)
- cache_dir/squad_v2_squad_v2_validation_context/fig_tok_length.png (ADDED; binary image tracked with Git LFS)
- cache_dir/squad_v2_squad_v2_validation_question/fig_tok_length.png (ADDED; binary image tracked with Git LFS)
- cache_dir/squad_v2_squad_v2_validation_title/fig_tok_length.png (ADDED; binary image tracked with Git LFS)
- cache_dir/super_glue_boolq_test_passage/fig_tok_length.png (ADDED; binary image tracked with Git LFS)
- cache_dir/super_glue_boolq_test_question/fig_tok_length.png (ADDED; binary image tracked with Git LFS)
- cache_dir/super_glue_cb_test_hypothesis/fig_tok_length.png (ADDED; binary image tracked with Git LFS)
- cache_dir/super_glue_cb_test_premise/fig_tok_length.png (ADDED; binary image tracked with Git LFS)
- cache_dir/super_glue_copa_test_choice1/fig_tok_length.png (ADDED; binary image tracked with Git LFS)
- cache_dir/super_glue_copa_test_choice2/fig_tok_length.png (ADDED; binary image tracked with Git LFS)
- cache_dir/super_glue_copa_test_premise/fig_tok_length.png (ADDED; binary image tracked with Git LFS)
- cache_dir/super_glue_copa_test_question/fig_tok_length.png (ADDED; binary image tracked with Git LFS)
- cache_dir/wikitext_wikitext-103-raw-v1_test_text/fig_tok_length.png (ADDED; binary image tracked with Git LFS)
- cache_dir/wikitext_wikitext-103-v1_test_text/fig_tok_length.png (ADDED; binary image tracked with Git LFS)
- cache_dir/wikitext_wikitext-2-raw-v1_test_text/fig_tok_length.png (ADDED; binary image tracked with Git LFS)
- cache_dir/wikitext_wikitext-2-v1_test_text/fig_tok_length.png (ADDED; binary image tracked with Git LFS)
data_measurements/dataset_statistics.py
CHANGED

@@ -498,7 +498,7 @@ class DatasetStatisticsCacheClass:
         if not self.live:
             if self.tokenized_df is None:
                 logs.warning("Tokenized dataset not yet loaded; doing so.")
+                self.load_or_prepare_tokenized_df()
             if self.vocab_counts_df is None:
                 logs.warning("Vocab not yet loaded; doing so.")
                 self.load_or_prepare_vocab()

@@ -544,8 +544,8 @@ class DatasetStatisticsCacheClass:
         """
         logs.info("Doing text dset.")
         self.load_or_prepare_text_dset(save)
-        logs.info("Doing tokenized dataframe")
-        self.load_or_prepare_tokenized_df(save)
+        #logs.info("Doing tokenized dataframe")
+        #self.load_or_prepare_tokenized_df(save)
         logs.info("Doing dataset peek")
         self.load_or_prepare_dset_peek(save)

@@ -554,11 +554,12 @@ class DatasetStatisticsCacheClass:
             with open(self.dset_peek_json_fid, "r") as f:
                 self.dset_peek = json.load(f)["dset peek"]
         else:
+            if not self.live:
+                if self.dset is None:
+                    self.get_base_dataset()
+                self.dset_peek = self.dset[:100]
+                if save:
+                    write_json({"dset peek": self.dset_peek}, self.dset_peek_json_fid)

     def load_or_prepare_tokenized_df(self, save=True):
         if self.use_cache and exists(self.tokenized_df_fid):
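The new dset_peek branch follows the tool's usual load-or-prepare idiom: read the cached JSON when it exists, otherwise compute the value and optionally persist it for next time. A self-contained sketch of that idiom (names like write_json and load_or_prepare_peek stand in for the project's own helpers):

```python
import json
from os.path import exists

def write_json(obj, fid):
    # Stand-in for the project's helper of the same name.
    with open(fid, "w") as f:
        json.dump(obj, f)

def load_or_prepare_peek(fid, compute_peek, use_cache=True, save=True):
    """Return the cached peek if available; otherwise compute and optionally cache it."""
    if use_cache and exists(fid):
        with open(fid, "r") as f:
            return json.load(f)["dset peek"]
    peek = compute_peek()  # e.g. lambda: dset[:100], as in load_or_prepare_dset_peek
    if save:
        write_json({"dset peek": peek}, fid)
    return peek
```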
data_measurements/streamlit_utils.py
CHANGED

@@ -20,7 +20,7 @@ import streamlit as st
 from st_aggrid import AgGrid, GridOptionsBuilder

 from .dataset_utils import HF_DESC_FIELD, HF_FEATURE_FIELD, HF_LABEL_FIELD
+st.set_option('deprecation.showPyplotGlobalUse', False)

 def sidebar_header():
     st.sidebar.markdown(

@@ -48,7 +48,10 @@ def sidebar_selection(ds_name_to_dict, column_id):
     )
     # choose a config to analyze
     ds_configs = ds_name_to_dict[ds_name]
+    if ds_name == "c4":
+        config_names = ['en','en.noblocklist','realnewslike']
+    else:
+        config_names = list(ds_configs.keys())
     config_name = st.selectbox(
         f"Choose configuration{column_id}:",
         config_names,

@@ -319,72 +322,75 @@ def expander_npmi_description(min_vocab):

 ### Finally, show Zipf stuff
 def expander_zipf(z, zipf_fig, column_id):
-    _ZIPF_CAPTION = """This shows how close the observed language is to an ideal
-    natural language distribution following [Zipf's law](https://en.wikipedia.org/wiki/Zipf%27s_law),
-    calculated by minimizing the [Kolmogorov-Smirnov (KS) statistic](https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test)."""
-
-    powerlaw_eq = r"""p(x) \propto x^{- \alpha}"""
-    zipf_summary = (
-        "The optimal alpha based on this dataset is: **"
-        + str(round(z.alpha, 2))
-        + "**, with a KS distance of: **"
-        + str(round(z.distance, 2))
-    )
-    zipf_summary += (
-        "**. This was fit with a minimum rank value of: **"
-        + str(int(z.xmin))
-        + "**, which is the optimal rank *beyond which* the scaling regime of the power law fits best."
-    )
-
-    alpha_warning = "Your alpha value is a bit on the high side, which means that the distribution over words in this dataset is a bit unnatural. This could be due to non-language items throughout the dataset."
-    xmin_warning = "The minimum rank for this fit is a bit on the high side, which means that the frequencies of your most common words aren't distributed as would be expected by Zipf's law."
-    fit_results_table = pd.DataFrame.from_dict(
-        {
-            r"Alpha:": [str("%.2f" % z.alpha)],
-            "KS distance:": [str("%.2f" % z.distance)],
-            "Min rank:": [str("%s" % int(z.xmin))],
-        },
-        columns=["Results"],
-        orient="index",
-    )
-    fit_results_table.index.name = column_id
     with st.expander(
         f"Vocabulary Distribution{column_id}: Zipf's Law Fit", expanded=False
     ):
+        try:
+            _ZIPF_CAPTION = """This shows how close the observed language is to an ideal
+            natural language distribution following [Zipf's law](https://en.wikipedia.org/wiki/Zipf%27s_law),
+            calculated by minimizing the [Kolmogorov-Smirnov (KS) statistic](https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test)."""
+
+            powerlaw_eq = r"""p(x) \propto x^{- \alpha}"""
+            zipf_summary = (
+                "The optimal alpha based on this dataset is: **"
+                + str(round(z.alpha, 2))
+                + "**, with a KS distance of: **"
+                + str(round(z.distance, 2))
+            )
+            zipf_summary += (
+                "**. This was fit with a minimum rank value of: **"
+                + str(int(z.xmin))
+                + "**, which is the optimal rank *beyond which* the scaling regime of the power law fits best."
+            )
+
+            alpha_warning = "Your alpha value is a bit on the high side, which means that the distribution over words in this dataset is a bit unnatural. This could be due to non-language items throughout the dataset."
+            xmin_warning = "The minimum rank for this fit is a bit on the high side, which means that the frequencies of your most common words aren't distributed as would be expected by Zipf's law."
+            fit_results_table = pd.DataFrame.from_dict(
+                {
+                    r"Alpha:": [str("%.2f" % z.alpha)],
+                    "KS distance:": [str("%.2f" % z.distance)],
+                    "Min rank:": [str("%s" % int(z.xmin))],
+                },
+                columns=["Results"],
+                orient="index",
+            )
+            fit_results_table.index.name = column_id
+            st.caption(
+                "Use this widget for the counts of different words in your dataset, measuring the difference between the observed count and the expected count under Zipf's law."
+            )
+            st.markdown(_ZIPF_CAPTION)
+            st.write(
+                """
+            A Zipfian distribution follows the power law: $p(x) \propto x^{-α}$
+            with an ideal α value of 1."""
+            )
+            st.markdown(
+                "In general, an alpha greater than 2 or a minimum rank greater than 10 (take with a grain of salt) means that your distribution is relativaly _unnatural_ for natural language. This can be a sign of mixed artefacts in the dataset, such as HTML markup."
+            )
+            st.markdown(
+                "Below, you can see the counts of each word in your dataset vs. the expected number of counts following a Zipfian distribution."
+            )
+            st.markdown("-----")
+            st.write("### Here is your dataset's Zipf results:")
+            st.dataframe(fit_results_table)
+            st.write(zipf_summary)
+            # TODO: Nice UI version of the content in the comments.
+            # st.markdown("\nThe KS test p-value is < %.2f" % z.ks_test.pvalue)
+            # if z.ks_test.pvalue < 0.01:
+            #     st.markdown(
+            #         "\n Great news! Your data fits a powerlaw with a minimum KS " "distance of %.4f" % z.distance)
+            # else:
+            #     st.markdown("\n Sadly, your data does not fit a powerlaw. =(")
+            # st.markdown("Checking the goodness of fit of our observed distribution")
+            # st.markdown("to the hypothesized power law distribution")
+            # st.markdown("using a Kolmogorov–Smirnov (KS) test.")
+            st.plotly_chart(zipf_fig, use_container_width=True)
+            if z.alpha > 2:
+                st.markdown(alpha_warning)
+            if z.xmin > 5:
+                st.markdown(xmin_warning)
+        except:
+            st.write("Under construction!")

 ### Finally finally finally, show nPMI stuff.

@@ -427,17 +433,23 @@ def npmi_widget(npmi_stats, min_vocab, column_id):

 def npmi_show(paired_results):
     if paired_results.empty:
-        st.markdown("No words that co-occur enough times for results! Or there's a 🐛.")
+        st.markdown("No words that co-occur enough times for results! Or there's a 🐛. Or we're still computing this one. 🤷")
     else:
         s = pd.DataFrame(paired_results.sort_values(by="npmi-bias", ascending=True))
         # s.columns=pd.MultiIndex.from_arrays([['npmi','npmi','npmi','count', 'count'],['bias','man','straight','man','straight']])
         s.index.name = "word"
         npmi_cols = s.filter(like="npmi").columns
         count_cols = s.filter(like="count").columns
+        if s.shape[0] > 10000:
+            bias_thres = max(abs(s["npmi-bias"][5000]), abs(s["npmi-bias"][-5000]))
+            print(f"filtering with bias threshold: {bias_thres}")
+            s_filtered = s[s["npmi-bias"].abs() > bias_thres]
+        else:
+            s_filtered = s
         # TODO: This is very different look than the duplicates table above. Should probably standardize.
         cm = sns.palplot(sns.diverging_palette(270, 36, s=99, l=48, n=16))
         out_df = (
+            s_filtered.style.background_gradient(subset=npmi_cols, cmap=cm)
             .format(subset=npmi_cols, formatter="{:,.3f}")
             .format(subset=count_cols, formatter=int)
             .set_properties(
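The z object consumed by expander_zipf carries alpha, distance, and xmin from a power-law fit over word frequencies. With the powerlaw==1.5 package pinned in requirements.txt, a comparable fit might look like the sketch below; the attribute names follow the powerlaw API as commonly documented, and the tool's own Zipf wrapper may differ:

```python
import numpy as np
import powerlaw  # pinned as powerlaw==1.5 in requirements.txt

# Hypothetical vocabulary counts, sorted by frequency.
counts = np.array([1000, 600, 400, 250, 180, 120, 90, 60,
                   40, 25, 15, 10, 8, 5, 3, 2, 2, 1])

# Fit a discrete power law; the optimal xmin is chosen by
# minimizing the Kolmogorov-Smirnov distance.
fit = powerlaw.Fit(counts, discrete=True)
alpha = fit.power_law.alpha      # scaling exponent
xmin = fit.power_law.xmin        # start of the power-law regime
distance = fit.power_law.KS()    # KS distance of the fit

print(f"alpha={alpha:.2f}, xmin={xmin}, KS={distance:.3f}")
```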
requirements.txt
CHANGED

@@ -10,7 +10,7 @@ iso_639==0.4.5
 datasets==1.15.1
 powerlaw==1.5
 numpy==1.19.5
-pandas==1.
+pandas==1.0.0
 dataclasses==0.6
 iso639==0.1.4
 python_igraph==0.9.6

@@ -23,4 +23,4 @@ numexpr==2.7.3
 scikit-learn~=0.24.2
 scipy~=1.7.3
 tqdm~=4.62.3
-pyarrow~=6.0.1
+pyarrow~=6.0.1