Spaces:

huggingface
/

data-measurements-tool

Build error

App Files Files Community

meg-huggingface committed on Dec 7, 2021

Commit

14e5c2a

•

1 Parent(s): 1a4c18a

Try..except catching for errors

Browse files

Files changed (2) hide show

app.py +48 -18
data_measurements/streamlit_utils.py +67 -64

app.py CHANGED Viewed

@@ -150,25 +150,55 @@ def load_or_prepare_widgets(ds_args, show_embeddings, use_cache=False):
         mkdir(CACHE_DIR)
     if use_cache:
         logs.warning("Using cache")
-    dstats = dataset_statistics.DatasetStatisticsCacheClass(CACHE_DIR, **ds_args, use_cache=use_cache)
-    # Don't recalculate; we're live
-    dstats.set_deployment(True)
-    # We need to have the text_dset loaded for further load_or_prepare
-    dstats.load_or_prepare_dataset()
-    # Header widget
-    dstats.load_or_prepare_dset_peek()
-    # General stats widget
-    dstats.load_or_prepare_general_stats()
-    # Labels widget
-    dstats.load_or_prepare_labels()
-    # Text lengths widget
-    dstats.load_or_prepare_text_lengths()
     if show_embeddings:
-        # Embeddings widget
-        dstats.load_or_prepare_embeddings()
-    dstats.load_or_prepare_text_duplicates()
-    dstats.load_or_prepare_npmi()
-    dstats.load_or_prepare_zipf()
     return dstats
 def show_column(dstats, ds_name_to_dict, show_embeddings, column_id):

         mkdir(CACHE_DIR)
     if use_cache:
         logs.warning("Using cache")
+    try:
+        dstats = dataset_statistics.DatasetStatisticsCacheClass(CACHE_DIR, **ds_args, use_cache=use_cache)
+        # Don't recalculate; we're live
+        dstats.set_deployment(True)
+    except:
+        logs.warning("We're screwed")
+    try:
+        # We need to have the text_dset loaded for further load_or_prepare
+        dstats.load_or_prepare_dataset()
+    except:
+        logs.warning("Missing a cache for load or prepare dataset")
+    try:
+        # Header widget
+        dstats.load_or_prepare_dset_peek()
+    except:
+        logs.warning("Missing a cache for dset peek")
+    try:
+        # General stats widget
+        dstats.load_or_prepare_general_stats()
+    except:
+        logs.warning("Missing a cache for general stats")
+    try:
+        # Labels widget
+        dstats.load_or_prepare_labels()
+    except:
+        logs.warning("Missing a cache for prepare labels")
+    try:
+        # Text lengths widget
+        dstats.load_or_prepare_text_lengths()
+    except:
+        logs.warning("Missing a cache for text lengths")
     if show_embeddings:
+        try:
+            # Embeddings widget
+            dstats.load_or_prepare_embeddings()
+        except:
+            logs.warning("Missing a cache for embeddings")
+    try:
+        dstats.load_or_prepare_text_duplicates()
+    except:
+        logs.warning("Missing a cache for text duplicates")
+    try:
+        dstats.load_or_prepare_npmi()
+    except:
+        logs.warning("Missing a cache for npmi")
+    try:
+        dstats.load_or_prepare_zipf()
+    except:
+        logs.warning("Missing a cache for zipf")
     return dstats
 def show_column(dstats, ds_name_to_dict, show_embeddings, column_id):

data_measurements/streamlit_utils.py CHANGED Viewed

@@ -319,72 +319,75 @@ def expander_npmi_description(min_vocab):
 ### Finally, show Zipf stuff
 def expander_zipf(z, zipf_fig, column_id):
-    _ZIPF_CAPTION = """This shows how close the observed language is to an ideal
-    natural language distribution following [Zipf's law](https://en.wikipedia.org/wiki/Zipf%27s_law),
-    calculated by minimizing the [Kolmogorov-Smirnov (KS) statistic](https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test)."""
-    powerlaw_eq = r"""p(x) \propto x^{- \alpha}"""
-    zipf_summary = (
-        "The optimal alpha based on this dataset is: **"
-        + str(round(z.alpha, 2))
-        + "**, with a KS distance of: **"
-        + str(round(z.distance, 2))
-    )
-    zipf_summary += (
-        "**.  This was fit with a minimum rank value of: **"
-        + str(int(z.xmin))
-        + "**, which is the optimal rank *beyond which* the scaling regime of the power law fits best."
-    )
-    alpha_warning = "Your alpha value is a bit on the high side, which means that the distribution over words in this dataset is a bit unnatural. This could be due to non-language items throughout the dataset."
-    xmin_warning = "The minimum rank for this fit is a bit on the high side, which means that the frequencies of your most common words aren't distributed as would be expected by Zipf's law."
-    fit_results_table = pd.DataFrame.from_dict(
-        {
-            r"Alpha:": [str("%.2f" % z.alpha)],
-            "KS distance:": [str("%.2f" % z.distance)],
-            "Min rank:": [str("%s" % int(z.xmin))],
-        },
-        columns=["Results"],
-        orient="index",
-    )
-    fit_results_table.index.name = column_id
     with st.expander(
         f"Vocabulary Distribution{column_id}: Zipf's Law Fit", expanded=False
     ):
-        st.caption(
-            "Use this widget for the counts of different words in your dataset, measuring the difference between the observed count and the expected count under Zipf's law."
-        )
-        st.markdown(_ZIPF_CAPTION)
-        st.write(
-            """
-        A Zipfian distribution follows the power law: $p(x) \propto x^{-α}$
-with an ideal α value of 1."""
-        )
-        st.markdown(
-            "In general, an alpha greater than 2 or a minimum rank greater than 10 (take with a grain of salt) means that your distribution is relativaly _unnatural_ for natural language. This can be a sign of mixed artefacts in the dataset, such as HTML markup."
-        )
-        st.markdown(
-            "Below, you can see the counts of each word in your dataset vs. the expected number of counts following a Zipfian distribution."
-        )
-        st.markdown("-----")
-        st.write("### Here is your dataset's Zipf results:")
-        st.dataframe(fit_results_table)
-        st.write(zipf_summary)
-        # TODO: Nice UI version of the content in the comments.
-        # st.markdown("\nThe KS test p-value is < %.2f" % z.ks_test.pvalue)
-        # if z.ks_test.pvalue < 0.01:
-        #    st.markdown(
-        #        "\n Great news! Your data fits a powerlaw with a minimum KS " "distance of %.4f" % z.distance)
-        # else:
-        #    st.markdown("\n Sadly, your data does not fit a powerlaw. =(")
-        # st.markdown("Checking the goodness of fit of our observed distribution")
-        # st.markdown("to the hypothesized power law distribution")
-        # st.markdown("using a Kolmogorov–Smirnov (KS) test.")
-        st.plotly_chart(zipf_fig, use_container_width=True)
-        if z.alpha > 2:
-            st.markdown(alpha_warning)
-        if z.xmin > 5:
-            st.markdown(xmin_warning)
 ### Finally finally finally, show nPMI stuff.
@@ -427,7 +430,7 @@ def npmi_widget(npmi_stats, min_vocab, column_id):
 def npmi_show(paired_results):
     if paired_results.empty:
-        st.markdown("No words that co-occur enough times for results!  Or there's a 🐛.")
     else:
         s = pd.DataFrame(paired_results.sort_values(by="npmi-bias", ascending=True))
         # s.columns=pd.MultiIndex.from_arrays([['npmi','npmi','npmi','count', 'count'],['bias','man','straight','man','straight']])

 ### Finally, show Zipf stuff
 def expander_zipf(z, zipf_fig, column_id):
     with st.expander(
         f"Vocabulary Distribution{column_id}: Zipf's Law Fit", expanded=False
     ):
+        try:
+            _ZIPF_CAPTION = """This shows how close the observed language is to an ideal
+            natural language distribution following [Zipf's law](https://en.wikipedia.org/wiki/Zipf%27s_law),
+            calculated by minimizing the [Kolmogorov-Smirnov (KS) statistic](https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test)."""
+            powerlaw_eq = r"""p(x) \propto x^{- \alpha}"""
+            zipf_summary = (
+                    "The optimal alpha based on this dataset is: **"
+                    + str(round(z.alpha, 2))
+                    + "**, with a KS distance of: **"
+                    + str(round(z.distance, 2))
+            )
+            zipf_summary += (
+                    "**.  This was fit with a minimum rank value of: **"
+                    + str(int(z.xmin))
+                    + "**, which is the optimal rank *beyond which* the scaling regime of the power law fits best."
+            )
+            alpha_warning = "Your alpha value is a bit on the high side, which means that the distribution over words in this dataset is a bit unnatural. This could be due to non-language items throughout the dataset."
+            xmin_warning = "The minimum rank for this fit is a bit on the high side, which means that the frequencies of your most common words aren't distributed as would be expected by Zipf's law."
+            fit_results_table = pd.DataFrame.from_dict(
+                {
+                    r"Alpha:": [str("%.2f" % z.alpha)],
+                    "KS distance:": [str("%.2f" % z.distance)],
+                    "Min rank:": [str("%s" % int(z.xmin))],
+                },
+                columns=["Results"],
+                orient="index",
+            )
+            fit_results_table.index.name = column_id
+            st.caption(
+                "Use this widget for the counts of different words in your dataset, measuring the difference between the observed count and the expected count under Zipf's law."
+            )
+            st.markdown(_ZIPF_CAPTION)
+            st.write(
+                """
+            A Zipfian distribution follows the power law: $p(x) \propto x^{-α}$
+    with an ideal α value of 1."""
+            )
+            st.markdown(
+                "In general, an alpha greater than 2 or a minimum rank greater than 10 (take with a grain of salt) means that your distribution is relativaly _unnatural_ for natural language. This can be a sign of mixed artefacts in the dataset, such as HTML markup."
+            )
+            st.markdown(
+                "Below, you can see the counts of each word in your dataset vs. the expected number of counts following a Zipfian distribution."
+            )
+            st.markdown("-----")
+            st.write("### Here is your dataset's Zipf results:")
+            st.dataframe(fit_results_table)
+            st.write(zipf_summary)
+            # TODO: Nice UI version of the content in the comments.
+            # st.markdown("\nThe KS test p-value is < %.2f" % z.ks_test.pvalue)
+            # if z.ks_test.pvalue < 0.01:
+            #    st.markdown(
+            #        "\n Great news! Your data fits a powerlaw with a minimum KS " "distance of %.4f" % z.distance)
+            # else:
+            #    st.markdown("\n Sadly, your data does not fit a powerlaw. =(")
+            # st.markdown("Checking the goodness of fit of our observed distribution")
+            # st.markdown("to the hypothesized power law distribution")
+            # st.markdown("using a Kolmogorov–Smirnov (KS) test.")
+            st.plotly_chart(zipf_fig, use_container_width=True)
+            if z.alpha > 2:
+                st.markdown(alpha_warning)
+            if z.xmin > 5:
+                st.markdown(xmin_warning)
+        except:
+            st.write("Under construction!")
 ### Finally finally finally, show nPMI stuff.
 def npmi_show(paired_results):
     if paired_results.empty:
+        st.markdown("No words that co-occur enough times for results!  Or there's a 🐛.  Or we're still computing this one. 🤷")
     else:
         s = pd.DataFrame(paired_results.sort_values(by="npmi-bias", ascending=True))
         # s.columns=pd.MultiIndex.from_arrays([['npmi','npmi','npmi','count', 'count'],['bias','man','straight','man','straight']])