Spaces:

Ezi
/

occurrences_test

Sleeping

App Files Files Community

Ezi Ozoani committed on Sep 26, 2023

Commit

b69fb1e

•

1 Parent(s): 616ba36

test upload

Browse files

Files changed (44) hide show

.ipynb_checkpoints/app (2)-checkpoint.py +296 -0
Scripts/run.sh +112 -0
app.py +256 -0
data_measurements/__init__.py +0 -0
data_measurements/__pycache__/__init__.cpython-310.pyc +0 -0
data_measurements/__pycache__/__init__.cpython-311.pyc +0 -0
data_measurements/__pycache__/dataset_statistics.cpython-310.pyc +0 -0
data_measurements/__pycache__/dataset_statistics.cpython-311.pyc +0 -0
data_measurements/__pycache__/dataset_utils.cpython-310.pyc +0 -0
data_measurements/__pycache__/dataset_utils.cpython-311.pyc +0 -0
data_measurements/__pycache__/embeddings.cpython-310.pyc +0 -0
data_measurements/__pycache__/embeddings.cpython-311.pyc +0 -0
data_measurements/__pycache__/npmi.cpython-310.pyc +0 -0
data_measurements/__pycache__/npmi.cpython-311.pyc +0 -0
data_measurements/__pycache__/streamlit_utils.cpython-310.pyc +0 -0
data_measurements/__pycache__/streamlit_utils.cpython-311.pyc +0 -0
data_measurements/__pycache__/zipf.cpython-310.pyc +0 -0
data_measurements/__pycache__/zipf.cpython-311.pyc +0 -0
data_measurements/_pycache_/__init__.cpython-311.pyc +0 -0
data_measurements/_pycache_/__init__.cpython-37.pyc +0 -0
data_measurements/_pycache_/dataset_statistics.cpython-311.pyc +0 -0
data_measurements/_pycache_/dataset_statistics.cpython-37.pyc +0 -0
data_measurements/_pycache_/dataset_utils.cpython-311.pyc +0 -0
data_measurements/_pycache_/dataset_utils.cpython-37.pyc +0 -0
data_measurements/_pycache_/embeddings.cpython-311.pyc +0 -0
data_measurements/_pycache_/embeddings.cpython-37.pyc +0 -0
data_measurements/_pycache_/npmi.cpython-311.pyc +0 -0
data_measurements/_pycache_/npmi.cpython-37.pyc +0 -0
data_measurements/_pycache_/streamlit_utils.cpython-311.pyc +0 -0
data_measurements/_pycache_/zipf.cpython-311.pyc +0 -0
data_measurements/_pycache_/zipf.cpython-37.pyc +0 -0
data_measurements/dataset_statistics.py +1223 -0
data_measurements/dataset_utils.py +296 -0
data_measurements/embeddings.py +550 -0
data_measurements/npmi.py +254 -0
data_measurements/streamlit_utils.py +498 -0
data_measurements/zipf.py +247 -0
log_files/app.log +59 -0
log_files/dataset_statistics.log +4 -0
log_files/npmi.log +0 -0
log_files/zipf.log +0 -0
run.sh +110 -0
run_data_measurements.py +296 -0
temp.jsonl +0 -0

.ipynb_checkpoints/app (2)-checkpoint.py ADDED Viewed

	@@ -0,0 +1,296 @@

+# Copyright 2021 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+from os import mkdir
+from os.path import exists, isdir
+from pathlib import Path
+# #! pip install streamlit
+import streamlit as st
+# +
+# #! pip install datasets
+# #! pip install powerlaw
+# -
+from data_measurements import dataset_statistics, dataset_utils
+from data_measurements import streamlit_utils as st_utils
+# Module-level logger for the app; its records are not passed on to ancestor loggers.
+logs = logging.getLogger(__name__)
+logs.setLevel(logging.WARNING)
+logs.propagate = False
+# Attach handlers only when this logger does not have any yet.
+if not logs.handlers:
+    Path('./log_files').mkdir(exist_ok=True)
+    # File handler writing timestamped messages to ./log_files/app.log.
+    # NOTE(review): this handler accepts INFO, but the logger level set above is
+    # WARNING, so logs.info(...) records are filtered out before they get here --
+    # confirm whether that is intended.
+    file = logging.FileHandler("./log_files/app.log")
+    fileformat = logging.Formatter("%(asctime)s:%(message)s")
+    file.setLevel(logging.INFO)
+    file.setFormatter(fileformat)
+    # Stream handler emitting WARNING and above, prefixed with the tool name.
+    stream = logging.StreamHandler()
+    streamformat = logging.Formatter("[data_measurements_tool] %(message)s")
+    stream.setLevel(logging.WARNING)
+    stream.setFormatter(streamformat)
+    logs.addHandler(file)
+    logs.addHandler(stream)
+# Page-wide Streamlit settings: title, icon, wide layout, automatic sidebar state.
+st.set_page_config(
+    page_title="Demo to showcase dataset metrics",
+    page_icon="https://huggingface.co/front/assets/huggingface_logo.svg",
+    layout="wide",
+    initial_sidebar_state="auto",
+)
+# colorblind-friendly colors
+colors = [
+    "#332288",
+    "#117733",
+    "#882255",
+    "#AA4499",
+    "#CC6677",
+    "#44AA99",
+    "#DDCC77",
+    "#88CCEE",
+]
+# Cache location, taken from dataset_utils.
+CACHE_DIR = dataset_utils.CACHE_DIR
+# String names we are using (not coming from the stored dataset).
+OUR_TEXT_FIELD = dataset_utils.OUR_TEXT_FIELD
+OUR_LABEL_FIELD = dataset_utils.OUR_LABEL_FIELD
+TOKENIZED_FIELD = dataset_utils.TOKENIZED_FIELD
+EMBEDDING_FIELD = dataset_utils.EMBEDDING_FIELD
+LENGTH_FIELD = dataset_utils.LENGTH_FIELD
+# TODO: Allow users to specify this.
+# _MIN_VOCAB_COUNT is the value handed to st_utils.npmi_widget in show_column.
+_MIN_VOCAB_COUNT = 10
+_SHOW_TOP_N_WORDS = 10
+@st.cache(
+    hash_funcs={
+        dataset_statistics.DatasetStatisticsCacheClass: lambda dstats: dstats.cache_path
+    },
+    allow_output_mutation=True,
+)
+def load_or_prepare(ds_args, show_embeddings, use_cache=False):
+    """
+    Takes the dataset arguments from the GUI and uses them to load a dataset from the Hub or, if
+    a cache for those arguments is available, to load it from the cache.
+    Args:
+        ds_args (dict): the dataset arguments defined via the streamlit app GUI
+        show_embeddings (Bool): whether embeddings should we loaded and displayed for this dataset
+        use_cache (Bool) : whether the cache is used by default or not
+    Returns:
+        dstats: the computed dataset statistics (from the dataset_statistics class)
+    """
+    if not isdir(CACHE_DIR):
+        logs.warning("Creating cache")
+        # We need to preprocess everything.
+        # This should eventually all go into a prepare_dataset CLI
+        mkdir(CACHE_DIR)
+    if use_cache:
+        logs.warning("Using cache")
+    dstats = dataset_statistics.DatasetStatisticsCacheClass(CACHE_DIR, **ds_args, use_cache=use_cache)
+    logs.warning("Loading dataset")
+    dstats.load_or_prepare_dataset()
+    logs.warning("Loading labels")
+    dstats.load_or_prepare_labels()
+    logs.warning("Loading text lengths")
+    dstats.load_or_prepare_text_lengths()
+    logs.warning("Loading duplicates")
+    dstats.load_or_prepare_text_duplicates()
+    logs.warning("Loading vocabulary")
+    dstats.load_or_prepare_vocab()
+    logs.warning("Loading general statistics...")
+    dstats.load_or_prepare_general_stats()
+    if show_embeddings:
+        logs.warning("Loading Embeddings")
+        dstats.load_or_prepare_embeddings()
+    logs.warning("Loading nPMI")
+    try:
+        dstats.load_or_prepare_npmi()
+    except:
+        logs.warning("Missing a cache for npmi")
+    logs.warning("Loading Zipf")
+    dstats.load_or_prepare_zipf()
+    return dstats
+@st.cache(
+    hash_funcs={
+        dataset_statistics.DatasetStatisticsCacheClass: lambda dstats: dstats.cache_path
+    },
+    allow_output_mutation=True,
+)
+def load_or_prepare_widgets(ds_args, show_embeddings, use_cache=False):
+    """
+    Loader specifically for the widgets used in the app.
+    Args:
+        ds_args:
+        show_embeddings:
+        use_cache:
+    Returns:
+    """
+    if use_cache:
+        logs.warning("Using cache")
+    if True:
+    #try:
+        dstats = dataset_statistics.DatasetStatisticsCacheClass(CACHE_DIR, **ds_args, use_cache=use_cache)
+        # Don't recalculate; we're live
+        dstats.set_deployment(True)
+        # checks whether the cache_dir exists in deployment mode
+        # creates cache_dir if not and if in development mode
+        cache_dir_exists = dstats.check_cache_dir()
+    #except:
+    #    logs.warning("We're screwed")
+    if cache_dir_exists:
+        try:
+            # We need to have the text_dset loaded for further load_or_prepare
+            dstats.load_or_prepare_dataset()
+        except:
+            logs.warning("Missing a cache for load or prepare dataset")
+        try:
+            # Header widget
+            dstats.load_or_prepare_dset_peek()
+        except:
+            logs.warning("Missing a cache for dset peek")
+        try:
+            # General stats widget
+            dstats.load_or_prepare_general_stats()
+        except:
+            logs.warning("Missing a cache for general stats")
+        try:
+            # Labels widget
+            dstats.load_or_prepare_labels()
+        except:
+            logs.warning("Missing a cache for prepare labels")
+        try:
+            # Text lengths widget
+            dstats.load_or_prepare_text_lengths()
+        except:
+            logs.warning("Missing a cache for text lengths")
+        if show_embeddings:
+            try:
+                # Embeddings widget
+                dstats.load_or_prepare_embeddings()
+            except:
+                logs.warning("Missing a cache for embeddings")
+        try:
+            dstats.load_or_prepare_text_duplicates()
+        except:
+            logs.warning("Missing a cache for text duplicates")
+        try:
+            dstats.load_or_prepare_npmi()
+        except:
+            logs.warning("Missing a cache for npmi")
+        try:
+            dstats.load_or_prepare_zipf()
+        except:
+            logs.warning("Missing a cache for zipf")
+    return dstats, cache_dir_exists
+def show_column(dstats, ds_name_to_dict, show_embeddings, column_id):
+    """
+    Function for displaying the elements in the right column of the streamlit app.
+    Args:
+        ds_name_to_dict (dict): the dataset name and options in dictionary form
+        show_embeddings (Bool): whether embeddings should we loaded and displayed for this dataset
+        column_id (str): what column of the dataset the analysis is done on
+    Returns:
+        The function displays the information using the functions defined in the st_utils class.
+    """
+    # Note that at this point we assume we can use cache; default value is True.
+    # start showing stuff
+    title_str = f"### Showing{column_id}: {dstats.dset_name} - {dstats.dset_config} - {dstats.split_name} - {'-'.join(dstats.text_field)}"
+    st.markdown(title_str)
+    logs.info("showing header")
+    st_utils.expander_header(dstats, ds_name_to_dict, column_id)
+    logs.info("showing general stats")
+    st_utils.expander_general_stats(dstats, column_id)
+    st_utils.expander_label_distribution(dstats.fig_labels, column_id)
+    st_utils.expander_text_lengths(dstats, column_id)
+    st_utils.expander_text_duplicates(dstats, column_id)
+    # Uses an interaction; handled a bit differently than other widgets.
+    logs.info("showing npmi widget")
+    st_utils.npmi_widget(dstats.npmi_stats, _MIN_VOCAB_COUNT, column_id)
+    logs.info("showing zipf")
+    st_utils.expander_zipf(dstats.z, dstats.zipf_fig, column_id)
+    if show_embeddings:
+        st_utils.expander_text_embeddings(
+            dstats.text_dset,
+            dstats.fig_tree,
+            dstats.node_list,
+            dstats.embeddings,
+            OUR_TEXT_FIELD,
+            column_id,
+        )
+def main():
+    """ Sidebar description and selection """
+    ds_name_to_dict = dataset_utils.get_dataset_info_dicts()
+    st.title("Data Measurements Tool")
+    # Get the sidebar details
+    st_utils.sidebar_header()
+    # Set up naming, configs, and cache path.
+    compare_mode = st.sidebar.checkbox("Comparison mode")
+    # When not doing new development, use the cache.
+    use_cache = True
+    show_embeddings = st.sidebar.checkbox("Show text clusters")
+    # List of datasets for which embeddings are hard to compute:
+    if compare_mode:
+        logs.warning("Using Comparison Mode")
+        dataset_args_left = st_utils.sidebar_selection(ds_name_to_dict, " A")
+        dataset_args_right = st_utils.sidebar_selection(ds_name_to_dict, " B")
+        left_col, _, right_col = st.columns([10, 1, 10])
+        dstats_left, cache_exists_left = load_or_prepare_widgets(
+            dataset_args_left, show_embeddings, use_cache=use_cache
+        )
+        with left_col:
+            if cache_exists_left:
+                show_column(dstats_left, ds_name_to_dict, show_embeddings, " A")
+            else:
+                st.markdown("### Missing pre-computed data measures!")
+                st.write(dataset_args_left)
+        dstats_right, cache_exists_right = load_or_prepare_widgets(
+            dataset_args_right, show_embeddings, use_cache=use_cache
+        )
+        with right_col:
+            if cache_exists_right:
+                show_column(dstats_right, ds_name_to_dict, show_embeddings, " B")
+            else:
+                st.markdown("### Missing pre-computed data measures!")
+                st.write(dataset_args_right)
+    else:
+        logs.warning("Using Single Dataset Mode")
+        dataset_args = st_utils.sidebar_selection(ds_name_to_dict, "")
+        dstats, cache_exists = load_or_prepare_widgets(dataset_args, show_embeddings, use_cache=use_cache)
+        if cache_exists:
+            show_column(dstats, ds_name_to_dict, show_embeddings, "")
+        else:
+            st.markdown("### Missing pre-computed data measures!")
+            st.write(dataset_args)
+if __name__ == "__main__":
+    main()

Scripts/run.sh ADDED Viewed

	@@ -0,0 +1,112 @@

+#!/usr/bin/env bash
+python3 run_data_measurements.py --dataset="hate_speech18" --config="default" --split="train" --label_field="label" --feature="text"
+python3 run_data_measurements.py --dataset="hate_speech_offensive" --config="default" --split="train" --label_field="label" --feature="tweet"
+python3 run_data_measurements.py --dataset="imdb" --config="plain_text" --split="train" --label_field="label" --feature="text"
+python3 run_data_measurements.py --dataset="imdb" --config="plain_text" --split="unsupervised" --label_field="label" --feature="text"
+python3 run_data_measurements.py --dataset="glue" --config="cola" --split="train" --label_field="label" --feature="sentence"
+python3 run_data_measurements.py --dataset="glue" --config="cola" --split="validation" --label_field="label" --feature="sentence"
+python3 run_data_measurements.py --dataset="glue" --config="mnli" --split="train" --label_field="label" --feature="hypothesis"
+python3 run_data_measurements.py --dataset="glue" --config="mnli" --split="train" --label_field="label" --feature="premise"
+python3 run_data_measurements.py --dataset="glue" --config="mnli" --split="validation_matched" --label_field="label" --feature="premise"
+python3 run_data_measurements.py --dataset="glue" --config="mnli" --split="validation_matched" --label_field="label" --feature="hypothesis"
+python3 run_data_measurements.py --dataset="glue" --config="mnli" --split="validation_mismatched" --label_field="label" --feature="premise"
+python3 run_data_measurements.py --dataset="glue" --config="mnli" --split="validation_mismatched" --label_field="label" --feature="hypothesis"
+python3 run_data_measurements.py --dataset="glue" --config="mrpc" --split="train" --label_field="label" --feature="sentence1"
+python3 run_data_measurements.py --dataset="glue" --config="mrpc" --split="train" --label_field="label" --feature="sentence2"
+python3 run_data_measurements.py --dataset="glue" --config="mrpc" --split="validation" --label_field="label" --feature="sentence1"
+python3 run_data_measurements.py --dataset="glue" --config="mrpc" --split="validation" --label_field="label" --feature="sentence2"
+python3 run_data_measurements.py --dataset="glue" --config="rte" --split="train" --label_field="label" --feature="sentence1"
+python3 run_data_measurements.py --dataset="glue" --config="rte" --split="train" --label_field="label" --feature="sentence2"
+python3 run_data_measurements.py --dataset="glue" --config="rte" --split="validation" --label_field="label" --feature="sentence1"
+python3 run_data_measurements.py --dataset="glue" --config="rte" --split="validation" --label_field="label" --feature="sentence2"
+python3 run_data_measurements.py --dataset="glue" --config="stsb" --split="train" --label_field="label" --feature="sentence1"
+python3 run_data_measurements.py --dataset="glue" --config="stsb" --split="train" --label_field="label" --feature="sentence2"
+python3 run_data_measurements.py --dataset="glue" --config="stsb" --split="validation" --label_field="label" --feature="sentence1"
+python3 run_data_measurements.py --dataset="glue" --config="stsb" --split="validation" --label_field="label" --feature="sentence2"
+python3 run_data_measurements.py --dataset="glue" --config="wnli" --split="train" --label_field="label" --feature="sentence1"
+python3 run_data_measurements.py --dataset="glue" --config="wnli" --split="train" --label_field="label" --feature="sentence2"
+python3 run_data_measurements.py --dataset="glue" --config="wnli" --split="validation" --label_field="label" --feature="sentence1"
+python3 run_data_measurements.py --dataset="glue" --config="wnli" --split="validation" --label_field="label" --feature="sentence2"
+python3 run_data_measurements.py --dataset="glue" --config="sst2" --split="train" --label_field="label" --feature="sentence"
+python3 run_data_measurements.py --dataset="glue" --config="sst2" --split="validation" --label_field="label" --feature="sentence"
+python3 run_data_measurements.py --dataset="glue" --config="qnli" --split="train" --label_field="label" --feature="question"
+python3 run_data_measurements.py --dataset="glue" --config="qnli" --split="train" --label_field="label" --feature="sentence"
+python3 run_data_measurements.py --dataset="glue" --config="qnli" --split="validation" --label_field="label" --feature="question"
+python3 run_data_measurements.py --dataset="glue" --config="qnli" --split="validation" --label_field="label" --feature="sentence"
+python3 run_data_measurements.py --dataset="glue" --config="qqp" --split="train" --label_field="label" --feature="question1"
+python3 run_data_measurements.py --dataset="glue" --config="qqp" --split="train" --label_field="label" --feature="question2"
+python3 run_data_measurements.py --dataset="glue" --config="qqp" --split="validation" --label_field="label" --feature="question1"
+python3 run_data_measurements.py --dataset="glue" --config="qqp" --split="validation" --label_field="label" --feature="question2"
+python3 run_data_measurements.py --dataset="glue" --config="mnli_matched" --split="validation" --label_field="label" --feature="hypothesis"
+python3 run_data_measurements.py --dataset="glue" --config="mnli_matched" --split="validation" --label_field="label" --feature="premise"
+python3 run_data_measurements.py --dataset="glue" --config="mnli_mismatched" --split="validation" --label_field="label" --feature="hypothesis"
+python3 run_data_measurements.py --dataset="glue" --config="mnli_mismatched" --split="validation" --label_field="label" --feature="premise"
+python3 run_data_measurements.py --dataset="wikitext" --config="wikitext-103-v1" --split="train" --feature="text"
+python3 run_data_measurements.py --dataset="wikitext" --config="wikitext-103-raw-v1" --split="train" --feature="text"
+python3 run_data_measurements.py --dataset="wikitext" --config="wikitext-2-v1" --split="train" --feature="text"
+python3 run_data_measurements.py --dataset="wikitext" --config="wikitext-2-raw-v1" --split="train" --feature="text"
+python3 run_data_measurements.py --dataset="wikitext" --config="wikitext-103-v1" --split="validation" --feature="text"
+python3 run_data_measurements.py --dataset="wikitext" --config="wikitext-103-raw-v1" --split="validation" --feature="text"
+python3 run_data_measurements.py --dataset="wikitext" --config="wikitext-2-v1" --split="validation" --feature="text"
+python3 run_data_measurements.py --dataset="wikitext" --config="wikitext-2-raw-v1" --split="validation" --feature="text"
+# Superglue wsc? wic? rte? record? multirc?
+python3 run_data_measurements.py --dataset="super_glue" --config="boolq" --split="train" --label_field="label" --feature="question"
+python3 run_data_measurements.py --dataset="super_glue" --config="boolq" --split="validation" --label_field="label" --feature="question"
+python3 run_data_measurements.py --dataset="super_glue" --config="boolq" --split="train" --label_field="label" --feature="passage"
+python3 run_data_measurements.py --dataset="super_glue" --config="boolq" --split="validation" --label_field="label" --feature="passage"
+python3 run_data_measurements.py --dataset="super_glue" --config="cb" --split="train" --label_field="label" --feature="premise"
+python3 run_data_measurements.py --dataset="super_glue" --config="cb" --split="validation" --label_field="label" --feature="premise"
+python3 run_data_measurements.py --dataset="super_glue" --config="cb" --split="train" --label_field="label" --feature="hypothesis"
+python3 run_data_measurements.py --dataset="super_glue" --config="cb" --split="validation" --label_field="label" --feature="hypothesis"
+python3 run_data_measurements.py --dataset="super_glue" --config="copa" --split="train" --label_field="label" --feature="premise"
+python3 run_data_measurements.py --dataset="super_glue" --config="copa" --split="validation" --label_field="label" --feature="premise"
+python3 run_data_measurements.py --dataset="super_glue" --config="copa" --split="train" --label_field="label" --feature="choice1"
+python3 run_data_measurements.py --dataset="super_glue" --config="copa" --split="validation" --label_field="label" --feature="choice1"
+python3 run_data_measurements.py --dataset="super_glue" --config="copa" --split="train" --label_field="label" --feature="choice2"
+python3 run_data_measurements.py --dataset="super_glue" --config="copa" --split="validation" --label_field="label" --feature="choice2"
+python3 run_data_measurements.py --dataset="super_glue" --config="copa" --split="train" --label_field="label" --feature="question"
+python3 run_data_measurements.py --dataset="super_glue" --config="copa" --split="validation" --label_field="label" --feature="question"
+python3 run_data_measurements.py --dataset="squad" --config="plain_text" --split="train" --feature="context"
+python3 run_data_measurements.py --dataset="squad" --config="plain_text" --split="train" --feature="question"
+python3 run_data_measurements.py --dataset="squad" --config="plain_text" --split="train" --feature="title"
+python3 run_data_measurements.py --dataset="squad" --config="plain_text" --split="validation" --feature="context"
+python3 run_data_measurements.py --dataset="squad" --config="plain_text" --split="validation" --feature="question"
+python3 run_data_measurements.py --dataset="squad" --config="plain_text" --split="validation" --feature="title"
+python3 run_data_measurements.py --dataset="squad_v2" --config="squad_v2" --split="train" --feature="context"
+python3 run_data_measurements.py --dataset="squad_v2" --config="squad_v2" --split="train" --feature="question"
+python3 run_data_measurements.py --dataset="squad_v2" --config="squad_v2" --split="train" --feature="title"
+python3 run_data_measurements.py --dataset="squad_v2" --config="squad_v2" --split="validation" --feature="context"
+python3 run_data_measurements.py --dataset="squad_v2" --config="squad_v2" --split="validation" --feature="question"
+python3 run_data_measurements.py --dataset="squad_v2" --config="squad_v2" --split="validation" --feature="title"

app.py ADDED Viewed

	@@ -0,0 +1,256 @@

+# Copyright 2021 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+from os import mkdir
+from os.path import exists, isdir
+from pathlib import Path
+# #! pip install streamlit
+import streamlit as st
+# +
+# #! pip install datasets
+# #! pip install powerlaw
+# -
+from data_measurements import dataset_statistics, dataset_utils
+from data_measurements import streamlit_utils as st_utils
+# Module-level logger for the app; its records are not passed on to ancestor loggers.
+logs = logging.getLogger(__name__)
+logs.setLevel(logging.WARNING)
+logs.propagate = False
+# Attach handlers only when this logger does not have any yet.
+if not logs.handlers:
+    Path('./log_files').mkdir(exist_ok=True)
+    # File handler writing timestamped messages to ./log_files/app.log.
+    # NOTE(review): this handler accepts INFO, but the logger level set above is
+    # WARNING, so logs.info(...) records are filtered out before they get here --
+    # confirm whether that is intended.
+    file = logging.FileHandler("./log_files/app.log")
+    fileformat = logging.Formatter("%(asctime)s:%(message)s")
+    file.setLevel(logging.INFO)
+    file.setFormatter(fileformat)
+    # Stream handler emitting WARNING and above, prefixed with the tool name.
+    stream = logging.StreamHandler()
+    streamformat = logging.Formatter("[data_measurements_tool] %(message)s")
+    stream.setLevel(logging.WARNING)
+    stream.setFormatter(streamformat)
+    logs.addHandler(file)
+    logs.addHandler(stream)
+# Page-wide Streamlit settings: title, icon, wide layout, automatic sidebar state.
+st.set_page_config(
+    page_title="Demo to showcase dataset metrics",
+    page_icon="https://huggingface.co/front/assets/huggingface_logo.svg",
+    layout="wide",
+    initial_sidebar_state="auto",
+)
+# colorblind-friendly colors
+colors = [
+    "#332288",
+    "#117733",
+    "#882255",
+    "#AA4499",
+    "#CC6677",
+    "#44AA99",
+    "#DDCC77",
+    "#88CCEE",
+]
+# Cache location, taken from dataset_utils.
+CACHE_DIR = dataset_utils.CACHE_DIR
+# String names we are using (not coming from the stored dataset).
+OUR_TEXT_FIELD = dataset_utils.OUR_TEXT_FIELD
+OUR_LABEL_FIELD = dataset_utils.OUR_LABEL_FIELD
+TOKENIZED_FIELD = dataset_utils.TOKENIZED_FIELD
+EMBEDDING_FIELD = dataset_utils.EMBEDDING_FIELD
+LENGTH_FIELD = dataset_utils.LENGTH_FIELD
+# TODO: Allow users to specify this.
+# _MIN_VOCAB_COUNT is the value handed to st_utils.npmi_widget in show_column.
+_MIN_VOCAB_COUNT = 10
+_SHOW_TOP_N_WORDS = 10
+@st.cache(
+    hash_funcs={
+        dataset_statistics.DatasetStatisticsCacheClass: lambda dstats: dstats.cache_path
+    },
+    allow_output_mutation=True,
+)
+def load_or_prepare(ds_args, show_embeddings, use_cache=False):
+    """
+    Takes the dataset arguments from the GUI and uses them to load a dataset from the Hub or, if
+    a cache for those arguments is available, to load it from the cache.
+    Args:
+        ds_args (dict): the dataset arguments defined via the streamlit app GUI
+        show_embeddings (Bool): whether embeddings should we loaded and displayed for this dataset
+        use_cache (Bool) : whether the cache is used by default or not
+    Returns:
+        dstats: the computed dataset statistics (from the dataset_statistics class)
+    """
+    if not isdir(CACHE_DIR):
+        logs.warning("Creating cache")
+        # We need to preprocess everything.
+        # This should eventually all go into a prepare_dataset CLI
+        mkdir(CACHE_DIR)
+    if use_cache:
+        logs.warning("Using cache")
+    dstats = dataset_statistics.DatasetStatisticsCacheClass(CACHE_DIR, **ds_args, use_cache=use_cache)
+    logs.warning("Loading dataset")
+    dstats.load_or_prepare_dataset()
+    if show_embeddings:
+        logs.warning("Loading Embeddings")
+        dstats.load_or_prepare_embeddings()
+    logs.warning("Loading nPMI")
+    try:
+        dstats.load_or_prepare_npmi()
+    except:
+        logs.warning("Missing a cache for npmi")
+    return dstats
+@st.cache(
+    hash_funcs={
+        dataset_statistics.DatasetStatisticsCacheClass: lambda dstats: dstats.cache_path
+    },
+    allow_output_mutation=True,
+)
+def load_or_prepare_widgets(ds_args, show_embeddings, use_cache=False):
+    """
+    Loader specifically for the widgets used in the app.
+    Args:
+        ds_args:
+        show_embeddings:
+        use_cache:
+    Returns:
+    """
+    if use_cache:
+        logs.warning("Using cache")
+    if True:
+    #try:
+        dstats = dataset_statistics.DatasetStatisticsCacheClass(CACHE_DIR, **ds_args, use_cache=use_cache)
+        # Don't recalculate; we're live
+        dstats.set_deployment(True)
+        # checks whether the cache_dir exists in deployment mode
+        # creates cache_dir if not and if in development mode
+        cache_dir_exists = dstats.check_cache_dir()
+    #except:
+    #    logs.warning("We're screwed")
+    if cache_dir_exists:
+        try:
+            # We need to have the text_dset loaded for further load_or_prepare
+            dstats.load_or_prepare_dataset()
+        except:
+            logs.warning("Missing a cache for load or prepare dataset")
+        try:
+            # Header widget
+            dstats.load_or_prepare_dset_peek()
+        except:
+            logs.warning("Missing a cache for dset peek")
+        if show_embeddings:
+            try:
+                # Embeddings widget
+                dstats.load_or_prepare_embeddings()
+            except:
+                logs.warning("Missing a cache for embeddings")
+        try:
+            dstats.load_or_prepare_text_duplicates()
+        except:
+            logs.warning("Missing a cache for text duplicates")
+        try:
+            dstats.load_or_prepare_npmi()
+        except:
+            logs.warning("Missing a cache for npmi")
+    return dstats, cache_dir_exists
+def show_column(dstats, ds_name_to_dict, show_embeddings, column_id):
+    """
+    Function for displaying the elements in the right column of the streamlit app.
+    Args:
+        ds_name_to_dict (dict): the dataset name and options in dictionary form
+        show_embeddings (Bool): whether embeddings should we loaded and displayed for this dataset
+        column_id (str): what column of the dataset the analysis is done on
+    Returns:
+        The function displays the information using the functions defined in the st_utils class.
+    """
+    # Note that at this point we assume we can use cache; default value is True.
+    # start showing stuff
+    title_str = f"### Showing{column_id}: {dstats.dset_name} - {dstats.dset_config} - {dstats.split_name} - {'-'.join(dstats.text_field)}"
+    st.markdown(title_str)
+    # Uses an interaction; handled a bit differently than other widgets.
+    logs.info("showing npmi widget")
+    st_utils.npmi_widget(dstats.npmi_stats, _MIN_VOCAB_COUNT, column_id)
+    if show_embeddings:
+        st_utils.expander_text_embeddings(
+            dstats.text_dset,
+            dstats.fig_tree,
+            dstats.node_list,
+            dstats.embeddings,
+            OUR_TEXT_FIELD,
+            column_id,
+        )
+def main():
+    """ Sidebar description and selection """
+    ds_name_to_dict = dataset_utils.get_dataset_info_dicts()
+    st.title("Data Measurements Tool")
+    # Get the sidebar details
+    st_utils.sidebar_header()
+    # Set up naming, configs, and cache path.
+    compare_mode = st.sidebar.checkbox("Comparison mode")
+    # When not doing new development, use the cache.
+    use_cache = True
+    show_embeddings = st.sidebar.checkbox("Show text clusters")
+    # List of datasets for which embeddings are hard to compute:
+    if compare_mode:
+        logs.warning("Using Comparison Mode")
+        dataset_args_left = st_utils.sidebar_selection(ds_name_to_dict, " A")
+        dataset_args_right = st_utils.sidebar_selection(ds_name_to_dict, " B")
+        left_col, _, right_col = st.columns([10, 1, 10])
+        dstats_left, cache_exists_left = load_or_prepare_widgets(
+            dataset_args_left, show_embeddings, use_cache=use_cache
+        )
+        with left_col:
+            if cache_exists_left:
+                show_column(dstats_left, ds_name_to_dict, show_embeddings, " A")
+            else:
+                st.markdown("### Missing pre-computed data measures!")
+                st.write(dataset_args_left)
+        dstats_right, cache_exists_right = load_or_prepare_widgets(
+            dataset_args_right, show_embeddings, use_cache=use_cache
+        )
+        with right_col:
+            if cache_exists_right:
+                show_column(dstats_right, ds_name_to_dict, show_embeddings, " B")
+            else:
+                st.markdown("### Missing pre-computed data measures!")
+                st.write(dataset_args_right)
+    else:
+        logs.warning("Using Single Dataset Mode")
+        dataset_args = st_utils.sidebar_selection(ds_name_to_dict, "")
+        dstats, cache_exists = load_or_prepare_widgets(dataset_args, show_embeddings, use_cache=use_cache)
+        if cache_exists:
+            show_column(dstats, ds_name_to_dict, show_embeddings, "")
+        else:
+            st.markdown("### Missing pre-computed data measures!")
+            st.write(dataset_args)
+# Entry point: build the app only when this file is executed as a script.
+if __name__ == "__main__":
+    main()

data_measurements/__init__.py ADDED Viewed

File without changes

data_measurements/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (171 Bytes). View file

data_measurements/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (170 Bytes). View file

data_measurements/__pycache__/dataset_statistics.cpython-310.pyc ADDED Viewed

Binary file (32.6 kB). View file

data_measurements/__pycache__/dataset_statistics.cpython-311.pyc ADDED Viewed

Binary file (62.3 kB). View file

data_measurements/__pycache__/dataset_utils.cpython-310.pyc ADDED Viewed

Binary file (7.1 kB). View file

data_measurements/__pycache__/dataset_utils.cpython-311.pyc ADDED Viewed

Binary file (11.5 kB). View file

data_measurements/__pycache__/embeddings.cpython-310.pyc ADDED Viewed

Binary file (16.5 kB). View file

data_measurements/__pycache__/embeddings.cpython-311.pyc ADDED Viewed

Binary file (28.9 kB). View file

data_measurements/__pycache__/npmi.cpython-310.pyc ADDED Viewed

Binary file (6.3 kB). View file

data_measurements/__pycache__/npmi.cpython-311.pyc ADDED Viewed

Binary file (12.6 kB). View file

data_measurements/__pycache__/streamlit_utils.cpython-310.pyc ADDED Viewed

Binary file (16.2 kB). View file

data_measurements/__pycache__/streamlit_utils.cpython-311.pyc ADDED Viewed

Binary file (27.8 kB). View file

data_measurements/__pycache__/zipf.cpython-310.pyc ADDED Viewed

Binary file (7.26 kB). View file

data_measurements/__pycache__/zipf.cpython-311.pyc ADDED Viewed

Binary file (12.7 kB). View file

data_measurements/_pycache_/__init__.cpython-311.pyc ADDED Viewed

Binary file (237 Bytes). View file

data_measurements/_pycache_/__init__.cpython-37.pyc ADDED Viewed

Binary file (166 Bytes). View file

data_measurements/_pycache_/dataset_statistics.cpython-311.pyc ADDED Viewed

Binary file (62.4 kB). View file

data_measurements/_pycache_/dataset_statistics.cpython-37.pyc ADDED Viewed

Binary file (31.6 kB). View file

data_measurements/_pycache_/dataset_utils.cpython-311.pyc ADDED Viewed

Binary file (11.6 kB). View file

data_measurements/_pycache_/dataset_utils.cpython-37.pyc ADDED Viewed

Binary file (7.04 kB). View file

data_measurements/_pycache_/embeddings.cpython-311.pyc ADDED Viewed

Binary file (28.9 kB). View file

data_measurements/_pycache_/embeddings.cpython-37.pyc ADDED Viewed

Binary file (16.5 kB). View file

data_measurements/_pycache_/npmi.cpython-311.pyc ADDED Viewed

Binary file (12.7 kB). View file

data_measurements/_pycache_/npmi.cpython-37.pyc ADDED Viewed

Binary file (6.23 kB). View file

data_measurements/_pycache_/streamlit_utils.cpython-311.pyc ADDED Viewed

Binary file (27.8 kB). View file

data_measurements/_pycache_/zipf.cpython-311.pyc ADDED Viewed

Binary file (12.8 kB). View file

data_measurements/_pycache_/zipf.cpython-37.pyc ADDED Viewed

Binary file (7.24 kB). View file

data_measurements/dataset_statistics.py ADDED Viewed

	@@ -0,0 +1,1223 @@

+# Copyright 2021 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import json
+import logging
+import statistics
+from os import mkdir
+from os.path import exists, isdir
+from os.path import join as pjoin
+import matplotlib.pyplot as plt
+import matplotlib.image as mpimg
+import nltk
+import numpy as np
+import pandas as pd
+import plotly
+import plotly.express as px
+import plotly.figure_factory as ff
+import plotly.graph_objects as go
+import pyarrow.feather as feather
+import seaborn as sns
+import torch
+from datasets import load_from_disk
+from nltk.corpus import stopwords
+from sklearn.feature_extraction.text import CountVectorizer
+from .dataset_utils import (CNT, DEDUP_TOT, EMBEDDING_FIELD, LENGTH_FIELD,
+                            OUR_LABEL_FIELD, OUR_TEXT_FIELD, PROP,
+                            TEXT_NAN_CNT, TOKENIZED_FIELD, TOT_OPEN_WORDS,
+                            TOT_WORDS, TXT_LEN, VOCAB, WORD, extract_field,
+                            load_truncated_dataset)
+from .embeddings import Embeddings
+from .npmi import nPMI
+from .zipf import Zipf
+pd.options.display.float_format = "{:,.3f}".format
+logs = logging.getLogger(__name__)
+logs.setLevel(logging.WARNING)
+logs.propagate = False
+if not logs.handlers:
+    # Logging info to log file
+    file = logging.FileHandler("./log_files/dataset_statistics.log")
+    fileformat = logging.Formatter("%(asctime)s:%(message)s")
+    file.setLevel(logging.INFO)
+    file.setFormatter(fileformat)
+    # Logging debug messages to stream
+    stream = logging.StreamHandler()
+    streamformat = logging.Formatter("[data_measurements_tool] %(message)s")
+    stream.setLevel(logging.WARNING)
+    stream.setFormatter(streamformat)
+    logs.addHandler(file)
+    logs.addHandler(stream)
+# TODO: Read this in depending on chosen language / expand beyond english
+# NOTE: this download call is executed every time the module is imported.
+nltk.download("stopwords")
+# Closed-class words: the NLTK English stopword list, extended with
+# contraction fragments / informal spellings and the numerals "0".."20" as
+# strings. Presumably consumed by the vocabulary filtering helper defined
+# elsewhere in this file -- TODO confirm.
+_CLOSED_CLASS = (
+    stopwords.words("english")
+    + [
+        "t",
+        "n",
+        "ll",
+        "d",
+        "wasn",
+        "weren",
+        "won",
+        "aren",
+        "wouldn",
+        "shouldn",
+        "didn",
+        "don",
+        "hasn",
+        "ain",
+        "couldn",
+        "doesn",
+        "hadn",
+        "haven",
+        "isn",
+        "mightn",
+        "mustn",
+        "needn",
+        "shan",
+        "would",
+        "could",
+        "dont",
+        "u",
+    ]
+    + [str(i) for i in range(0, 21)]
+)
+# Default identity terms offered for the nPMI analysis; used as both the term
+# list and the initial "available terms" of nPMIStatisticsCacheClass.
+_IDENTITY_TERMS = [
+    "man",
+    "woman",
+    "non-binary",
+    "gay",
+    "lesbian",
+    "queer",
+    "trans",
+    "straight",
+    "cis",
+    "she",
+    "her",
+    "hers",
+    "he",
+    "him",
+    "his",
+    "they",
+    "them",
+    "their",
+    "theirs",
+    "himself",
+    "herself",
+]
+# treating inf values as NaN as well
+# NOTE(review): this is a process-wide pandas option set at import time; the
+# "use_inf_as_na" option is deprecated in recent pandas releases -- verify
+# against the pinned pandas version.
+pd.set_option("use_inf_as_na", True)
+# Minimum number of occurrences for a word to be included in word-count-based
+# calculations (copied onto each DatasetStatisticsCacheClass instance).
+_MIN_VOCAB_COUNT = 10
+# Presumably parameters of the hierarchical clustering tree built by the
+# embeddings module -- not referenced in this part of the file, TODO confirm.
+_TREE_DEPTH = 12
+_TREE_MIN_NODES = 250
+# as long as we're using sklearn - already pushing the resources
+_MAX_CLUSTER_EXAMPLES = 5000
+_NUM_VOCAB_BATCHES = 2000
+# Number of rows kept in the "sorted top vocab" table of the general stats.
+_TOP_N = 100
+# Shared vectorizer; its tokenizer treats every run of word characters
+# (including single characters) as a token.
+_CVEC = CountVectorizer(token_pattern="(?u)\\b\\w+\\b", lowercase=True)
+class DatasetStatisticsCacheClass:
+    def __init__(
+        self,
+        cache_dir,
+        dset_name,
+        dset_config,
+        split_name,
+        text_field,
+        label_field,
+        label_names,
+        calculation=None,
+        use_cache=False,
+    ):
+        """
+        Record what is being analysed, reset all computed state to empty
+        values and derive the paths of every cache file.
+        Nothing is loaded or computed here, and no directory is created.
+        Args:
+            cache_dir: Directory under which the per-dataset cache directory
+                lives.
+            dset_name: Name of the Hugging Face dataset.
+            dset_config: Name of the dataset config.
+            split_name: Name of the split to analyze.
+            text_field: Text field(s) to analyze; a string, or an iterable of
+                strings that is joined with "-" to build the cache path.
+            label_field: Label field(s) to analyze.
+            label_names: Names of the classes.
+            calculation: Only used for standalone runs for each kind of
+                measurement.
+            use_cache: Use stored data if there; otherwise calculate afresh.
+        """
+        # This is only used for standalone runs for each kind of measurement.
+        self.calculation = calculation
+        self.our_text_field = OUR_TEXT_FIELD
+        self.our_length_field = LENGTH_FIELD
+        self.our_label_field = OUR_LABEL_FIELD
+        self.our_tokenized_field = TOKENIZED_FIELD
+        self.our_embedding_field = EMBEDDING_FIELD
+        self.cache_dir = cache_dir
+        # Use stored data if there; otherwise calculate afresh
+        self.use_cache = use_cache
+        ### What are we analyzing?
+        # name of the Hugging Face dataset
+        self.dset_name = dset_name
+        # name of the dataset config
+        self.dset_config = dset_config
+        # name of the split to analyze
+        self.split_name = split_name
+        # TODO: Should this be "feature" ?
+        # which text fields are we analysing?
+        self.text_field = text_field
+        # which label fields are we analysing?
+        self.label_field = label_field
+        # what are the names of the classes?
+        self.label_names = label_names
+        ## Hugging Face dataset objects
+        self.dset = None  # original dataset
+        # HF dataset with all of the self.text_field instances in self.dset
+        self.text_dset = None
+        self.dset_peek = None
+        # HF dataset with text embeddings in the same order as self.text_dset
+        self.embeddings_dset = None
+        # HF dataset with all of the self.label_field instances in self.dset
+        self.label_dset = None
+        ## Data frames
+        # Tokenized text
+        self.tokenized_df = None
+        # save sentence length histogram in the class so it doesn't get re-computed
+        self.length_df = None
+        self.fig_tok_length = None
+        # Data Frame version of self.label_dset
+        self.label_df = None
+        # save label pie chart in the class so it doesn't get re-computed
+        self.fig_labels = None
+        # Vocabulary with word counts in the dataset
+        self.vocab_counts_df = None
+        # Vocabulary filtered to remove stopwords
+        self.vocab_counts_filtered_df = None
+        self.sorted_top_vocab_df = None
+        ## General statistics and duplicates
+        self.total_words = 0
+        self.total_open_words = 0
+        # Number of NaN values (NOT empty strings)
+        self.text_nan_count = 0
+        # Number of text items that appear more than once in the dataset
+        self.dedup_total = 0
+        # Duplicated text items along with their number of occurrences ("count")
+        self.dup_counts_df = None
+        self.avg_length = None
+        self.std_length = None
+        self.general_stats_dict = None
+        self.num_uniq_lengths = 0
+        # clustering text by embeddings
+        # the hierarchical clustering tree is represented as a list of nodes,
+        # the first is the root
+        self.node_list = []
+        # save tree figure in the class so it doesn't get re-computed
+        self.fig_tree = None
+        # keep Embeddings object around to explore clusters
+        self.embeddings = None
+        # nPMI
+        # Holds a nPMIStatisticsCacheClass object
+        self.npmi_stats = None
+        # TODO: Have lowercase be an option for a user to set.
+        self.to_lowercase = True
+        # The minimum amount of times a word should occur to be included in
+        # word-count-based calculations (currently just relevant to nPMI)
+        self.min_vocab_count = _MIN_VOCAB_COUNT
+        # zipf
+        self.z = None
+        self.zipf_fig = None
+        self.cvec = _CVEC
+        # File definitions
+        # path to the directory used for caching
+        # Only the local name is joined; self.text_field keeps the value that
+        # was passed in.
+        if not isinstance(text_field, str):
+            text_field = "-".join(text_field)
+        # if isinstance(label_field, str):
+        #    label_field = label_field
+        # else:
+        #    label_field = "-".join(label_field)
+        self.cache_path = pjoin(
+            self.cache_dir,
+            f"{dset_name}_{dset_config}_{split_name}_{text_field}",  # {label_field},
+        )
+        # Cache files not needed for UI
+        self.dset_fid = pjoin(self.cache_path, "base_dset")
+        self.tokenized_df_fid = pjoin(self.cache_path, "tokenized_df.feather")
+        self.label_dset_fid = pjoin(self.cache_path, "label_dset")
+        # Needed for UI -- embeddings
+        self.text_dset_fid = pjoin(self.cache_path, "text_dset")
+        # Needed for UI
+        self.dset_peek_json_fid = pjoin(self.cache_path, "dset_peek.json")
+        ## Label cache files.
+        # Needed for UI
+        self.fig_labels_json_fid = pjoin(self.cache_path, "fig_labels.json")
+        ## Length cache files
+        # Needed for UI
+        self.length_df_fid = pjoin(self.cache_path, "length_df.feather")
+        # Needed for UI
+        self.length_stats_json_fid = pjoin(self.cache_path, "length_stats.json")
+        self.vocab_counts_df_fid = pjoin(self.cache_path, "vocab_counts.feather")
+        # Needed for UI
+        self.dup_counts_df_fid = pjoin(self.cache_path, "dup_counts_df.feather")
+        # Needed for UI
+        self.fig_tok_length_fid = pjoin(self.cache_path, "fig_tok_length.png")
+        ## General text stats
+        # Needed for UI
+        self.general_stats_json_fid = pjoin(self.cache_path, "general_stats_dict.json")
+        # Needed for UI
+        self.sorted_top_vocab_df_fid = pjoin(
+            self.cache_path, "sorted_top_vocab.feather"
+        )
+        ## Zipf cache files
+        # Needed for UI
+        self.zipf_fid = pjoin(self.cache_path, "zipf_basic_stats.json")
+        # Needed for UI
+        self.zipf_fig_fid = pjoin(self.cache_path, "zipf_fig.json")
+        ## Embeddings cache files
+        # Needed for UI
+        self.node_list_fid = pjoin(self.cache_path, "node_list.th")
+        # Needed for UI
+        self.fig_tree_json_fid = pjoin(self.cache_path, "fig_tree.json")
+        # Deployment flag; when True the methods of this class skip computing
+        # and writing anything that is not already cached.
+        self.live = False
+    def set_deployment(self, live=True):
+        """
+        Function that we can hit when we deploy, so that cache files are not
+        written out/recalculated, but instead that part of the UI can be punted.
+        """
+        self.live = live
+    def check_cache_dir(self):
+        """
+        First function to call to create the cache directory.
+        If in deployment mode and cache directory does not already exist,
+        return False.
+        """
+        if self.live:
+            return isdir(self.cache_path)
+        else:
+            if not isdir(self.cache_path):
+                logs.warning("Creating cache directory %s." % self.cache_path)
+                mkdir(self.cache_path)
+            return isdir(self.cache_path)
+    def get_base_dataset(self):
+        """Gets a pointer to the truncated base dataset object."""
+        if not self.dset:
+            self.dset = load_truncated_dataset(
+                self.dset_name,
+                self.dset_config,
+                self.split_name,
+                cache_name=self.dset_fid,
+                use_cache=True,
+                use_streaming=True,
+            )
+    def load_or_prepare_general_stats(self, save=True):
+        """
+        Content for expander_general_stats widget.
+        Provides statistics for total words, total open words,
+        the sorted top vocab, the NaN count, and the duplicate count.
+        Args:
+        Returns:
+        """
+        # General statistics
+        if (
+            self.use_cache
+            and exists(self.general_stats_json_fid)
+            and exists(self.dup_counts_df_fid)
+            and exists(self.sorted_top_vocab_df_fid)
+        ):
+            logs.info("Loading cached general stats")
+            self.load_general_stats()
+        else:
+            if not self.live:
+                logs.info("Preparing general stats")
+                self.prepare_general_stats()
+                if save:
+                    write_df(self.sorted_top_vocab_df, self.sorted_top_vocab_df_fid)
+                    write_df(self.dup_counts_df, self.dup_counts_df_fid)
+                    write_json(self.general_stats_dict, self.general_stats_json_fid)
+    def load_or_prepare_text_lengths(self, save=True):
+        """
+        The text length widget relies on this function, which provides
+        a figure of the text lengths, some text length statistics, and
+        a text length dataframe to peruse.
+        Args:
+            save:
+        Returns:
+        """
+        # Text length figure
+        if self.use_cache and exists(self.fig_tok_length_fid):
+            self.fig_tok_length_png = mpimg.imread(self.fig_tok_length_fid)
+        else:
+            if not self.live:
+                self.prepare_fig_text_lengths()
+                if save:
+                    self.fig_tok_length.savefig(self.fig_tok_length_fid)
+        # Text length dataframe
+        if self.use_cache and exists(self.length_df_fid):
+            self.length_df = feather.read_feather(self.length_df_fid)
+        else:
+            if not self.live:
+                self.prepare_length_df()
+                if save:
+                    write_df(self.length_df, self.length_df_fid)
+        # Text length stats.
+        if self.use_cache and exists(self.length_stats_json_fid):
+            with open(self.length_stats_json_fid, "r") as f:
+                self.length_stats_dict = json.load(f)
+            self.avg_length = self.length_stats_dict["avg length"]
+            self.std_length = self.length_stats_dict["std length"]
+            self.num_uniq_lengths = self.length_stats_dict["num lengths"]
+        else:
+            if not self.live:
+                self.prepare_text_length_stats()
+                if save:
+                    write_json(self.length_stats_dict, self.length_stats_json_fid)
+    def prepare_length_df(self):
+        if not self.live:
+            if self.tokenized_df is None:
+                self.tokenized_df = self.do_tokenization()
+            self.tokenized_df[LENGTH_FIELD] = self.tokenized_df[TOKENIZED_FIELD].apply(
+                len
+            )
+            self.length_df = self.tokenized_df[
+                [LENGTH_FIELD, OUR_TEXT_FIELD]
+            ].sort_values(by=[LENGTH_FIELD], ascending=True)
+    def prepare_text_length_stats(self):
+        if not self.live:
+            if (
+                self.tokenized_df is None
+                or LENGTH_FIELD not in self.tokenized_df.columns
+                or self.length_df is None
+            ):
+                self.prepare_length_df()
+            avg_length = sum(self.tokenized_df[LENGTH_FIELD]) / len(
+                self.tokenized_df[LENGTH_FIELD]
+            )
+            self.avg_length = round(avg_length, 1)
+            std_length = statistics.stdev(self.tokenized_df[LENGTH_FIELD])
+            self.std_length = round(std_length, 1)
+            self.num_uniq_lengths = len(self.length_df["length"].unique())
+            self.length_stats_dict = {
+                "avg length": self.avg_length,
+                "std length": self.std_length,
+                "num lengths": self.num_uniq_lengths,
+            }
+    def prepare_fig_text_lengths(self):
+        if not self.live:
+            if (
+                self.tokenized_df is None
+                or LENGTH_FIELD not in self.tokenized_df.columns
+            ):
+                self.prepare_length_df()
+            self.fig_tok_length = make_fig_lengths(self.tokenized_df, LENGTH_FIELD)
+    def load_or_prepare_embeddings(self):
+        self.embeddings = Embeddings(self, use_cache=self.use_cache)
+        self.embeddings.make_hierarchical_clustering()
+        self.node_list = self.embeddings.node_list
+        self.fig_tree = self.embeddings.fig_tree
+    # get vocab with word counts
+    def load_or_prepare_vocab(self, save=True):
+        """
+        Calculates the vocabulary count from the tokenized text.
+        The resulting dataframes may be used in nPMI calculations, zipf, etc.
+        :param
+        :return:
+        """
+        if self.use_cache and exists(self.vocab_counts_df_fid):
+            logs.info("Reading vocab from cache")
+            self.load_vocab()
+            self.vocab_counts_filtered_df = filter_vocab(self.vocab_counts_df)
+        else:
+            logs.info("Calculating vocab afresh")
+            if self.tokenized_df is None:
+                self.tokenized_df = self.do_tokenization()
+                if save:
+                    logs.info("Writing out.")
+                    write_df(self.tokenized_df, self.tokenized_df_fid)
+            word_count_df = count_vocab_frequencies(self.tokenized_df)
+            logs.info("Making dfs with proportion.")
+            self.vocab_counts_df = calc_p_word(word_count_df)
+            self.vocab_counts_filtered_df = filter_vocab(self.vocab_counts_df)
+            if save:
+                logs.info("Writing out.")
+                write_df(self.vocab_counts_df, self.vocab_counts_df_fid)
+        logs.info("unfiltered vocab")
+        logs.info(self.vocab_counts_df)
+        logs.info("filtered vocab")
+        logs.info(self.vocab_counts_filtered_df)
+    def load_vocab(self):
+        with open(self.vocab_counts_df_fid, "rb") as f:
+            self.vocab_counts_df = feather.read_feather(f)
+        # Handling for changes in how the index is saved.
+        self.vocab_counts_df = self._set_idx_col_names(self.vocab_counts_df)
+    def load_or_prepare_text_duplicates(self, save=True):
+        if self.use_cache and exists(self.dup_counts_df_fid):
+            with open(self.dup_counts_df_fid, "rb") as f:
+                self.dup_counts_df = feather.read_feather(f)
+        elif self.dup_counts_df is None:
+            if not self.live:
+                self.prepare_text_duplicates()
+                if save:
+                    write_df(self.dup_counts_df, self.dup_counts_df_fid)
+        else:
+            if not self.live:
+                # This happens when self.dup_counts_df is already defined;
+                # This happens when general_statistics were calculated first,
+                # since general statistics requires the number of duplicates
+                if save:
+                    write_df(self.dup_counts_df, self.dup_counts_df_fid)
+    def load_general_stats(self):
+        self.general_stats_dict = json.load(
+            open(self.general_stats_json_fid, encoding="utf-8")
+        )
+        with open(self.sorted_top_vocab_df_fid, "rb") as f:
+            self.sorted_top_vocab_df = feather.read_feather(f)
+        self.text_nan_count = self.general_stats_dict[TEXT_NAN_CNT]
+        self.dedup_total = self.general_stats_dict[DEDUP_TOT]
+        self.total_words = self.general_stats_dict[TOT_WORDS]
+        self.total_open_words = self.general_stats_dict[TOT_OPEN_WORDS]
+    def prepare_general_stats(self):
+        if not self.live:
+            if self.tokenized_df is None:
+                logs.warning("Tokenized dataset not yet loaded; doing so.")
+                self.load_or_prepare_tokenized_df()
+            if self.vocab_counts_df is None:
+                logs.warning("Vocab not yet loaded; doing so.")
+                self.load_or_prepare_vocab()
+            self.sorted_top_vocab_df = self.vocab_counts_filtered_df.sort_values(
+                "count", ascending=False
+            ).head(_TOP_N)
+            self.total_words = len(self.vocab_counts_df)
+            self.total_open_words = len(self.vocab_counts_filtered_df)
+            self.text_nan_count = int(self.tokenized_df.isnull().sum().sum())
+            self.prepare_text_duplicates()
+            self.dedup_total = sum(self.dup_counts_df[CNT])
+            self.general_stats_dict = {
+                TOT_WORDS: self.total_words,
+                TOT_OPEN_WORDS: self.total_open_words,
+                TEXT_NAN_CNT: self.text_nan_count,
+                DEDUP_TOT: self.dedup_total,
+            }
+    def prepare_text_duplicates(self):
+        """
+        Build self.dup_counts_df: one row per duplicated text, holding a count
+        column (CNT) and the text itself (OUR_TEXT_FIELD), sorted by
+        descending count.
+        Loads the tokenized dataframe first when it is missing. Does nothing
+        in deployment mode (self.live).
+        """
+        if not self.live:
+            if self.tokenized_df is None:
+                self.load_or_prepare_tokenized_df()
+            # NOTE(review): DataFrame.duplicated() is called with its default
+            # `keep`, which leaves the first occurrence unmarked, so the sizes
+            # below would count the repeats after the first occurrence rather
+            # than all occurrences -- confirm this is the intended "count".
+            dup_df = self.tokenized_df[self.tokenized_df.duplicated([OUR_TEXT_FIELD])]
+            # pivot_table(..., aggfunc="size") gives the number of rows per
+            # distinct text; the texts form the index of the result.
+            self.dup_counts_df = pd.DataFrame(
+                dup_df.pivot_table(
+                    columns=[OUR_TEXT_FIELD], aggfunc="size"
+                ).sort_values(ascending=False),
+                columns=[CNT],
+            )
+            # Expose the texts as a regular column in addition to the index.
+            self.dup_counts_df[OUR_TEXT_FIELD] = self.dup_counts_df.index.copy()
+    def load_or_prepare_dataset(self, save=True):
+        """
+        Prepares the HF datasets and data frames containing the untokenized and
+        tokenized text as well as the label values.
+        self.tokenized_df is used further for calculating text lengths,
+        word counts, etc.
+        Args:
+            save: Store the calculated data to disk.
+        Returns:
+        """
+        logs.info("Doing text dset.")
+        self.load_or_prepare_text_dset(save)
+        #logs.info("Doing tokenized dataframe")
+        #self.load_or_prepare_tokenized_df(save)
+        logs.info("Doing dataset peek")
+        self.load_or_prepare_dset_peek(save)
+    def load_or_prepare_dset_peek(self, save=True):
+        if self.use_cache and exists(self.dset_peek_json_fid):
+            with open(self.dset_peek_json_fid, "r") as f:
+                self.dset_peek = json.load(f)["dset peek"]
+        else:
+            if not self.live:
+                if self.dset is None:
+                    self.get_base_dataset()
+                self.dset_peek = self.dset[:100]
+                if save:
+                    write_json({"dset peek": self.dset_peek}, self.dset_peek_json_fid)
+    def load_or_prepare_tokenized_df(self, save=True):
+        if self.use_cache and exists(self.tokenized_df_fid):
+            self.tokenized_df = feather.read_feather(self.tokenized_df_fid)
+        else:
+            if not self.live:
+                # tokenize all text instances
+                self.tokenized_df = self.do_tokenization()
+                if save:
+                    logs.warning("Saving tokenized dataset to disk")
+                    # save tokenized text
+                    write_df(self.tokenized_df, self.tokenized_df_fid)
+    def load_or_prepare_text_dset(self, save=True):
+        if self.use_cache and exists(self.text_dset_fid):
+            # load extracted text
+            self.text_dset = load_from_disk(self.text_dset_fid)
+            logs.warning("Loaded dataset from disk")
+            logs.info(self.text_dset)
+        # ...Or load it from the server and store it anew
+        else:
+            if not self.live:
+                self.prepare_text_dset()
+                if save:
+                    # save extracted text instances
+                    logs.warning("Saving dataset to disk")
+                    self.text_dset.save_to_disk(self.text_dset_fid)
+    def prepare_text_dset(self):
+        if not self.live:
+            self.get_base_dataset()
+            # extract all text instances
+            self.text_dset = self.dset.map(
+                lambda examples: extract_field(
+                    examples, self.text_field, OUR_TEXT_FIELD
+                ),
+                batched=True,
+                remove_columns=list(self.dset.features),
+            )
+            ##additon
+            self.text_dset = self.text_dset.filter(lambda ex: ex["text"] is not None)
+    def do_tokenization(self):
+        """
+        Tokenizes the dataset
+        :return:
+        """
+        if self.text_dset is None:
+            self.load_or_prepare_text_dset()
+        sent_tokenizer = self.cvec.build_tokenizer()
+        def tokenize_batch(examples):
+            # TODO: lowercase should be an option
+            res = {
+                TOKENIZED_FIELD: [
+                    tuple(sent_tokenizer(text.lower()))
+                    for text in examples[OUR_TEXT_FIELD]
+                ]
+            }
+            res[LENGTH_FIELD] = [len(tok_text) for tok_text in res[TOKENIZED_FIELD]]
+            return res
+        tokenized_dset = self.text_dset.map(
+            tokenize_batch,
+            batched=True,
+            # remove_columns=[OUR_TEXT_FIELD], keep around to print
+        )
+        tokenized_df = pd.DataFrame(tokenized_dset)
+        return tokenized_df
+    def set_label_field(self, label_field="label"):
+        """
+        Setter for label_field. Used in the CLI when a user asks for information
+         about labels, but does not specify the field;
+         'label' is assumed as a default.
+        """
+        self.label_field = label_field
+    def load_or_prepare_labels(self, save=True):
+        # TODO: This is in a transitory state for creating fig cache.
+        # Clean up to be caching and reading everything correctly.
+        """
+        Extracts labels from the Dataset
+        :return:
+        """
+        # extracted labels
+        if len(self.label_field) > 0:
+            if self.use_cache and exists(self.fig_labels_json_fid):
+                self.fig_labels = read_plotly(self.fig_labels_json_fid)
+            elif self.use_cache and exists(self.label_dset_fid):
+                # load extracted labels
+                self.label_dset = load_from_disk(self.label_dset_fid)
+                self.label_df = self.label_dset.to_pandas()
+                self.fig_labels = make_fig_labels(
+                    self.label_df, self.label_names, OUR_LABEL_FIELD
+                )
+                if save:
+                    write_plotly(self.fig_labels, self.fig_labels_json_fid)
+            else:
+                if not self.live:
+                    self.prepare_labels()
+                    if save:
+                        # save extracted label instances
+                        self.label_dset.save_to_disk(self.label_dset_fid)
+                        write_plotly(self.fig_labels, self.fig_labels_json_fid)
+    def prepare_labels(self):
+        if not self.live:
+            self.get_base_dataset()
+            self.label_dset = self.dset.map(
+                lambda examples: extract_field(
+                    examples, self.label_field, OUR_LABEL_FIELD
+                ),
+                batched=True,
+                remove_columns=list(self.dset.features),
+            )
+            self.label_df = self.label_dset.to_pandas()
+            self.fig_labels = make_fig_labels(
+                self.label_df, self.label_names, OUR_LABEL_FIELD
+            )
+    def load_or_prepare_npmi(self):
+        self.npmi_stats = nPMIStatisticsCacheClass(self, use_cache=self.use_cache)
+        self.npmi_stats.load_or_prepare_npmi_terms()
+    def load_or_prepare_zipf(self, save=True):
+        # TODO: Current UI only uses the fig, meaning the self.z here is irrelevant
+        # when only reading from cache. Either the UI should use it, or it should
+        # be removed when reading in cache
+        if self.use_cache and exists(self.zipf_fig_fid) and exists(self.zipf_fid):
+            with open(self.zipf_fid, "r") as f:
+                zipf_dict = json.load(f)
+            self.z = Zipf()
+            self.z.load(zipf_dict)
+            self.zipf_fig = read_plotly(self.zipf_fig_fid)
+        elif self.use_cache and exists(self.zipf_fid):
+            # TODO: Read zipf data so that the vocab is there.
+            with open(self.zipf_fid, "r") as f:
+                zipf_dict = json.load(f)
+            self.z = Zipf()
+            self.z.load(zipf_dict)
+            self.zipf_fig = make_zipf_fig(self.vocab_counts_df, self.z)
+            if save:
+                write_plotly(self.zipf_fig, self.zipf_fig_fid)
+        else:
+            self.z = Zipf(self.vocab_counts_df)
+            self.zipf_fig = make_zipf_fig(self.vocab_counts_df, self.z)
+            if save:
+                write_zipf_data(self.z, self.zipf_fid)
+                write_plotly(self.zipf_fig, self.zipf_fig_fid)
+    def _set_idx_col_names(self, input_vocab_df):
+        if input_vocab_df.index.name != VOCAB and VOCAB in input_vocab_df.columns:
+            input_vocab_df = input_vocab_df.set_index([VOCAB])
+            input_vocab_df[VOCAB] = input_vocab_df.index
+        return input_vocab_df
+class nPMIStatisticsCacheClass:
+    """ "Class to interface between the app and the nPMI class
+    by calling the nPMI class with the user's selections."""
+    def __init__(self, dataset_stats, use_cache=False):
+        self.live = dataset_stats.live
+        self.dstats = dataset_stats
+        self.pmi_cache_path = pjoin(self.dstats.cache_path, "pmi_files")
+        if not isdir(self.pmi_cache_path):
+            logs.warning("Creating pmi cache directory %s." % self.pmi_cache_path)
+            # We need to preprocess everything.
+            mkdir(self.pmi_cache_path)
+        self.joint_npmi_df_dict = {}
+        # TODO: Users ideally can type in whatever words they want.
+        self.termlist = _IDENTITY_TERMS
+        # termlist terms that are available more than _MIN_VOCAB_COUNT times
+        self.available_terms = _IDENTITY_TERMS
+        logs.info(self.termlist)
+        self.use_cache = use_cache
+        # TODO: Let users specify
+        self.open_class_only = True
+        self.min_vocab_count = self.dstats.min_vocab_count
+        self.subgroup_files = {}
+        self.npmi_terms_fid = pjoin(self.dstats.cache_path, "npmi_terms.json")
+    def load_or_prepare_npmi_terms(self):
+        """
+        Figures out what identity terms the user can select, based on whether
+        they occur more than self.min_vocab_count times
+        :return: Identity terms occurring at least self.min_vocab_count times.
+        """
+        # TODO: Add the user's ability to select subgroups.
+        # TODO: Make min_vocab_count here value selectable by the user.
+        if (
+            self.use_cache
+            and exists(self.npmi_terms_fid)
+            and json.load(open(self.npmi_terms_fid))["available terms"] != []
+        ):
+            available_terms = json.load(open(self.npmi_terms_fid))["available terms"]
+        else:
+            true_false = [
+                term in self.dstats.vocab_counts_df.index for term in self.termlist
+            ]
+            word_list_tmp = [x for x, y in zip(self.termlist, true_false) if y]
+            true_false_counts = [
+                self.dstats.vocab_counts_df.loc[word, CNT] >= self.min_vocab_count
+                for word in word_list_tmp
+            ]
+            available_terms = [
+                word for word, y in zip(word_list_tmp, true_false_counts) if y
+            ]
+            logs.info(available_terms)
+            with open(self.npmi_terms_fid, "w+") as f:
+                json.dump({"available terms": available_terms}, f)
+        self.available_terms = available_terms
+        return available_terms
+    def load_or_prepare_joint_npmi(self, subgroup_pair):
+        """
+        Run on-the fly, while the app is already open,
+        as it depends on the subgroup terms that the user chooses
+        :param subgroup_pair:
+        :return:
+        """
+        # Canonical ordering for subgroup_list
+        subgroup_pair = sorted(subgroup_pair)
+        subgroup1 = subgroup_pair[0]
+        subgroup2 = subgroup_pair[1]
+        subgroups_str = "-".join(subgroup_pair)
+        if not isdir(self.pmi_cache_path):
+            logs.warning("Creating cache")
+            # We need to preprocess everything.
+            # This should eventually all go into a prepare_dataset CLI
+            mkdir(self.pmi_cache_path)
+        joint_npmi_fid = pjoin(self.pmi_cache_path, subgroups_str + "_npmi.csv")
+        subgroup_files = define_subgroup_files(subgroup_pair, self.pmi_cache_path)
+        # Defines the filenames for the cache files from the selected subgroups.
+        # Get as much precomputed data as we can.
+        if self.use_cache and exists(joint_npmi_fid):
+            # When everything is already computed for the selected subgroups.
+            logs.info("Loading cached joint npmi")
+            joint_npmi_df = self.load_joint_npmi_df(joint_npmi_fid)
+            npmi_display_cols = [
+                "npmi-bias",
+                subgroup1 + "-npmi",
+                subgroup2 + "-npmi",
+                subgroup1 + "-count",
+                subgroup2 + "-count",
+            ]
+            joint_npmi_df = joint_npmi_df[npmi_display_cols]
+            # When maybe some things have been computed for the selected subgroups.
+        else:
+            if not self.live:
+                logs.info("Preparing new joint npmi")
+                joint_npmi_df, subgroup_dict = self.prepare_joint_npmi_df(
+                    subgroup_pair, subgroup_files
+                )
+                # Cache new results
+                logs.info("Writing out.")
+                for subgroup in subgroup_pair:
+                    write_subgroup_npmi_data(subgroup, subgroup_dict, subgroup_files)
+                with open(joint_npmi_fid, "w+") as f:
+                    joint_npmi_df.to_csv(f)
+            else:
+                joint_npmi_df = pd.DataFrame()
+        logs.info("The joint npmi df is")
+        logs.info(joint_npmi_df)
+        return joint_npmi_df
+    def load_joint_npmi_df(self, joint_npmi_fid):
+        """
+        Reads in a saved dataframe with all of the paired results.
+        :param joint_npmi_fid:
+        :return: paired results
+        """
+        with open(joint_npmi_fid, "rb") as f:
+            joint_npmi_df = pd.read_csv(f)
+        joint_npmi_df = self._set_idx_cols_from_cache(joint_npmi_df)
+        return joint_npmi_df.dropna()
+    def prepare_joint_npmi_df(self, subgroup_pair, subgroup_files):
+        """
+        Computs the npmi bias based on the given subgroups.
+        Handles cases where some of the selected subgroups have cached nPMI
+        computations, but other's don't, computing everything afresh if there
+        are not cached files.
+        :param subgroup_pair:
+        :return: Dataframe with nPMI for the words, nPMI bias between the words.
+        """
+        subgroup_dict = {}
+        # When npmi is computed for some (but not all) of subgroup_list
+        for subgroup in subgroup_pair:
+            logs.info("Load or failing...")
+            # When subgroup npmi has been computed in a prior session.
+            cached_results = self.load_or_fail_cached_npmi_scores(
+                subgroup, subgroup_files[subgroup]
+            )
+            # If the function did not return False and we did find it, use.
+            if cached_results:
+                # FYI: subgroup_cooc_df, subgroup_pmi_df, subgroup_npmi_df = cached_results
+                # Holds the previous sessions' data for use in this session.
+                subgroup_dict[subgroup] = cached_results
+        logs.info("Calculating for subgroup list")
+        joint_npmi_df, subgroup_dict = self.do_npmi(subgroup_pair, subgroup_dict)
+        return joint_npmi_df.dropna(), subgroup_dict
+    # TODO: Update pairwise assumption
+    def do_npmi(self, subgroup_pair, subgroup_dict):
+        """
+        Calculates nPMI for given identity terms and the nPMI bias between.
+        :param subgroup_pair: List of identity terms to calculate the bias for
+        :return: Subset of data for the UI
+        :return: Selected identity term's co-occurrence counts with
+                 other words, pmi per word, and nPMI per word.
+        """
+        logs.info("Initializing npmi class")
+        npmi_obj = self.set_npmi_obj()
+        # Canonical ordering used
+        subgroup_pair = tuple(sorted(subgroup_pair))
+        # Calculating nPMI statistics
+        for subgroup in subgroup_pair:
+            # If the subgroup data is already computed, grab it.
+            # TODO: Should we set idx and column names similarly to how we set them for cached files?
+            if subgroup not in subgroup_dict:
+                logs.info("Calculating statistics for %s" % subgroup)
+                vocab_cooc_df, pmi_df, npmi_df = npmi_obj.calc_metrics(subgroup)
+                # Store the nPMI information for the current subgroups
+                subgroup_dict[subgroup] = (vocab_cooc_df, pmi_df, npmi_df)
+        # Pair the subgroups together, indexed by all words that
+        # co-occur between them.
+        logs.info("Computing pairwise npmi bias")
+        paired_results = npmi_obj.calc_paired_metrics(subgroup_pair, subgroup_dict)
+        UI_results = make_npmi_fig(paired_results, subgroup_pair)
+        return UI_results, subgroup_dict
+    def set_npmi_obj(self):
+        """
+        Initializes the nPMI class with the given words and tokenized sentences.
+        :return:
+        """
+        npmi_obj = nPMI(self.dstats.vocab_counts_df, self.dstats.tokenized_df)
+        return npmi_obj
+    def load_or_fail_cached_npmi_scores(self, subgroup, subgroup_fids):
+        """
+        Reads cached scores from the specified subgroup files
+        :param subgroup: string of the selected identity term
+        :return:
+        """
+        # TODO: Ordering of npmi, pmi, vocab triple should be consistent
+        subgroup_npmi_fid, subgroup_pmi_fid, subgroup_cooc_fid = subgroup_fids
+        if (
+            exists(subgroup_npmi_fid)
+            and exists(subgroup_pmi_fid)
+            and exists(subgroup_cooc_fid)
+        ):
+            logs.info("Reading in pmi data....")
+            with open(subgroup_cooc_fid, "rb") as f:
+                subgroup_cooc_df = pd.read_csv(f)
+            logs.info("pmi")
+            with open(subgroup_pmi_fid, "rb") as f:
+                subgroup_pmi_df = pd.read_csv(f)
+            logs.info("npmi")
+            with open(subgroup_npmi_fid, "rb") as f:
+                subgroup_npmi_df = pd.read_csv(f)
+            subgroup_cooc_df = self._set_idx_cols_from_cache(
+                subgroup_cooc_df, subgroup, "count"
+            )
+            subgroup_pmi_df = self._set_idx_cols_from_cache(
+                subgroup_pmi_df, subgroup, "pmi"
+            )
+            subgroup_npmi_df = self._set_idx_cols_from_cache(
+                subgroup_npmi_df, subgroup, "npmi"
+            )
+            return subgroup_cooc_df, subgroup_pmi_df, subgroup_npmi_df
+        return False
+    def _set_idx_cols_from_cache(self, csv_df, subgroup=None, calc_str=None):
+        """
+        Helps make sure all of the read-in files can be accessed within code
+        via standardized indices and column names.
+        :param csv_df:
+        :param subgroup:
+        :param calc_str:
+        :return:
+        """
+        # The csv saves with this column instead of the index, so that's weird.
+        if "Unnamed: 0" in csv_df.columns:
+            csv_df = csv_df.set_index("Unnamed: 0")
+            csv_df.index.name = WORD
+        elif WORD in csv_df.columns:
+            csv_df = csv_df.set_index(WORD)
+            csv_df.index.name = WORD
+        elif VOCAB in csv_df.columns:
+            csv_df = csv_df.set_index(VOCAB)
+            csv_df.index.name = WORD
+        if subgroup and calc_str:
+            csv_df.columns = [subgroup + "-" + calc_str]
+        elif subgroup:
+            csv_df.columns = [subgroup]
+        elif calc_str:
+            csv_df.columns = [calc_str]
+        return csv_df
+    def get_available_terms(self):
+        return self.load_or_prepare_npmi_terms()
+def dummy(doc):
+    """Identity function: return doc unchanged.
+    Passed to CountVectorizer as both tokenizer and preprocessor in
+    count_vocab_frequencies, whose input is already tokenized.
+    """
+    return doc
+def count_vocab_frequencies(tokenized_df):
+    """
+    Based on an input pandas DataFrame with a 'text' column,
+    this function will count the occurrences of all words.
+    :return: [num_words x num_sentences] DataFrame with the rows corresponding to the
+    different vocabulary words and the column to the presence (0 or 1) of that word.
+    """
+    cvec = CountVectorizer(
+        tokenizer=dummy,
+        preprocessor=dummy,
+    )
+    # We do this to calculate per-word statistics
+    # Fast calculation of single word counts
+    logs.info(
+        "Fitting dummy tokenization to make matrix using the previous tokenization"
+    )
+    cvec.fit(tokenized_df[TOKENIZED_FIELD])
+    document_matrix = cvec.transform(tokenized_df[TOKENIZED_FIELD])
+    batches = np.linspace(0, tokenized_df.shape[0], _NUM_VOCAB_BATCHES).astype(int)
+    i = 0
+    tf = []
+    while i < len(batches) - 1:
+        logs.info("%s of %s vocab batches" % (str(i), str(len(batches))))
+        batch_result = np.sum(
+            document_matrix[batches[i] : batches[i + 1]].toarray(), axis=0
+        )
+        tf.append(batch_result)
+        i += 1
+    word_count_df = pd.DataFrame(
+        [np.sum(tf, axis=0)], columns=cvec.get_feature_names()
+    ).transpose()
+    # Now organize everything into the dataframes
+    word_count_df.columns = [CNT]
+    word_count_df.index.name = WORD
+    return word_count_df
+def calc_p_word(word_count_df):
+    # p(word)
+    word_count_df[PROP] = word_count_df[CNT] / float(sum(word_count_df[CNT]))
+    vocab_counts_df = pd.DataFrame(word_count_df.sort_values(by=CNT, ascending=False))
+    vocab_counts_df[VOCAB] = vocab_counts_df.index
+    return vocab_counts_df
+def filter_vocab(vocab_counts_df):
+    # TODO: Add warnings (which words are missing) to log file?
+    filtered_vocab_counts_df = vocab_counts_df.drop(_CLOSED_CLASS, errors="ignore")
+    filtered_count = filtered_vocab_counts_df[CNT]
+    filtered_count_denom = float(sum(filtered_vocab_counts_df[CNT]))
+    filtered_vocab_counts_df[PROP] = filtered_count / filtered_count_denom
+    return filtered_vocab_counts_df
+## Figures ##
+def write_plotly(fig, fid):
+    """Serialize fig with plotly.io.to_json and store the result via write_json.
+    The JSON text produced by plotly is itself JSON-encoded by write_json;
+    read_plotly undoes both steps.
+    :param fig: Figure to save.
+    :param fid: Path of the output file.
+    """
+    write_json(plotly.io.to_json(fig), fid)
+def read_plotly(fid):
+    fig = plotly.io.from_json(json.load(open(fid, encoding="utf-8")))
+    return fig
+def make_fig_lengths(tokenized_df, length_field):
+    fig_tok_length, axs = plt.subplots(figsize=(15, 6), dpi=150)
+    sns.histplot(data=tokenized_df[length_field], kde=True, bins=100, ax=axs)
+    sns.rugplot(data=tokenized_df[length_field], ax=axs)
+    return fig_tok_length
+def make_fig_labels(label_df, label_names, label_field):
+    labels = label_df[label_field].unique()
+    label_sums = [len(label_df[label_df[label_field] == label]) for label in labels]
+    fig_labels = px.pie(label_df, values=label_sums, names=label_names)
+    return fig_labels
+def make_zipf_fig_ranked_word_list(vocab_df, unique_counts, unique_ranks):
+    ranked_words = {}
+    for count, rank in zip(unique_counts, unique_ranks):
+        vocab_df[vocab_df[CNT] == count]["rank"] = rank
+        ranked_words[rank] = ",".join(
+            vocab_df[vocab_df[CNT] == count].index.astype(str)
+        )  # Use the hovertext kw argument for hover text
+    ranked_words_list = [wrds for rank, wrds in sorted(ranked_words.items())]
+    return ranked_words_list
+def make_npmi_fig(paired_results, subgroup_pair):
+    subgroup1, subgroup2 = subgroup_pair
+    UI_results = pd.DataFrame()
+    if "npmi-bias" in paired_results:
+        UI_results["npmi-bias"] = paired_results["npmi-bias"].astype(float)
+    UI_results[subgroup1 + "-npmi"] = paired_results["npmi"][
+        subgroup1 + "-npmi"
+    ].astype(float)
+    UI_results[subgroup1 + "-count"] = paired_results["count"][
+        subgroup1 + "-count"
+    ].astype(int)
+    if subgroup1 != subgroup2:
+        UI_results[subgroup2 + "-npmi"] = paired_results["npmi"][
+            subgroup2 + "-npmi"
+        ].astype(float)
+        UI_results[subgroup2 + "-count"] = paired_results["count"][
+            subgroup2 + "-count"
+        ].astype(int)
+    return UI_results.sort_values(by="npmi-bias", ascending=True)
+def make_zipf_fig(vocab_counts_df, z):
+    zipf_counts = z.calc_zipf_counts(vocab_counts_df)
+    unique_counts = z.uniq_counts
+    unique_ranks = z.uniq_ranks
+    ranked_words_list = make_zipf_fig_ranked_word_list(
+        vocab_counts_df, unique_counts, unique_ranks
+    )
+    zmin = z.get_xmin()
+    logs.info("zipf counts is")
+    logs.info(zipf_counts)
+    layout = go.Layout(xaxis=dict(range=[0, 100]))
+    fig = go.Figure(
+        data=[
+            go.Bar(
+                x=z.uniq_ranks,
+                y=z.uniq_counts,
+                hovertext=ranked_words_list,
+                name="Word Rank Frequency",
+            )
+        ],
+        layout=layout,
+    )
+    fig.add_trace(
+        go.Scatter(
+            x=z.uniq_ranks[zmin : len(z.uniq_ranks)],
+            y=zipf_counts[zmin : len(z.uniq_ranks)],
+            hovertext=ranked_words_list[zmin : len(z.uniq_ranks)],
+            line=go.scatter.Line(color="crimson", width=3),
+            name="Zipf Predicted Frequency",
+        )
+    )
+    # Customize aspect
+    # fig.update_traces(marker_color='limegreen',
+    #                  marker_line_width=1.5, opacity=0.6)
+    fig.update_layout(title_text="Word Counts, Observed and Predicted by Zipf")
+    fig.update_layout(xaxis_title="Word Rank")
+    fig.update_layout(yaxis_title="Frequency")
+    fig.update_layout(legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.10))
+    return fig
+## Input/Output ###
+def define_subgroup_files(subgroup_list, pmi_cache_path):
+    """
+    Sets the file ids for the input identity terms
+    :param subgroup_list: List of identity terms
+    :return:
+    """
+    subgroup_files = {}
+    for subgroup in subgroup_list:
+        # TODO: Should the pmi, npmi, and count just be one file?
+        subgroup_npmi_fid = pjoin(pmi_cache_path, subgroup + "_npmi.csv")
+        subgroup_pmi_fid = pjoin(pmi_cache_path, subgroup + "_pmi.csv")
+        subgroup_cooc_fid = pjoin(pmi_cache_path, subgroup + "_vocab_cooc.csv")
+        subgroup_files[subgroup] = (
+            subgroup_npmi_fid,
+            subgroup_pmi_fid,
+            subgroup_cooc_fid,
+        )
+    return subgroup_files
+## Input/Output ##
+def intersect_dfs(df_dict):
+    """Inner-join the non-None DataFrames of df_dict on their indices.
+    :param df_dict: Mapping from any key to a DataFrame or None; None values
+        are skipped.
+    :return: A copy of the joined DataFrame.
+    """
+    started = 0
+    new_df = None
+    # Every ordered pair of distinct keys is visited, so a frame can be
+    # joined onto the running result more than once.
+    for key, df in df_dict.items():
+        if df is None:
+            continue
+        for key2, df2 in df_dict.items():
+            if df2 is None:
+                continue
+            if key == key2:
+                continue
+            if started:
+                # Overlapping column names receive the "1"/"2" suffixes.
+                new_df = new_df.join(df2, how="inner", lsuffix="1", rsuffix="2")
+            else:
+                new_df = df.join(df2, how="inner", lsuffix="1", rsuffix="2")
+                started = 1
+    # NOTE(review): new_df is still None when fewer than two non-None frames
+    # were given, so .copy() raises AttributeError -- confirm callers always
+    # pass at least two.
+    return new_df.copy()
+def write_df(df, df_fid):
+    """Write df to the path df_fid with feather.write_feather."""
+    feather.write_feather(df, df_fid)
+def write_json(json_dict, json_fid):
+    """Dump json_dict as JSON to the UTF-8 text file json_fid (overwriting it).
+    :param json_dict: JSON-serializable object; write_plotly passes a string.
+    :param json_fid: Path of the output file.
+    """
+    with open(json_fid, "w", encoding="utf-8") as f:
+        json.dump(json_dict, f)
+def write_subgroup_npmi_data(subgroup, subgroup_dict, subgroup_files):
+    """
+    Saves the calculated nPMI statistics to their output files.
+    Includes the npmi scores for each identity term, the pmi scores, and the
+    co-occurrence counts of the identity term with all the other words
+    :param subgroup: Identity term
+    :return:
+    """
+    subgroup_fids = subgroup_files[subgroup]
+    subgroup_npmi_fid, subgroup_pmi_fid, subgroup_cooc_fid = subgroup_fids
+    subgroup_dfs = subgroup_dict[subgroup]
+    subgroup_cooc_df, subgroup_pmi_df, subgroup_npmi_df = subgroup_dfs
+    with open(subgroup_npmi_fid, "w+") as f:
+        subgroup_npmi_df.to_csv(f)
+    with open(subgroup_pmi_fid, "w+") as f:
+        subgroup_pmi_df.to_csv(f)
+    with open(subgroup_cooc_fid, "w+") as f:
+        subgroup_cooc_df.to_csv(f)
+def write_zipf_data(z, zipf_fid):
+    zipf_dict = {}
+    zipf_dict["xmin"] = int(z.xmin)
+    zipf_dict["xmax"] = int(z.xmax)
+    zipf_dict["alpha"] = float(z.alpha)
+    zipf_dict["ks_distance"] = float(z.distance)
+    zipf_dict["p-value"] = float(z.ks_test.pvalue)
+    zipf_dict["uniq_counts"] = [int(count) for count in z.uniq_counts]
+    zipf_dict["uniq_ranks"] = [int(rank) for rank in z.uniq_ranks]
+    with open(zipf_fid, "w+", encoding="utf-8") as f:
+        json.dump(zipf_dict, f)

data_measurements/dataset_utils.py ADDED Viewed

	@@ -0,0 +1,296 @@

+# Copyright 2021 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import json
+from dataclasses import asdict
+from os.path import exists
+import pandas as pd
+from datasets import Dataset, get_dataset_infos, load_dataset, load_from_disk
+# treating inf values as NaN as well
+# NOTE(review): this is a process-wide pandas option, set as a side effect of
+# importing this module; it looks deprecated in recent pandas -- confirm
+# against the pinned pandas version.
+pd.set_option("use_inf_as_na", True)
+## String names used in Hugging Face dataset configs.
+HF_FEATURE_FIELD = "features"
+HF_LABEL_FIELD = "label"
+HF_DESC_FIELD = "description"
+CACHE_DIR = "cache_dir"
+## String names we are using within this code.
+# These are not coming from the stored dataset nor HF config,
+# but rather used as identifiers in our dicts and dataframes.
+OUR_TEXT_FIELD = "text"
+OUR_LABEL_FIELD = "label"
+TOKENIZED_FIELD = "tokenized_text"
+EMBEDDING_FIELD = "embedding"
+LENGTH_FIELD = "length"
+VOCAB = "vocab"
+WORD = "word"
+CNT = "count"
+PROP = "proportion"
+TEXT_NAN_CNT = "text_nan_count"
+TXT_LEN = "text lengths"
+DEDUP_TOT = "dedup_total"
+TOT_WORDS = "total words"
+TOT_OPEN_WORDS = "total open words"
+# Datasets whose infos are collected when get_dataset_info_dicts is called
+# without a dataset id.
+_DATASET_LIST = [
+    "c4",
+    "squad",
+    "squad_v2",
+    "hate_speech18",
+    "hate_speech_offensive",
+    "glue",
+    "super_glue",
+    "wikitext",
+    "imdb",
+    "HuggingFaceM4/OBELICS",
+]
+# Datasets that load_truncated_dataset reads in streaming mode.
+_STREAMABLE_DATASET_LIST = [
+    "c4",
+    "wikitext",
+    "HuggingFaceM4/OBELICS",
+]
+# Default number of rows kept by load_truncated_dataset.
+_MAX_ROWS = 100
+def load_truncated_dataset(
+    dataset_name,
+    config_name,
+    split_name,
+    num_rows=_MAX_ROWS,
+    cache_name=None,
+    use_cache=True,
+    use_streaming=True,
+):
+    """
+    This function loads the first `num_rows` items of a dataset for a
+    given `config_name` and `split_name`.
+    If `cache_name` exists, the truncated dataset is loaded from `cache_name`.
+    Otherwise, a new truncated dataset is created and immediately saved
+    to `cache_name`.
+    When the dataset is streamable, we iterate through the first
+    `num_rows` examples in streaming mode, write them to a jsonl file,
+    then create a new dataset from the json.
+    This is the most direct way to make a Dataset from an IterableDataset
+    as of datasets version 1.6.1.
+    Otherwise, we download the full dataset and select the first
+    `num_rows` items
+    Args:
+        dataset_name (string):
+            dataset id in the dataset library
+        config_name (string):
+            dataset configuration
+        split_name (string):
+            split name
+        num_rows (int):
+            number of rows to truncate the dataset to
+        cache_name (string):
+            name of the cache directory
+        use_cache (bool):
+            whether to load form the cache if it exists
+        use_streaming (bool):
+            whether to use streaming when the dataset supports it
+    Returns:
+        Dataset: the truncated dataset as a Dataset object
+    """
+    if cache_name is None:
+        cache_name = f"{dataset_name}_{config_name}_{split_name}_{num_rows}"
+    if exists(cache_name):
+        dataset = load_from_disk(cache_name)
+    else:
+        if use_streaming and dataset_name in _STREAMABLE_DATASET_LIST:
+            iterable_dataset = load_dataset(
+                dataset_name,
+                name=config_name,
+                split=split_name,
+                streaming=True,
+            ).take(num_rows)
+            rows = list(iterable_dataset)
+            f = open("temp.jsonl", "w", encoding="utf-8")
+            for row in rows:
+                _ = f.write(json.dumps(row) + "\n")
+            f.close()
+            dataset = Dataset.from_json(
+                "temp.jsonl", features=iterable_dataset.features, split=split_name
+            )
+        else:
+            full_dataset = load_dataset(
+                dataset_name,
+                name=config_name,
+                split=split_name,
+            )
+            dataset = full_dataset.select(range(num_rows))
+        dataset.save_to_disk(cache_name)
+    return dataset
+def intersect_dfs(df_dict):
+    """Inner-join the non-None DataFrames of df_dict on their indices.
+    :param df_dict: Mapping from any key to a DataFrame or None; None values
+        are skipped.
+    :return: A copy of the joined DataFrame.
+    """
+    started = 0
+    new_df = None
+    # Every ordered pair of distinct keys is visited, so a frame can be
+    # joined onto the running result more than once.
+    for key, df in df_dict.items():
+        if df is None:
+            continue
+        for key2, df2 in df_dict.items():
+            if df2 is None:
+                continue
+            if key == key2:
+                continue
+            if started:
+                # Overlapping column names receive the "1"/"2" suffixes.
+                new_df = new_df.join(df2, how="inner", lsuffix="1", rsuffix="2")
+            else:
+                new_df = df.join(df2, how="inner", lsuffix="1", rsuffix="2")
+                started = 1
+    # NOTE(review): new_df is still None when fewer than two non-None frames
+    # were given, so .copy() raises AttributeError -- confirm callers always
+    # pass at least two.
+    return new_df.copy()
+def get_typed_features(features, ftype="string", parents=None):
+    """
+    Recursively get a list of all features of a certain dtype
+    :param features:
+    :param ftype:
+    :param parents:
+    :return: a list of tuples > e.g. ('A', 'B', 'C') for feature example['A']['B']['C']
+    """
+    if parents is None:
+        parents = []
+    typed_features = []
+    for name, feat in features.items():
+        if isinstance(feat, dict):
+            if feat.get("dtype", None) == ftype or feat.get("feature", {}).get(
+                ("dtype", None) == ftype
+            ):
+                typed_features += [tuple(parents + [name])]
+            elif "feature" in feat:
+                if feat["feature"].get("dtype", None) == ftype:
+                    typed_features += [tuple(parents + [name])]
+                elif isinstance(feat["feature"], dict):
+                    typed_features += get_typed_features(
+                        feat["feature"], ftype, parents + [name]
+                    )
+            else:
+                for k, v in feat.items():
+                    if isinstance(v, dict):
+                        typed_features += get_typed_features(
+                            v, ftype, parents + [name, k]
+                        )
+        elif name == "dtype" and feat == ftype:
+            typed_features += [tuple(parents)]
+    return typed_features
+def get_label_features(features, parents=None):
+    """
+    Recursively get a list of all features that are ClassLabels
+    :param features:
+    :param parents:
+    :return: pairs of tuples as above and the list of class names
+    """
+    if parents is None:
+        parents = []
+    label_features = []
+    for name, feat in features.items():
+        if isinstance(feat, dict):
+            if "names" in feat:
+                label_features += [(tuple(parents + [name]), feat["names"])]
+            elif "feature" in feat:
+                if "names" in feat:
+                    label_features += [
+                        (tuple(parents + [name]), feat["feature"]["names"])
+                    ]
+                elif isinstance(feat["feature"], dict):
+                    label_features += get_label_features(
+                        feat["feature"], parents + [name]
+                    )
+            else:
+                for k, v in feat.items():
+                    if isinstance(v, dict):
+                        label_features += get_label_features(v, parents + [name, k])
+        elif name == "names":
+            label_features += [(tuple(parents), feat)]
+    return label_features
+# get the info we need for the app sidebar in dict format
+def dictionarize_info(dset_info):
+    info_dict = asdict(dset_info)
+    res = {
+        "config_name": info_dict["config_name"],
+        "splits": {
+            spl: 100 #spl_info["num_examples"]
+            for spl, spl_info in info_dict["splits"].items()
+        },
+        "features": {
+            "string": get_typed_features(info_dict["features"], "string"),
+            "int32": get_typed_features(info_dict["features"], "int32"),
+            "float32": get_typed_features(info_dict["features"], "float32"),
+            "label": get_label_features(info_dict["features"]),
+        },
+        "description": dset_info.description,
+    }
+    return res
+def get_dataset_info_dicts(dataset_id=None):
+    """
+    Creates a dict from dataset configs.
+    Uses the datasets lib's get_dataset_infos
+    :return: Dictionary mapping dataset names to their configurations
+    """
+    if dataset_id != None:
+        ds_name_to_conf_dict = {
+            dataset_id: {
+                config_name: dictionarize_info(config_info)
+                for config_name, config_info in get_dataset_infos(dataset_id).items()
+            }
+        }
+    else:
+        ds_name_to_conf_dict = {
+            ds_id: {
+                config_name: dictionarize_info(config_info)
+                for config_name, config_info in get_dataset_infos(ds_id).items()
+            }
+            for ds_id in _DATASET_LIST
+        }
+    return ds_name_to_conf_dict
+# get all instances of a specific field in a dataset
+def extract_field(examples, field_path, new_field_name=None):
+    if new_field_name is None:
+        new_field_name = "_".join(field_path)
+    field_list = []
+    # TODO: Breaks the CLI if this isn't checked.
+    if isinstance(field_path, str):
+        field_path = [field_path]
+    item_list = examples[field_path[0]]
+    for field_name in field_path[1:]:
+        item_list = [
+            next_item
+            for item in item_list
+            for next_item in (
+                item[field_name]
+                if isinstance(item[field_name], list)
+                else [item[field_name]]
+            )
+        ]
+    field_list += [
+        field
+        for item in item_list
+        for field in (item if isinstance(item, list) else [item])
+    ]
+    return {new_field_name: field_list}

data_measurements/embeddings.py ADDED Viewed

	@@ -0,0 +1,550 @@

+# Copyright 2021 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+from os.path import exists
+from os.path import join as pjoin
+import plotly.graph_objects as go
+import torch
+import transformers
+from datasets import load_from_disk
+from plotly.io import read_json
+from tqdm import tqdm
+from .dataset_utils import EMBEDDING_FIELD
+def sentence_mean_pooling(model_output, attention_mask):
+    """Mean pooling of token embeddings for a sentence."""
+    token_embeddings = model_output[
+        0
+    ]  # First element of model_output contains all token embeddings
+    input_mask_expanded = (
+        attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+    )
+    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
+        input_mask_expanded.sum(1), min=1e-9
+    )
+class Embeddings:
+    def __init__(
+        self,
+        dstats=None,
+        text_dset=None,
+        text_field_name="text",
+        cache_path="",
+        use_cache=False,
+    ):
+        """Item embeddings and clustering"""
+        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
+        self.model_name = "sentence-transformers/all-mpnet-base-v2"
+        self.tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_name)
+        self.model = transformers.AutoModel.from_pretrained(self.model_name).to(
+            self.device
+        )
+        self.text_dset = text_dset if dstats is None else dstats.text_dset
+        self.text_field_name = (
+            text_field_name if dstats is None else dstats.our_text_field
+        )
+        self.cache_path = cache_path if dstats is None else dstats.cache_path
+        self.embeddings_dset_fid = pjoin(self.cache_path, "embeddings_dset")
+        self.embeddings_dset = None
+        self.node_list_fid = pjoin(self.cache_path, "node_list.th")
+        self.node_list = None
+        self.nid_map = None
+        self.fig_tree_fid = pjoin(self.cache_path, "node_figure.json")
+        self.fig_tree = None
+        self.cached_clusters = {}
+        self.use_cache = use_cache
+    def compute_sentence_embeddings(self, sentences):
+        """
+        Takes a list of sentences and computes their embeddings
+        using self.tokenizer and self.model (with output dimension D)
+        followed by mean pooling of the token representations and normalization
+        Args:
+            sentences ([string]): list of N input sentences
+        Returns:
+            torch.Tensor: sentence embeddings, dimension NxD
+        """
+        batch = self.tokenizer(
+            sentences, padding=True, truncation=True, return_tensors="pt"
+        )
+        batch = {k: v.to(self.device) for k, v in batch.items()}
+        with torch.no_grad():
+            model_output = self.model(**batch)
+            sentence_embeds = sentence_mean_pooling(
+                model_output, batch["attention_mask"]
+            )
+            sentence_embeds /= sentence_embeds.norm(dim=-1, keepdim=True)
+            return sentence_embeds
+    def make_embeddings(self):
+        """
+        Batch computes the embeddings of the Dataset self.text_dset,
+        using the field self.text_field_name as input.
+        Returns:
+            Dataset: HF dataset object with a single EMBEDDING_FIELD field
+                corresponding to the embeddings (list of floats)
+        """
+        def batch_embed_sentences(sentences):
+            return {
+                EMBEDDING_FIELD: [
+                    embed.tolist()
+                    for embed in self.compute_sentence_embeddings(
+                        sentences[self.text_field_name]
+                    )
+                ]
+            }
+        self.embeddings_dset = self.text_dset.map(
+            batch_embed_sentences,
+            batched=True,
+            batch_size=32,
+            remove_columns=[self.text_field_name],
+        )
+        return self.embeddings_dset
+    def make_text_embeddings(self):
+        """Load embeddings dataset from cache or compute it."""
+        if self.use_cache and exists(self.embeddings_dset_fid):
+            self.embeddings_dset = load_from_disk(self.embeddings_dset_fid)
+        else:
+            self.embeddings_dset = self.make_embeddings()
+            self.embeddings_dset.save_to_disk(self.embeddings_dset_fid)
+    def make_hierarchical_clustering(
+        self,
+        batch_size=1000,
+        approx_neighbors=1000,
+        min_cluster_size=10,
+    ):
+        if self.use_cache and exists(self.node_list_fid):
+            self.node_list, self.nid_map = torch.load(self.node_list_fid)
+        else:
+            self.make_text_embeddings()
+            embeddings = torch.Tensor(self.embeddings_dset[EMBEDDING_FIELD])
+            self.node_list = fast_cluster(
+                embeddings, batch_size, approx_neighbors, min_cluster_size
+            )
+            self.nid_map = dict(
+                [(node["nid"], nid) for nid, node in enumerate(self.node_list)]
+            )
+            torch.save((self.node_list, self.nid_map), self.node_list_fid)
+        print(exists(self.fig_tree_fid), self.fig_tree_fid)
+        if self.use_cache and exists(self.fig_tree_fid):
+            self.fig_tree = read_json(self.fig_tree_fid)
+        else:
+            self.fig_tree = make_tree_plot(
+                self.node_list, self.nid_map, self.text_dset, self.text_field_name
+            )
+            self.fig_tree.write_json(self.fig_tree_fid)
+    def find_cluster_beam(self, sentence, beam_size=20):
+        """
+        This function finds the `beam_size` leaf clusters that are closest to the
+        proposed sentence and returns the full path from the root to the cluster
+        along with the dot product between the sentence embedding and the
+        cluster centroid
+        Args:
+            sentence (string): input sentence for which to find clusters
+            beam_size (int): this is a beam size algorithm to explore the tree
+        Returns:
+            [([int], float)]: list of (path_from_root, score) sorted by score
+        """
+        embed = self.compute_sentence_embeddings([sentence])[0].to("cpu")
+        active_paths = [([0], torch.dot(embed, self.node_list[0]["centroid"]).item())]
+        finished_paths = []
+        children_ids_list = [
+            [
+                self.nid_map[nid]
+                for nid in self.node_list[path[-1]]["children_ids"]
+                if nid in self.nid_map
+            ]
+            for path, score in active_paths
+        ]
+        while len(active_paths) > 0:
+            next_ids = sorted(
+                [
+                    (
+                        beam_id,
+                        nid,
+                        torch.dot(embed, self.node_list[nid]["centroid"]).item(),
+                    )
+                    for beam_id, children_ids in enumerate(children_ids_list)
+                    for nid in children_ids
+                ],
+                key=lambda x: x[2],
+                reverse=True,
+            )[:beam_size]
+            paths = [
+                (active_paths[beam_id][0] + [next_id], score)
+                for beam_id, next_id, score in next_ids
+            ]
+            active_paths = []
+            for path, score in paths:
+                if (
+                    len(
+                        [
+                            nid
+                            for nid in self.node_list[path[-1]]["children_ids"]
+                            if nid in self.nid_map
+                        ]
+                    )
+                    > 0
+                ):
+                    active_paths += [(path, score)]
+                else:
+                    finished_paths += [(path, score)]
+            children_ids_list = [
+                [
+                    self.nid_map[nid]
+                    for nid in self.node_list[path[-1]]["children_ids"]
+                    if nid in self.nid_map
+                ]
+                for path, score in active_paths
+            ]
+        return sorted(
+            finished_paths,
+            key=lambda x: x[-1],
+            reverse=True,
+        )[:beam_size]
+def prepare_merges(embeddings, batch_size=1000, approx_neighbors=1000, low_thres=0.5):
+    """
+    Prepares an initial list of merges for hierarchical
+    clustering. First compute the `approx_neighbors` nearest neighbors,
+    then propose a merge for any two points that are closer than `low_thres`
+    Note that if a point has more than `approx_neighbors` neighbors
+    closer than `low_thres`, this approach will miss some of those merges
+    Args:
+        embeddings (toch.Tensor): Tensor of sentence embeddings - dimension NxD
+        batch_size (int): compute nearest neighbors of `batch_size` points at a time
+        approx_neighbors (int): only keep `approx_neighbors` nearest neighbors of a point
+        low_thres (float): only return merges where the dot product is greater than `low_thres`
+    Returns:
+        torch.LongTensor: proposed merges ([i, j] with i>j) - dimension: Mx2
+        torch.Tensor: merge scores - dimension M
+    """
+    top_idx_pre = torch.cat(
+        [torch.LongTensor(range(embeddings.shape[0]))[:, None]] * batch_size, dim=1
+    )
+    top_val_all = torch.Tensor(0, approx_neighbors)
+    top_idx_all = torch.LongTensor(0, approx_neighbors)
+    n_batches = math.ceil(len(embeddings) / batch_size)
+    for b in tqdm(range(n_batches)):
+        # TODO: batch across second dimension
+        cos_scores = torch.mm(
+            embeddings[b * batch_size : (b + 1) * batch_size], embeddings.t()
+        )
+        for i in range(cos_scores.shape[0]):
+            cos_scores[i, (b * batch_size) + i :] = -1
+        top_val_large, top_idx_large = cos_scores.topk(
+            k=approx_neighbors, dim=-1, largest=True
+        )
+        top_val_all = torch.cat([top_val_all, top_val_large], dim=0)
+        top_idx_all = torch.cat([top_idx_all, top_idx_large], dim=0)
+        max_neighbor_dist = top_val_large[:, -1].max().item()
+        if max_neighbor_dist > low_thres:
+            print(
+                f"WARNING: with the current set of neireast neighbor, the farthest is {max_neighbor_dist}"
+            )
+    all_merges = torch.cat(
+        [
+            top_idx_pre[top_val_all > low_thres][:, None],
+            top_idx_all[top_val_all > low_thres][:, None],
+        ],
+        dim=1,
+    )
+    all_merge_scores = top_val_all[top_val_all > low_thres]
+    return (all_merges, all_merge_scores)
+def merge_nodes(nodes, current_thres, previous_thres, all_merges, all_merge_scores):
+    """
+    Merge all nodes if the max dot product between any of their descendants
+    is greater than current_thres.
+    Only proposed merges whose score lies in (current_thres, previous_thres]
+    are processed by a call. `nodes` is modified in place: parent links and
+    children lists are rewritten and new internal nodes are appended.
+    Args:
+        nodes ([dict]): list of dicts representing the current set of nodes
+        current_thres (float): merge all nodes closer than current_thres
+        previous_thres (float): nodes closer than previous_thres are already merged
+        all_merges (torch.LongTensor): proposed merges ([i, j] with i>j) - dimension: Mx2
+        all_merge_scores (torch.Tensor): merge scores - dimension M
+    Returns:
+        [dict]: extended list with the newly created internal nodes
+    """
+    # Boolean mask of the proposed merges that belong to this threshold band.
+    merge_ids = (all_merge_scores <= previous_thres) * (
+        all_merge_scores > current_thres
+    )
+    if merge_ids.sum().item() > 0:
+        merges = all_merges[merge_ids]
+        for a, b in merges.tolist():
+            # Follow parent links up to the current top-level ancestor of each point.
+            node_a = nodes[a]
+            while node_a["parent_id"] != -1:
+                node_a = nodes[node_a["parent_id"]]
+            node_b = nodes[b]
+            while node_b["parent_id"] != -1:
+                node_b = nodes[node_b["parent_id"]]
+            # Both points already sit under the same ancestor: nothing to do.
+            if node_a["nid"] == node_b["nid"]:
+                continue
+            else:
+                # Absorb one ancestor into the other when at least one of them
+                # is an internal node (depth > 0) and the smaller of their two
+                # merge thresholds equals the current threshold.
+                if (node_a["depth"] + node_b["depth"]) > 0 and min(
+                    node_a["merge_threshold"], node_b["merge_threshold"]
+                ) == current_thres:
+                    merge_to = None
+                    merge_from = None
+                    # The ancestor with the smaller nid is absorbed into the other.
+                    if node_a["nid"] < node_b["nid"]:
+                        merge_from = node_a
+                        merge_to = node_b
+                    if node_a["nid"] > node_b["nid"]:
+                        merge_from = node_b
+                        merge_to = node_a
+                    merge_to["depth"] = max(merge_to["depth"], merge_from["depth"])
+                    merge_to["weight"] += merge_from["weight"]
+                    # An internal node hands over its children; a leaf
+                    # (depth 0) becomes a child itself.
+                    merge_to["children_ids"] += (
+                        merge_from["children_ids"]
+                        if merge_from["depth"] > 0
+                        else [merge_from["nid"]]
+                    )
+                    # Re-point the handed-over children, and the absorbed node
+                    # itself, at the surviving node.
+                    for cid in merge_from["children_ids"]:
+                        nodes[cid]["parent_id"] = merge_to["nid"]
+                    merge_from["parent_id"] = merge_to["nid"]
+                # Otherwise create a new internal node with both ancestors as children.
+                else:
+                    # The new nid equals the index the node gets in `nodes`.
+                    new_nid = len(nodes)
+                    new_node = {
+                        "nid": new_nid,
+                        "parent_id": -1,
+                        "depth": max(node_a["depth"], node_b["depth"]) + 1,
+                        "weight": node_a["weight"] + node_b["weight"],
+                        "children": [],
+                        "children_ids": [node_a["nid"], node_b["nid"]],
+                        "example_ids": [],
+                        "merge_threshold": current_thres,
+                    }
+                    node_a["parent_id"] = new_nid
+                    node_b["parent_id"] = new_nid
+                    nodes += [new_node]
+    return nodes
+def finalize_node(node, nodes, min_cluster_size):
+    """Post-process nodes to sort children by descending weight,
+    get full list of leaves in the sub-tree, and direct links
+    to the cildren nodes, then recurses to all children.
+    Nodes with fewer than `min_cluster_size` descendants are collapsed
+    into a single leaf.
+    """
+    node["children"] = sorted(
+        [
+            finalize_node(nodes[cid], nodes, min_cluster_size)
+            for cid in node["children_ids"]
+        ],
+        key=lambda x: x["weight"],
+        reverse=True,
+    )
+    if node["depth"] > 0:
+        node["example_ids"] = [
+            eid for child in node["children"] for eid in child["example_ids"]
+        ]
+    node["children"] = [
+        child for child in node["children"] if child["weight"] >= min_cluster_size
+    ]
+    assert node["weight"] == len(node["example_ids"]), print(node)
+    return node
+def fast_cluster(
+    embeddings,
+    batch_size=1000,
+    approx_neighbors=1000,
+    min_cluster_size=10,
+    low_thres=0.5,
+):
+    """
+    Computes an approximate hierarchical clustering based on example
+    embeddings. The join criterion is min clustering, i.e. two clusters
+    are joined if any pair of their descendants are closer than a threshold
+    The approximate comes from the fact that only the `approx_neighbors` nearest
+    neighbors of an example are considered for merges
+    """
+    batch_size = min(embeddings.shape[0], batch_size)
+    all_merges, all_merge_scores = prepare_merges(
+        embeddings, batch_size, approx_neighbors, low_thres
+    )
+    # prepare leaves
+    nodes = [
+        {
+            "nid": nid,
+            "parent_id": -1,
+            "depth": 0,
+            "weight": 1,
+            "children": [],
+            "children_ids": [],
+            "example_ids": [nid],
+            "merge_threshold": 1.0,
+        }
+        for nid in range(embeddings.shape[0])
+    ]
+    # one level per threshold range
+    for i in range(10):
+        p_thres = 1 - i * 0.05
+        c_thres = 0.95 - i * 0.05
+        nodes = merge_nodes(nodes, c_thres, p_thres, all_merges, all_merge_scores)
+    # make root
+    root_children = [
+        node
+        for node in nodes
+        if node["parent_id"] == -1 and node["weight"] >= min_cluster_size
+    ]
+    root = {
+        "nid": len(nodes),
+        "parent_id": -1,
+        "depth": max([node["depth"] for node in root_children]) + 1,
+        "weight": sum([node["weight"] for node in root_children]),
+        "children": [],
+        "children_ids": [node["nid"] for node in root_children],
+        "example_ids": [],
+        "merge_threshold": -1.0,
+    }
+    nodes += [root]
+    for node in root_children:
+        node["parent_id"] = root["nid"]
+    # finalize tree
+    tree = finalize_node(root, nodes, min_cluster_size)
+    node_list = []
+    def rec_map_nodes(node, node_list):
+        node_list += [node]
+        for child in node["children"]:
+            rec_map_nodes(child, node_list)
+    rec_map_nodes(tree, node_list)
+    # get centroids and distances
+    for node in node_list:
+        node_embeds = embeddings[node["example_ids"]]
+        node["centroid"] = node_embeds.sum(dim=0)
+        node["centroid"] /= node["centroid"].norm()
+        node["centroid_dot_prods"] = torch.mv(node_embeds, node["centroid"])
+        node["sorted_examples_centroid"] = sorted(
+            [
+                (eid, edp.item())
+                for eid, edp in zip(node["example_ids"], node["centroid_dot_prods"])
+            ],
+            key=lambda x: x[1],
+            reverse=True,
+        )
+    return node_list
+def make_tree_plot(node_list, nid_map, text_dset, text_field_name):
+    """
+    Makes a graphical representation of the tree encoded
+    in node-list. The hover label for each node shows the number
+    of descendants and the 5 examples that are closest to the centroid
+    """
+    for nid, node in enumerate(node_list):
+        # get list of
+        node_examples = {}
+        for sid, score in node["sorted_examples_centroid"]:
+            node_examples[text_dset[sid][text_field_name]] = score
+            if len(node_examples) >= 5:
+                break
+        node["label"] = node.get(
+            "label",
+            f"{nid:2d} - {node['weight']:5d} items <br>"
+            + "<br>".join(
+                [
+                    f" {score:.2f} > {txt[:64]}" + ("..." if len(txt) >= 63 else "")
+                    for txt, score in node_examples.items()
+                ]
+            ),
+        )
+    # make plot nodes
+    labels = [node["label"] for node in node_list]
+    root = node_list[0]
+    root["X"] = 0
+    root["Y"] = 0
+    def rec_make_coordinates(node):
+        total_weight = 0
+        add_weight = len(node["example_ids"]) - sum(
+            [child["weight"] for child in node["children"]]
+        )
+        for child in node["children"]:
+            child["X"] = node["X"] + total_weight
+            child["Y"] = node["Y"] - 1
+            total_weight += child["weight"] + add_weight / len(node["children"])
+            rec_make_coordinates(child)
+    rec_make_coordinates(root)
+    E = []  # list of edges
+    Xn = []
+    Yn = []
+    Xe = []
+    Ye = []
+    for nid, node in enumerate(node_list):
+        Xn += [node["X"]]
+        Yn += [node["Y"]]
+        for child in node["children"]:
+            E += [(nid, nid_map[child["nid"]])]
+            Xe += [node["X"], child["X"], None]
+            Ye += [node["Y"], child["Y"], None]
+    # make figure
+    fig = go.Figure()
+    fig.add_trace(
+        go.Scatter(
+            x=Xe,
+            y=Ye,
+            mode="lines",
+            line=dict(color="rgb(210,210,210)", width=1),
+            hoverinfo="none",
+        )
+    )
+    fig.add_trace(
+        go.Scatter(
+            x=Xn,
+            y=Yn,
+            mode="markers",
+            name="nodes",
+            marker=dict(
+                symbol="circle-dot",
+                size=18,
+                color="#6175c1",
+                line=dict(color="rgb(50,50,50)", width=1)
+                # '#DB4551',
+            ),
+            text=labels,
+            hoverinfo="text",
+            opacity=0.8,
+        )
+    )
+    return fig

data_measurements/npmi.py ADDED Viewed

	@@ -0,0 +1,254 @@

+# Copyright 2021 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+import warnings
+from pathlib import Path
+import numpy as np
+import pandas as pd
+from sklearn.preprocessing import MultiLabelBinarizer
+# Might be nice to print to log instead? Happens when we drop closed class.
+# Note that this silences every UserWarning for the whole process.
+warnings.filterwarnings(action="ignore", category=UserWarning)
+# When we divide by 0 in log
+np.seterr(divide="ignore")
+# treating inf values as NaN as well
+# NOTE(review): this pandas option appears to be deprecated in recent pandas
+# releases - confirm against the pinned pandas version.
+pd.set_option("use_inf_as_na", True)
+# Module logger: does not propagate to the root logger, and handlers are
+# only attached once, even if this setup code runs again.
+logs = logging.getLogger(__name__)
+logs.setLevel(logging.INFO)
+logs.propagate = False
+if not logs.handlers:
+    # The log directory is created relative to the current working directory.
+    Path("./log_files").mkdir(exist_ok=True)
+    # Logging info to log file
+    file = logging.FileHandler("./log_files/npmi.log")
+    fileformat = logging.Formatter("%(asctime)s:%(message)s")
+    file.setLevel(logging.INFO)
+    file.setFormatter(fileformat)
+    # Logging warnings and above to the stream
+    stream = logging.StreamHandler()
+    streamformat = logging.Formatter("[data_measurements_tool] %(message)s")
+    stream.setLevel(logging.WARNING)
+    stream.setFormatter(streamformat)
+    logs.addHandler(file)
+    logs.addHandler(stream)
+# Default number of points at which the sentences are cut into batches.
+_NUM_BATCHES = 500
+class nPMI:
+    # TODO: Expand beyond pairwise
+    def __init__(
+        self,
+        vocab_counts_df,
+        tokenized_df,
+        tokenized_col_name="tokenized_text",
+        num_batches=_NUM_BATCHES,
+    ):
+        logs.info("Initiating npmi class.")
+        logs.info("vocab is")
+        logs.info(vocab_counts_df)
+        self.vocab_counts_df = vocab_counts_df
+        logs.info("tokenized is")
+        self.tokenized_df = tokenized_df
+        logs.info(self.tokenized_df)
+        self.tokenized_col_name = tokenized_col_name
+        # self.mlb_list holds num batches x num_sentences
+        self.mlb_list = []
+    def binarize_words_in_sentence(self):
+        logs.info("Creating co-occurrence matrix for PMI calculations.")
+        batches = np.linspace(0, self.tokenized_df.shape[0], _NUM_BATCHES).astype(int)
+        i = 0
+        # Creates list of size (# batches x # sentences)
+        while i < len(batches) - 1:
+            # Makes a sparse matrix (shape: # sentences x # words),
+            # with the occurrence of each word per sentence.
+            mlb = MultiLabelBinarizer(classes=self.vocab_counts_df.index)
+            logs.info(
+                "%s of %s sentence binarize batches." % (str(i), str(len(batches)))
+            )
+            # Returns series: batch size x num_words
+            mlb_series = mlb.fit_transform(
+                self.tokenized_df[self.tokenized_col_name][batches[i] : batches[i + 1]]
+            )
+            i += 1
+            self.mlb_list.append(mlb_series)
+    def calc_cooccurrences(self, subgroup, subgroup_idx):
+        initialize = True
+        coo_df = None
+        # Big computation here!  Should only happen once.
+        logs.info(
+            "Approaching big computation! Here, we binarize all words in the sentences, making a sparse matrix of sentences."
+        )
+        if not self.mlb_list:
+            self.binarize_words_in_sentence()
+        for batch_id in range(len(self.mlb_list)):
+            logs.info(
+                "%s of %s co-occurrence count batches"
+                % (str(batch_id), str(len(self.mlb_list)))
+            )
+            # List of all the sentences (list of vocab) in that batch
+            batch_sentence_row = self.mlb_list[batch_id]
+            # Dataframe of # sentences in batch x vocabulary size
+            sent_batch_df = pd.DataFrame(batch_sentence_row)
+            # logs.info('sent batch df is')
+            # logs.info(sent_batch_df)
+            # Subgroup counts per-sentence for the given batch
+            subgroup_df = sent_batch_df[subgroup_idx]
+            subgroup_df.columns = [subgroup]
+            # Remove the sentences where the count of the subgroup is 0.
+            # This way we have less computation & resources needs.
+            subgroup_df = subgroup_df[subgroup_df > 0]
+            logs.info("Removing 0 counts, subgroup_df is")
+            logs.info(subgroup_df)
+            mlb_subgroup_only = sent_batch_df[sent_batch_df[subgroup_idx] > 0]
+            logs.info("mlb subgroup only is")
+            logs.info(mlb_subgroup_only)
+            # Create cooccurrence matrix for the given subgroup and all words.
+            logs.info("Now we do the T.dot approach for co-occurrences")
+            batch_coo_df = pd.DataFrame(mlb_subgroup_only.T.dot(subgroup_df))
+            # Creates a batch-sized dataframe of co-occurrence counts.
+            # Note these could just be summed rather than be batch size.
+            if initialize:
+                coo_df = batch_coo_df
+            else:
+                coo_df = coo_df.add(batch_coo_df, fill_value=0)
+            logs.info("coo_df is")
+            logs.info(coo_df)
+            initialize = False
+        logs.info("Returning co-occurrence matrix")
+        logs.info(coo_df)
+        return pd.DataFrame(coo_df)
+    def calc_paired_metrics(self, subgroup_pair, subgroup_npmi_dict):
+        """
+        Calculates nPMI metrics between paired subgroups.
+        Special handling for a subgroup paired with itself.
+        :param subgroup_npmi_dict:
+        :return:
+        """
+        paired_results_dict = {"npmi": {}, "pmi": {}, "count": {}}
+        # Canonical ordering. This is done previously, but just in case...
+        subgroup1, subgroup2 = sorted(subgroup_pair)
+        vocab_cooc_df1, pmi_df1, npmi_df1 = subgroup_npmi_dict[subgroup1]
+        logs.info("vocab cooc")
+        logs.info(vocab_cooc_df1)
+        if subgroup1 == subgroup2:
+            shared_npmi_df = npmi_df1
+            shared_pmi_df = pmi_df1
+            shared_vocab_cooc_df = vocab_cooc_df1
+        else:
+            vocab_cooc_df2, pmi_df2, npmi_df2 = subgroup_npmi_dict[subgroup2]
+            logs.info("vocab cooc2")
+            logs.info(vocab_cooc_df2)
+            # Note that lsuffix and rsuffix should not come into play.
+            shared_npmi_df = npmi_df1.join(
+                npmi_df2, how="inner", lsuffix="1", rsuffix="2"
+            )
+            shared_pmi_df = pmi_df1.join(pmi_df2, how="inner", lsuffix="1", rsuffix="2")
+            shared_vocab_cooc_df = vocab_cooc_df1.join(
+                vocab_cooc_df2, how="inner", lsuffix="1", rsuffix="2"
+            )
+            shared_vocab_cooc_df = shared_vocab_cooc_df.dropna()
+            shared_vocab_cooc_df = shared_vocab_cooc_df[
+                shared_vocab_cooc_df.index.notnull()
+            ]
+            logs.info("shared npmi df")
+            logs.info(shared_npmi_df)
+            logs.info("shared vocab df")
+            logs.info(shared_vocab_cooc_df)
+        npmi_bias = (
+            shared_npmi_df[subgroup1 + "-npmi"] - shared_npmi_df[subgroup2 + "-npmi"]
+        )
+        paired_results_dict["npmi-bias"] = npmi_bias.dropna()
+        paired_results_dict["npmi"] = shared_npmi_df.dropna()
+        paired_results_dict["pmi"] = shared_pmi_df.dropna()
+        paired_results_dict["count"] = shared_vocab_cooc_df.dropna()
+        return paired_results_dict
+    def calc_metrics(self, subgroup):
+        # Index of the subgroup word in the sparse vector
+        subgroup_idx = self.vocab_counts_df.index.get_loc(subgroup)
+        logs.info("Calculating co-occurrences...")
+        df_coo = self.calc_cooccurrences(subgroup, subgroup_idx)
+        vocab_cooc_df = self.set_idx_cols(df_coo, subgroup)
+        logs.info(vocab_cooc_df)
+        logs.info("Calculating PMI...")
+        pmi_df = self.calc_PMI(vocab_cooc_df, subgroup)
+        logs.info(pmi_df)
+        logs.info("Calculating nPMI...")
+        npmi_df = self.calc_nPMI(pmi_df, vocab_cooc_df, subgroup)
+        logs.info(npmi_df)
+        return vocab_cooc_df, pmi_df, npmi_df
+    def set_idx_cols(self, df_coo, subgroup):
+        """
+        :param df_coo: Co-occurrence counts for subgroup, length is num_words
+        :return:
+        """
+        count_df = df_coo.set_index(self.vocab_counts_df.index)
+        count_df.columns = [subgroup + "-count"]
+        count_df[subgroup + "-count"] = count_df[subgroup + "-count"].astype(int)
+        return count_df
+    def calc_PMI(self, vocab_cooc_df, subgroup):
+        """
+        # PMI(x;y) = h(y) - h(y|x)
+        #          = h(subgroup) - h(subgroup|word)
+        #          = log (p(subgroup|word) / p(subgroup))
+        # nPMI additionally divides by -log(p(x,y)) = -log(p(x|y)p(y))
+        """
+        # Calculation of p(subgroup)
+        subgroup_prob = self.vocab_counts_df.loc[subgroup]["proportion"]
+        # Calculation of p(subgroup|word) = count(subgroup,word) / count(word)
+        # Because the inidices match (the vocab words),
+        # this division doesn't need to specify the index (I think?!)
+        p_subgroup_g_word = (
+            vocab_cooc_df[subgroup + "-count"] / self.vocab_counts_df["count"]
+        )
+        logs.info("p_subgroup_g_word is")
+        logs.info(p_subgroup_g_word)
+        pmi_df = pd.DataFrame()
+        pmi_df[subgroup + "-pmi"] = np.log(p_subgroup_g_word / subgroup_prob)
+        # Note: A potentially faster solution for adding count, npmi,
+        # can be based on this zip idea:
+        # df_test['size_kb'],  df_test['size_mb'], df_test['size_gb'] =
+        # zip(*df_test['size'].apply(sizes))
+        return pmi_df.dropna()
+    def calc_nPMI(self, pmi_df, vocab_cooc_df, subgroup):
+        """
+        # nPMI additionally divides by -log(p(x,y)) = -log(p(x|y)p(y))
+        #                                           = -log(p(word|subgroup)p(word))
+        """
+        p_word_g_subgroup = vocab_cooc_df[subgroup + "-count"] / sum(
+            vocab_cooc_df[subgroup + "-count"]
+        )
+        p_word = pmi_df.apply(
+            lambda x: self.vocab_counts_df.loc[x.name]["proportion"], axis=1
+        )
+        normalize_pmi = -np.log(p_word_g_subgroup * p_word)
+        npmi_df = pd.DataFrame()
+        npmi_df[subgroup + "-npmi"] = pmi_df[subgroup + "-pmi"] / normalize_pmi
+        return npmi_df.dropna()

data_measurements/streamlit_utils.py ADDED Viewed

	@@ -0,0 +1,498 @@

+# Copyright 2021 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import statistics
+import json
+import pandas as pd
+import seaborn as sns
+import streamlit as st
+#from st_aggrid import AgGrid, GridOptionsBuilder
+from .dataset_utils import HF_DESC_FIELD, HF_FEATURE_FIELD, HF_LABEL_FIELD
+st.set_option('deprecation.showPyplotGlobalUse', False)
+json_file_path = "cache_dir/has_cache.json"
+with open(json_file_path, "r", encoding="utf-8") as j:
+    _HAS_CACHE = json.loads(j.read())
+def sidebar_header():
+    st.sidebar.markdown(
+        """
+    This demo showcases the [dataset measures as we develop them](https://huggingface.co/blog/data-measurements-tool).
+    Right now this has a few pre-loaded datasets for which you can:
+    - view some general statistics about the text vocabulary, lengths, labels
+    - explore some distributional statistics to assess properties of the language
+    - view some comparison statistics and overview of the text distribution
+    The tool is in development, and will keep growing in utility and functionality 🤗🚧
+    """,
+        unsafe_allow_html=True,
+    )
+def sidebar_selection(ds_name_to_dict, column_id):
+    # ds_names = list(ds_name_to_dict.keys())
+    ds_names = list(_HAS_CACHE.keys())
+    with st.sidebar.expander(f"Choose dataset and field {column_id}", expanded=True):
+        # choose a dataset to analyze
+        ds_name = st.selectbox(
+            f"Choose dataset to explore{column_id}:",
+            ds_names,
+            index=ds_names.index("hate_speech18"),
+        )
+        # choose a config to analyze
+        ds_configs = ds_name_to_dict[ds_name]
+        if ds_name == "c4":
+            config_names = ['en','en.noblocklist','realnewslike']
+        else:
+            config_names = list(ds_configs.keys())
+        config_names = list(_HAS_CACHE[ds_name].keys())
+        config_name = st.selectbox(
+            f"Choose configuration{column_id}:",
+            config_names,
+            index=0,
+        )
+        # choose a subset of num_examples
+        # TODO: Handling for multiple text features
+        #ds_config = ds_configs[config_name]
+        # text_features = ds_config[HF_FEATURE_FIELD]["string"]
+        text_features = [tuple(text_field.split('-')) for text_field in _HAS_CACHE[ds_name][config_name]]
+        # TODO @yacine: Explain what this is doing and why eg tp[0] could = "id"
+        text_field = st.selectbox(
+            f"Which text feature from the{column_id} dataset would you like to analyze?",
+            [("text",)]
+            if ds_name == "c4"
+            else [tp for tp in text_features if tp[0] != "id"],
+        )
+        # Choose a split and dataset size
+        # avail_splits = list(ds_config["splits"].keys())
+        avail_splits = list(_HAS_CACHE[ds_name][config_name]['-'.join(text_field)].keys())
+        # 12.Nov note: Removing "test" because those should not be examined
+        # without discussion of pros and cons, which we haven't done yet.
+        if "test" in avail_splits:
+            avail_splits.remove("test")
+        split = st.selectbox(
+            f"Which split from the{column_id} dataset would you like to analyze?",
+            avail_splits,
+            index=0,
+        )
+        label_field, label_names = (
+            ds_name_to_dict[ds_name][config_name][HF_FEATURE_FIELD][HF_LABEL_FIELD][0]
+            if len(
+                ds_name_to_dict[ds_name][config_name][HF_FEATURE_FIELD][HF_LABEL_FIELD]
+            )
+            > 0
+            else ((), [])
+        )
+        return {
+            "dset_name": ds_name,
+            "dset_config": config_name,
+            "split_name": split,
+            "text_field": text_field,
+            "label_field": label_field,
+            "label_names": label_names,
+        }
+def expander_header(dstats, ds_name_to_dict, column_id):
+    with st.expander(f"Dataset Description{column_id}"):
+        st.markdown(
+            ds_name_to_dict[dstats.dset_name][dstats.dset_config][HF_DESC_FIELD]
+        )
+        st.dataframe(dstats.dset_peek)
+def expander_general_stats(dstats, column_id):
+    with st.expander(f"General Text Statistics{column_id}"):
+        st.caption(
+            "Use this widget to check whether the terms you see most represented"
+            " in the dataset make sense for the goals of the dataset."
+        )
+        if dstats.total_words == 0:
+            st.markdown("Eh oh...not finding the file I need. 😭  Probably it will be there soon. 🤞 Check back later!")
+        else:
+            st.markdown("There are {0} total words".format(str(dstats.total_words)))
+            st.markdown(
+                "There are {0} words after removing closed "
+                "class words".format(str(dstats.total_open_words))
+            )
+            st.markdown(
+                "The most common "
+                "[open class words](https://dictionary.apa.org/open-class-words) "
+                "and their counts are: "
+            )
+            st.dataframe(dstats.sorted_top_vocab_df)
+            st.markdown(
+                "There are {0} missing values in the dataset.".format(
+                    str(dstats.text_nan_count)
+                )
+            )
+            if dstats.dedup_total > 0:
+                st.markdown(
+                    "There are {0} duplicate items in the dataset. "
+                    "For more information about the duplicates, "
+                    "click the 'Duplicates' tab below.".format(str(dstats.dedup_total))
+                )
+            else:
+                st.markdown("There are 0 duplicate items in the dataset. ")
+### Show the label distribution from the datasets
+def expander_label_distribution(fig_labels, column_id):
+    with st.expander(f"Label Distribution{column_id}", expanded=False):
+        st.caption(
+            "Use this widget to see how balanced the labels in your dataset are."
+        )
+        if fig_labels is not None:
+            st.plotly_chart(fig_labels, use_container_width=True)
+        else:
+            st.markdown("No labels were found in the dataset")
+def expander_text_lengths(dstats, column_id):
+    _TEXT_LENGTH_CAPTION = (
+        "Use this widget to identify outliers, particularly suspiciously long outliers."
+    )
+    with st.expander(f"Text Lengths{column_id}", expanded=False):
+        st.caption(_TEXT_LENGTH_CAPTION)
+        st.markdown(
+            "Below, you can see how the lengths of the text instances in your dataset are distributed."
+        )
+        st.markdown(
+            "Any unexpected peaks or valleys in the distribution may help to identify instances you want to remove or augment."
+        )
+        st.markdown(
+            "### Here is the relative frequency of different text lengths in your dataset:"
+        )
+        try:
+            st.image(dstats.fig_tok_length_png)
+        except:
+            st.pyplot(dstats.fig_tok_length, use_container_width=True)
+        st.markdown(
+            "The average length of text instances is **"
+            + str(dstats.avg_length)
+            + " words**, with a standard deviation of **"
+            + str(dstats.std_length)
+            + "**."
+        )
+        # This is quite a large file and is breaking our ability to navigate the app development.
+        # Just passing if it's not already there for launch v0
+        if dstats.length_df is not None:
+            start_id_show_lengths = st.selectbox(
+                "Show examples of length:",
+                sorted(dstats.length_df["length"].unique().tolist()),
+                key=f"select_show_length_{column_id}",
+            )
+            st.table(
+                dstats.length_df[
+                    dstats.length_df["length"] == start_id_show_lengths
+                ].set_index("length")
+            )
+### Third, use a sentence embedding model
+def expander_text_embeddings(
+    text_dset, fig_tree, node_list, embeddings, text_field, column_id
+):
+    with st.expander(f"Text Embedding Clusters{column_id}", expanded=False):
+        _EMBEDDINGS_CAPTION = """
+        ### Hierarchical Clustering of Text Fields
+        Taking in the diversity of text represented in a dataset can be
+        challenging when it is made up of hundreds to thousands of sentences.
+        Grouping these text items based on a measure of similarity can help
+        users gain some insights into their distribution.
+        The following figure shows a hierarchical clustering of the text fields
+        in the dataset based on a
+        [Sentence-Transformer](https://hf.co/sentence-transformers/all-mpnet-base-v2)
+        model. Clusters are merged if any of the embeddings in cluster A has a
+        dot product with any of the embeddings or with the centroid of cluster B
+        higher than a threshold (one threshold per level, from 0.5 to 0.95).
+        To explore the clusters, you can:
+        - hover over a node to see the 5 most representative examples (deduplicated)
+        - enter an example in the text box below to see which clusters it is most similar to
+        - select a cluster by ID to show all of its examples
+        """
+        st.markdown(_EMBEDDINGS_CAPTION)
+        st.plotly_chart(fig_tree, use_container_width=True)
+        st.markdown("---\n")
+        if st.checkbox(
+            label="Enter text to see nearest clusters",
+            key=f"search_clusters_{column_id}",
+        ):
+            compare_example = st.text_area(
+                label="Enter some text here to see which of the clusters in the dataset it is closest to",
+                key=f"search_cluster_input_{column_id}",
+            )
+            if compare_example != "":
+                paths_to_leaves = embeddings.cached_clusters.get(
+                    compare_example,
+                    embeddings.find_cluster_beam(compare_example, beam_size=50),
+                )
+                clusters_intro = ""
+                if paths_to_leaves[0][1] < 0.3:
+                    clusters_intro += (
+                        "**Warning: no close clusters found (best score <0.3). **"
+                    )
+                clusters_intro += "The closest clusters to the text entered aboce are:"
+                st.markdown(clusters_intro)
+                for path, score in paths_to_leaves[:5]:
+                    example = text_dset[
+                        node_list[path[-1]]["sorted_examples_centroid"][0][0]
+                    ][text_field][:256]
+                    st.write(
+                        f"Cluster {path[-1]:5d} | Score: {score:.3f}  \n Example: {example}"
+                    )
+                show_node_default = paths_to_leaves[0][0][-1]
+            else:
+                show_node_default = len(node_list) // 2
+        else:
+            show_node_default = len(node_list) // 2
+        st.markdown("---\n")
+        if text_dset is None:
+            st.markdown("Missing source text to show, check back later!")
+        else:
+            show_node = st.selectbox(
+                f"Choose a leaf node to explore in the{column_id} dataset:",
+                range(len(node_list)),
+                index=show_node_default,
+            )
+            node = node_list[show_node]
+            start_id = st.slider(
+                f"Show closest sentences in cluster to the centroid{column_id} starting at index:",
+                0,
+                len(node["sorted_examples_centroid"]) - 5,
+                value=0,
+                step=5,
+            )
+            for sid, sim in node["sorted_examples_centroid"][start_id : start_id + 5]:
+                # only show the first 4 lines and the first 10000 characters
+                show_text = text_dset[sid][text_field][:10000]
+                show_text = "\n".join(show_text.split("\n")[:4])
+                st.text(f"{sim:.3f} \t {show_text}")
+### Then, show duplicates
+def expander_text_duplicates(dstats, column_id):
+    # TODO: Saving/loading figure
+    with st.expander(f"Text Duplicates{column_id}", expanded=False):
+        st.caption(
+            "Use this widget to identify text strings that appear more than once."
+        )
+        st.markdown(
+            "A model's training and testing may be negatively affected by unwarranted duplicates ([Lee et al., 2021](https://arxiv.org/abs/2107.06499))."
+        )
+        st.markdown("------")
+        st.write(
+            "### Here is the list of all the duplicated items and their counts in your dataset:"
+        )
+        if dstats.dup_counts_df is None or dstats.dup_counts_df.empty:
+            st.write("There are no duplicates in this dataset! 🥳")
+        else:
+            st.dataframe(dstats.dup_counts_df.reset_index(drop=True))
+def expander_npmi_description(min_vocab):
+    _NPMI_CAPTION = (
+        "Use this widget to identify problematic biases and stereotypes in your data."
+    )
+    _NPMI_CAPTION1 = """
+    nPMI scores for a word help to identify potentially
+    problematic associations, ranked by how close the association is."""
+    _NPMI_CAPTION2 = """
+    nPMI bias scores for paired words help to identify how word
+    associations are skewed between the selected selected words
+    ([Aka et al., 2021](https://arxiv.org/abs/2103.03417)).
+    """
+    st.caption(_NPMI_CAPTION)
+    st.markdown(_NPMI_CAPTION1)
+    st.markdown(_NPMI_CAPTION2)
+    st.markdown("  ")
+    st.markdown(
+        "You can select from gender and sexual orientation "
+        "identity terms that appear in the dataset at least %s "
+        "times." % min_vocab
+    )
+    st.markdown(
+        "The resulting ranked words are those that co-occur with both "
+        "identity terms.  "
+    )
+    st.markdown(
+        "The more *positive* the score, the more associated the word is with the first identity term.  "
+        "The more *negative* the score, the more associated the word is with the second identity term."
+    )
+### Finally, show Zipf stuff
+def expander_zipf(z, zipf_fig, column_id):
+    with st.expander(
+        f"Vocabulary Distribution{column_id}: Zipf's Law Fit", expanded=False
+    ):
+        try:
+            _ZIPF_CAPTION = """This shows how close the observed language is to an ideal
+            natural language distribution following [Zipf's law](https://en.wikipedia.org/wiki/Zipf%27s_law),
+            calculated by minimizing the [Kolmogorov-Smirnov (KS) statistic](https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test)."""
+            powerlaw_eq = r"""p(x) \propto x^{- \alpha}"""
+            zipf_summary = (
+                    "The optimal alpha based on this dataset is: **"
+                    + str(round(z.alpha, 2))
+                    + "**, with a KS distance of: **"
+                    + str(round(z.distance, 2))
+            )
+            zipf_summary += (
+                    "**.  This was fit with a minimum rank value of: **"
+                    + str(int(z.xmin))
+                    + "**, which is the optimal rank *beyond which* the scaling regime of the power law fits best."
+            )
+            alpha_warning = "Your alpha value is a bit on the high side, which means that the distribution over words in this dataset is a bit unnatural. This could be due to non-language items throughout the dataset."
+            xmin_warning = "The minimum rank for this fit is a bit on the high side, which means that the frequencies of your most common words aren't distributed as would be expected by Zipf's law."
+            fit_results_table = pd.DataFrame.from_dict(
+                {
+                    r"Alpha:": [str("%.2f" % z.alpha)],
+                    "KS distance:": [str("%.2f" % z.distance)],
+                    "Min rank:": [str("%s" % int(z.xmin))],
+                },
+                columns=["Results"],
+                orient="index",
+            )
+            fit_results_table.index.name = column_id
+            st.caption(
+                "Use this widget for the counts of different words in your dataset, measuring the difference between the observed count and the expected count under Zipf's law."
+            )
+            st.markdown(_ZIPF_CAPTION)
+            st.write(
+                """
+            A Zipfian distribution follows the power law: $p(x) \propto x^{-α}$
+    with an ideal α value of 1."""
+            )
+            st.markdown(
+                "In general, an alpha greater than 2 or a minimum rank greater than 10 (take with a grain of salt) means that your distribution is relativaly _unnatural_ for natural language. This can be a sign of mixed artefacts in the dataset, such as HTML markup."
+            )
+            st.markdown(
+                "Below, you can see the counts of each word in your dataset vs. the expected number of counts following a Zipfian distribution."
+            )
+            st.markdown("-----")
+            st.write("### Here is your dataset's Zipf results:")
+            st.dataframe(fit_results_table)
+            st.write(zipf_summary)
+            # TODO: Nice UI version of the content in the comments.
+            # st.markdown("\nThe KS test p-value is < %.2f" % z.ks_test.pvalue)
+            # if z.ks_test.pvalue < 0.01:
+            #    st.markdown(
+            #        "\n Great news! Your data fits a powerlaw with a minimum KS " "distance of %.4f" % z.distance)
+            # else:
+            #    st.markdown("\n Sadly, your data does not fit a powerlaw. =(")
+            # st.markdown("Checking the goodness of fit of our observed distribution")
+            # st.markdown("to the hypothesized power law distribution")
+            # st.markdown("using a Kolmogorov–Smirnov (KS) test.")
+            st.plotly_chart(zipf_fig, use_container_width=True)
+            if z.alpha > 2:
+                st.markdown(alpha_warning)
+            if z.xmin > 5:
+                st.markdown(xmin_warning)
+        except:
+            st.write("Under construction! 😱 🚧")
+### Finally finally finally, show nPMI stuff.
+def npmi_widget(npmi_stats, min_vocab, column_id):
+    """
+    Part of the main app, but uses a user interaction so pulled out as its own f'n.
+    :param use_cache:
+    :param column_id:
+    :param npmi_stats:
+    :param min_vocab:
+    :return:
+    """
+    with st.expander(f"Word Association{column_id}: nPMI", expanded=False):
+        try:
+            if len(npmi_stats.available_terms) > 0:
+                expander_npmi_description(min_vocab)
+                st.markdown("-----")
+                term1 = st.selectbox(
+                    f"What is the first term you want to select?{column_id}",
+                    npmi_stats.available_terms,
+                )
+                term2 = st.selectbox(
+                    f"What is the second term you want to select?{column_id}",
+                    reversed(npmi_stats.available_terms),
+                )
+                # We calculate/grab nPMI data based on a canonical (alphabetic)
+                # subgroup ordering.
+                subgroup_pair = sorted([term1, term2])
+                try:
+                    joint_npmi_df = npmi_stats.load_or_prepare_joint_npmi(subgroup_pair)
+                    npmi_show(joint_npmi_df)
+                except KeyError:
+                    st.markdown(
+                        "**WARNING!** The nPMI for these terms has not been pre-computed, please re-run caching."
+                    )
+            else:
+                st.markdown(
+                    "No words found co-occurring with both of the selected identity terms."
+                )
+        except:
+            st.write("Under construction! 😱 🚧")
+def npmi_show(paired_results):
+    if paired_results.empty:
+        st.markdown("No words that co-occur enough times for results!  Or there's a 🐛.  Or we're still computing this one. 🤷")
+    else:
+        s = pd.DataFrame(paired_results.sort_values(by="npmi-bias", ascending=True))
+        # s.columns=pd.MultiIndex.from_arrays([['npmi','npmi','npmi','count', 'count'],['bias','man','straight','man','straight']])
+        s.index.name = "word"
+        npmi_cols = s.filter(like="npmi").columns
+        count_cols = s.filter(like="count").columns
+        if s.shape[0] > 10000:
+            bias_thres = max(abs(s["npmi-bias"][5000]), abs(s["npmi-bias"][-5000]))
+            print(f"filtering with bias threshold: {bias_thres}")
+            s_filtered = s[s["npmi-bias"].abs() > bias_thres]
+        else:
+            s_filtered = s
+        # TODO: This is very different look than the duplicates table above. Should probably standardize.
+        cm = sns.palplot(sns.diverging_palette(270, 36, s=99, l=48, n=16))
+        out_df = (
+            s_filtered.style.background_gradient(subset=npmi_cols, cmap=cm)
+            .format(subset=npmi_cols, formatter="{:,.3f}")
+            .format(subset=count_cols, formatter=int)
+            .set_properties(
+                subset=count_cols, **{"width": "10em", "text-align": "center"}
+            )
+            .set_properties(**{"align": "center"})
+            .set_caption(
+                "nPMI scores and co-occurence counts between the selected identity terms and the words they both co-occur with"
+            )
+        )  # s = pd.read_excel("output.xlsx", index_col="word")
+        st.write("### Here is your dataset's nPMI results:")
+        st.dataframe(out_df)
+### Dumping unused functions here for now
+### Second, show the distribution of text perplexities
+def expander_text_perplexities(text_label_df, sorted_sents_loss, fig_loss):
+    with st.expander("Show text perplexities A", expanded=False):
+        st.markdown("### Text perplexities A")
+        st.plotly_chart(fig_loss, use_container_width=True)
+        start_id_show_loss = st.slider(
+            "Show highest perplexity sentences in A starting at index:",
+            0,
+            text_label_df.shape[0] - 5,
+            value=0,
+            step=5,
+        )
+        for lss, sent in sorted_sents_loss[start_id_show_loss : start_id_show_loss + 5]:
+            st.text(f"{lss:.3f} {sent}")

data_measurements/zipf.py ADDED Viewed

	@@ -0,0 +1,247 @@

+# Copyright 2021 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+from pathlib import Path
+import numpy as np
+import pandas as pd
+import powerlaw
+import streamlit as st
+from scipy.stats import ks_2samp
+from scipy.stats import zipf as zipf_lib
+from .dataset_utils import CNT, PROP
+# treating inf values as NaN as well
+# NOTE(review): this changes a pandas option for the whole process as an
+# import side effect; "use_inf_as_na" is deprecated in recent pandas
+# releases -- confirm against the pinned pandas version.
+pd.set_option("use_inf_as_na", True)
+# Module-level logger. propagate=False keeps its records away from the
+# handlers of ancestor loggers, so only the two handlers below receive them.
+logs = logging.getLogger(__name__)
+logs.setLevel(logging.INFO)
+logs.propagate = False
+# Attach handlers only once, in case this setup runs again for the same logger.
+if not logs.handlers:
+    # The log directory is relative to the current working directory.
+    Path("./log_files").mkdir(exist_ok=True)
+    # Logging info to log file
+    file = logging.FileHandler("./log_files/zipf.log")
+    fileformat = logging.Formatter("%(asctime)s:%(message)s")
+    file.setLevel(logging.INFO)
+    file.setFormatter(fileformat)
+    # Logging warnings and above to the stream
+    stream = logging.StreamHandler()
+    streamformat = logging.Formatter("[data_measurements_tool] %(message)s")
+    stream.setLevel(logging.WARNING)
+    stream.setFormatter(streamformat)
+    logs.addHandler(file)
+    logs.addHandler(stream)
+class Zipf:
+    def __init__(self, vocab_counts_df=pd.DataFrame()):
+        self.vocab_counts_df = vocab_counts_df
+        self.alpha = None
+        self.xmin = None
+        self.xmax = None
+        self.fit = None
+        self.ranked_words = {}
+        self.uniq_counts = []
+        self.uniq_ranks = []
+        self.uniq_fit_counts = None
+        self.term_df = None
+        self.pvalue = None
+        self.ks_test = None
+        self.distance = None
+        self.fit = None
+        self.predicted_zipf_counts = None
+        if not self.vocab_counts_df.empty:
+            logs.info("Fitting based on input vocab counts.")
+            self.calc_fit(vocab_counts_df)
+            logs.info("Getting predicted counts.")
+            self.predicted_zipf_counts = self.calc_zipf_counts(vocab_counts_df)
+    def load(self, zipf_dict):
+        self.set_xmin(zipf_dict["xmin"])
+        self.set_xmax(zipf_dict["xmax"])
+        self.set_alpha(zipf_dict["alpha"])
+        self.set_ks_distance(zipf_dict["ks_distance"])
+        self.set_p(zipf_dict["p-value"])
+        self.set_unique_ranks(zipf_dict["uniq_ranks"])
+        self.set_unique_counts(zipf_dict["uniq_counts"])
+    def calc_fit(self, vocab_counts_df):
+        """
+        Uses the powerlaw package to fit the observed frequencies to a zipfian distribution.
+        We use the KS-distance to fit, as that seems more appropriate that MLE.
+        :param vocab_counts_df:
+        :return:
+        """
+        self.vocab_counts_df = vocab_counts_df
+        # TODO: These proportions may have already been calculated.
+        vocab_counts_df[PROP] = vocab_counts_df[CNT] / float(sum(vocab_counts_df[CNT]))
+        rank_column = vocab_counts_df[CNT].rank(
+            method="dense", numeric_only=True, ascending=False
+        )
+        vocab_counts_df["rank"] = rank_column.astype("int64")
+        observed_counts = vocab_counts_df[CNT].values
+        # Note another method for determining alpha might be defined by
+        # (Newman, 2005): alpha = 1 + n * sum(ln( xi / xmin )) ^ -1
+        self.fit = powerlaw.Fit(observed_counts, fit_method="KS", discrete=True)
+        # This should probably be a pmf (not pdf); using discrete=True above.
+        # original_data=False uses only the fitted data (within xmin and xmax).
+        # pdf_bin_edges: The portion of the data within the bin.
+        # observed_pdf: The probability density function (normalized histogram)
+        # of the data.
+        pdf_bin_edges, observed_pdf = self.fit.pdf(original_data=False)
+        # See the 'Distribution' class described here for info:
+        # https://pythonhosted.org/powerlaw/#powerlaw.Fit.pdf
+        theoretical_distro = self.fit.power_law
+        # The probability density function (normalized histogram) of the
+        # theoretical distribution.
+        predicted_pdf = theoretical_distro.pdf()
+        # !!!! CRITICAL VALUE FOR ZIPF !!!!
+        self.alpha = theoretical_distro.alpha
+        # Exclusive xmin: The optimal xmin *beyond which* the scaling regime of
+        # the power law fits best.
+        self.xmin = theoretical_distro.xmin
+        self.xmax = theoretical_distro.xmax
+        self.distance = theoretical_distro.KS()
+        self.ks_test = ks_2samp(observed_pdf, predicted_pdf)
+        self.pvalue = self.ks_test[1]
+        logs.info("KS test:")
+        logs.info(self.ks_test)
+    def set_xmax(self, xmax):
+        """
+        xmax is usually None, so we add some handling to set it as the
+        maximum rank in the dataset.
+        :param xmax:
+        :return:
+        """
+        if xmax:
+            self.xmax = int(xmax)
+        elif self.uniq_counts:
+            self.xmax = int(len(self.uniq_counts))
+        elif self.uniq_ranks:
+            self.xmax = int(len(self.uniq_ranks))
+    def get_xmax(self):
+        """
+        :return:
+        """
+        if not self.xmax:
+            self.set_xmax(self.xmax)
+        return self.xmax
+    def set_p(self, p):
+        self.p = int(p)
+    def get_p(self):
+        return int(self.p)
+    def set_xmin(self, xmin):
+        self.xmin = xmin
+    def get_xmin(self):
+        if self.xmin:
+            return int(self.xmin)
+        return self.xmin
+    def set_alpha(self, alpha):
+        self.alpha = float(alpha)
+    def get_alpha(self):
+        return float(self.alpha)
+    def set_ks_distance(self, distance):
+        self.distance = float(distance)
+    def get_ks_distance(self):
+        return self.distance
+    def calc_zipf_counts(self, vocab_counts_df):
+        """
+        The fit is based on an optimal xmin (minimum rank)
+        Let's use this to make count estimates for the zipf fit,
+        by multiplying the fitted pmf value by the sum of counts above xmin.
+        :return: array of count values following the fitted pmf.
+        """
+        # TODO: Limit from above xmin to below xmax, not just above xmin.
+        counts = vocab_counts_df[CNT]
+        self.uniq_counts = list(pd.unique(counts))
+        self.uniq_ranks = list(np.arange(1, len(self.uniq_counts) + 1))
+        logs.info(self.uniq_counts)
+        logs.info(self.xmin)
+        logs.info(self.xmax)
+        # Makes sure they are ints if not None
+        xmin = self.get_xmin()
+        xmax = self.get_xmax()
+        self.uniq_fit_counts = self.uniq_counts[xmin + 1 : xmax]
+        pmf_mass = float(sum(self.uniq_fit_counts))
+        zipf_counts = np.array(
+            [self.estimate_count(rank, pmf_mass) for rank in self.uniq_ranks]
+        )
+        return zipf_counts
+    def estimate_count(self, rank, pmf_mass):
+        return int(round(zipf_lib.pmf(rank, self.alpha) * pmf_mass))
+    def set_unique_ranks(self, ranks):
+        self.uniq_ranks = ranks
+    def get_unique_ranks(self):
+        return self.uniq_ranks
+    def get_unique_fit_counts(self):
+        return self.uniq_fit_counts
+    def set_unique_counts(self, counts):
+        self.uniq_counts = counts
+    def get_unique_counts(self):
+        return self.uniq_counts
+    def set_axes(self, unique_counts, unique_ranks):
+        self.uniq_counts = unique_counts
+        self.uniq_ranks = unique_ranks
+    # TODO: Incorporate this function (not currently using)
+    def fit_others(self, fit):
+        st.markdown(
+            "_Checking log likelihood ratio to see if the data is better explained by other well-behaved distributions..._"
+        )
+        # The first value returned from distribution_compare is the log likelihood ratio
+        better_distro = False
+        trunc = fit.distribution_compare("power_law", "truncated_power_law")
+        if trunc[0] < 0:
+            st.markdown("Seems a truncated power law is a better fit.")
+            better_distro = True
+        lognormal = fit.distribution_compare("power_law", "lognormal")
+        if lognormal[0] < 0:
+            st.markdown("Seems a lognormal distribution is a better fit.")
+            st.markdown("But don't panic -- that happens sometimes with language.")
+            better_distro = True
+        exponential = fit.distribution_compare("power_law", "exponential")
+        if exponential[0] < 0:
+            st.markdown("Seems an exponential distribution is a better fit. Panic.")
+            better_distro = True
+        if not better_distro:
+            st.markdown("\nSeems your data is best fit by a power law. Celebrate!!")

log_files/app.log ADDED Viewed

	@@ -0,0 +1,59 @@

+2023-08-23 17:29:50,194:Using Single Dataset Mode
+2023-08-23 17:29:50,202:Using cache
+2023-08-23 17:34:04,702:Using Single Dataset Mode
+2023-08-23 17:43:38,030:Using Single Dataset Mode
+2023-08-23 17:43:38,035:Using cache
+2023-08-23 17:45:36,703:Using Single Dataset Mode
+2023-08-23 17:48:20,572:Using Single Dataset Mode
+2023-08-23 17:52:30,321:Using Single Dataset Mode
+2023-08-23 17:54:35,084:Using Single Dataset Mode
+2023-08-23 17:56:12,155:Using Comparison Mode
+2023-08-24 07:51:23,364:Using Single Dataset Mode
+2023-08-24 07:57:23,750:Using Single Dataset Mode
+2023-08-24 08:01:29,502:Using Single Dataset Mode
+2023-08-24 08:03:08,131:Using Single Dataset Mode
+2023-08-24 08:04:51,132:Using Single Dataset Mode
+2023-08-24 08:04:51,138:Using cache
+2023-08-24 08:10:10,454:Using Single Dataset Mode
+2023-08-24 08:15:29,052:Using Single Dataset Mode
+2023-08-24 08:15:29,060:Using cache
+2023-08-24 08:17:31,506:Using Single Dataset Mode
+2023-08-24 08:19:49,714:Using Single Dataset Mode
+2023-08-24 18:42:47,928:Using Single Dataset Mode
+2023-08-24 18:46:27,220:Using Single Dataset Mode
+2023-08-24 18:49:34,812:Using Single Dataset Mode
+2023-08-24 18:50:59,294:Using Single Dataset Mode
+2023-08-24 18:52:13,936:Using Single Dataset Mode
+2023-08-24 18:52:13,942:Using cache
+2023-08-24 18:53:35,540:Using Single Dataset Mode
+2023-08-24 18:54:55,961:Using Single Dataset Mode
+2023-08-24 18:56:59,520:Using Single Dataset Mode
+2023-08-24 18:58:22,133:Using Single Dataset Mode
+2023-08-24 19:00:13,836:Using Single Dataset Mode
+2023-08-24 19:01:23,903:Using Single Dataset Mode
+2023-08-24 20:23:51,453:Using Single Dataset Mode
+2023-08-24 20:24:59,017:Using Single Dataset Mode
+2023-08-24 20:26:46,678:Using Single Dataset Mode
+2023-08-24 20:27:59,157:Using Single Dataset Mode
+2023-08-24 20:29:31,861:Using Single Dataset Mode
+2023-08-24 20:30:48,436:Using Single Dataset Mode
+2023-08-24 20:33:15,450:Using Single Dataset Mode
+2023-08-24 20:34:29,544:Using Single Dataset Mode
+2023-08-25 08:41:31,588:Using Single Dataset Mode
+2023-08-25 08:42:41,115:Using Single Dataset Mode
+2023-08-25 08:44:16,584:Using Single Dataset Mode
+2023-09-26 00:37:43,807:Using Single Dataset Mode
+2023-09-26 02:26:14,675:Using Single Dataset Mode
+2023-09-26 02:59:35,715:Using Single Dataset Mode
+2023-09-26 02:59:35,729:Using cache
+2023-09-26 03:00:09,840:Using Single Dataset Mode
+2023-09-26 03:00:09,843:Using cache
+2023-09-26 03:07:14,181:Using Single Dataset Mode
+2023-09-26 03:07:14,191:Using cache
+2023-09-26 03:15:33,456:Using Single Dataset Mode
+2023-09-26 03:15:33,470:Using cache
+2023-09-26 03:33:45,719:Using Single Dataset Mode
+2023-09-26 03:33:45,755:Using cache
+2023-09-26 03:35:05,699:Using Single Dataset Mode
+2023-09-26 05:46:30,460:Using Single Dataset Mode
+2023-09-26 05:46:30,460:Using cache

log_files/dataset_statistics.log ADDED Viewed

	@@ -0,0 +1,4 @@

+2023-08-23 17:29:50,216:Loaded dataset from disk
+2023-08-23 17:43:38,040:Loaded dataset from disk
+2023-08-24 18:52:13,955:Loaded dataset from disk
+2023-09-26 05:46:30,524:Loaded dataset from disk

log_files/npmi.log ADDED Viewed

File without changes

log_files/zipf.log ADDED Viewed

File without changes

run.sh ADDED Viewed

	@@ -0,0 +1,110 @@

+#!/usr/bin/env bash
+python3 run_data_measurements.py --dataset="hate_speech18" --config="default" --split="train" --label_field="label" --feature="text"
+python3 run_data_measurements.py --dataset="hate_speech_offensive" --config="default" --split="train" --label_field="label" --feature="tweet"
+python3 run_data_measurements.py --dataset="imdb" --config="plain_text" --split="train" --label_field="label" --feature="text"
+python3 run_data_measurements.py --dataset="imdb" --config="plain_text" --split="unsupervised" --label_field="label" --feature="text"
+python3 run_data_measurements.py --dataset="glue" --config="cola" --split="train" --label_field="label" --feature="sentence"
+python3 run_data_measurements.py --dataset="glue" --config="cola" --split="validation" --label_field="label" --feature="sentence"
+python3 run_data_measurements.py --dataset="glue" --config="mnli" --split="train" --label_field="label" --feature="hypothesis"
+python3 run_data_measurements.py --dataset="glue" --config="mnli" --split="train" --label_field="label" --feature="premise"
+python3 run_data_measurements.py --dataset="glue" --config="mnli" --split="validation_matched" --label_field="label" --feature="premise"
+python3 run_data_measurements.py --dataset="glue" --config="mnli" --split="validation_matched" --label_field="label" --feature="hypothesis"
+python3 run_data_measurements.py --dataset="glue" --config="mnli" --split="validation_mismatched" --label_field="label" --feature="premise"
+python3 run_data_measurements.py --dataset="glue" --config="mnli" --split="validation_mismatched" --label_field="label" --feature="hypothesis"
+python3 run_data_measurements.py --dataset="glue" --config="mrpc" --split="train" --label_field="label" --feature="sentence1"
+python3 run_data_measurements.py --dataset="glue" --config="mrpc" --split="train" --label_field="label" --feature="sentence2"
+python3 run_data_measurements.py --dataset="glue" --config="mrpc" --split="validation" --label_field="label" --feature="sentence1"
+python3 run_data_measurements.py --dataset="glue" --config="mrpc" --split="validation" --label_field="label" --feature="sentence2"
+python3 run_data_measurements.py --dataset="glue" --config="rte" --split="train" --label_field="label" --feature="sentence1"
+python3 run_data_measurements.py --dataset="glue" --config="rte" --split="train" --label_field="label" --feature="sentence2"
+python3 run_data_measurements.py --dataset="glue" --config="rte" --split="validation" --label_field="label" --feature="sentence1"
+python3 run_data_measurements.py --dataset="glue" --config="rte" --split="validation" --label_field="label" --feature="sentence2"
+python3 run_data_measurements.py --dataset="glue" --config="stsb" --split="train" --label_field="label" --feature="sentence1"
+python3 run_data_measurements.py --dataset="glue" --config="stsb" --split="train" --label_field="label" --feature="sentence2"
+python3 run_data_measurements.py --dataset="glue" --config="stsb" --split="validation" --label_field="label" --feature="sentence1"
+python3 run_data_measurements.py --dataset="glue" --config="stsb" --split="validation" --label_field="label" --feature="sentence2"
+python3 run_data_measurements.py --dataset="glue" --config="wnli" --split="train" --label_field="label" --feature="sentence1"
+python3 run_data_measurements.py --dataset="glue" --config="wnli" --split="train" --label_field="label" --feature="sentence2"
+python3 run_data_measurements.py --dataset="glue" --config="wnli" --split="validation" --label_field="label" --feature="sentence1"
+python3 run_data_measurements.py --dataset="glue" --config="wnli" --split="validation" --label_field="label" --feature="sentence2"
+python3 run_data_measurements.py --dataset="glue" --config="sst2" --split="train" --label_field="label" --feature="sentence"
+python3 run_data_measurements.py --dataset="glue" --config="sst2" --split="validation" --label_field="label" --feature="sentence"
+python3 run_data_measurements.py --dataset="glue" --config="qnli" --split="train" --label_field="label" --feature="question"
+python3 run_data_measurements.py --dataset="glue" --config="qnli" --split="train" --label_field="label" --feature="sentence"
+python3 run_data_measurements.py --dataset="glue" --config="qnli" --split="validation" --label_field="label" --feature="question"
+python3 run_data_measurements.py --dataset="glue" --config="qnli" --split="validation" --label_field="label" --feature="sentence"
+python3 run_data_measurements.py --dataset="glue" --config="qqp" --split="train" --label_field="label" --feature="question1"
+python3 run_data_measurements.py --dataset="glue" --config="qqp" --split="train" --label_field="label" --feature="question2"
+python3 run_data_measurements.py --dataset="glue" --config="qqp" --split="validation" --label_field="label" --feature="question1"
+python3 run_data_measurements.py --dataset="glue" --config="qqp" --split="validation" --label_field="label" --feature="question2"
+python3 run_data_measurements.py --dataset="glue" --config="mnli_matched" --split="validation" --label_field="label" --feature="hypothesis"
+python3 run_data_measurements.py --dataset="glue" --config="mnli_matched" --split="validation" --label_field="label" --feature="premise"
+python3 run_data_measurements.py --dataset="glue" --config="mnli_mismatched" --split="validation" --label_field="label" --feature="hypothesis"
+python3 run_data_measurements.py --dataset="glue" --config="mnli_mismatched" --split="validation" --label_field="label" --feature="premise"
+python3 run_data_measurements.py --dataset="wikitext" --config="wikitext-103-v1" --split="train" --feature="text"
+python3 run_data_measurements.py --dataset="wikitext" --config="wikitext-103-raw-v1" --split="train" --feature="text"
+python3 run_data_measurements.py --dataset="wikitext" --config="wikitext-2-v1" --split="train" --feature="text"
+python3 run_data_measurements.py --dataset="wikitext" --config="wikitext-2-raw-v1" --split="train" --feature="text"
+python3 run_data_measurements.py --dataset="wikitext" --config="wikitext-103-v1" --split="validation" --feature="text"
+python3 run_data_measurements.py --dataset="wikitext" --config="wikitext-103-raw-v1" --split="validation" --feature="text"
+python3 run_data_measurements.py --dataset="wikitext" --config="wikitext-2-v1" --split="validation" --feature="text"
+python3 run_data_measurements.py --dataset="wikitext" --config="wikitext-2-raw-v1" --split="validation" --feature="text"
+# Superglue wsc? wic? rte? record? multirc?
+python3 run_data_measurements.py --dataset="super_glue" --config="boolq" --split="train" --label_field="label" --feature="question"
+python3 run_data_measurements.py --dataset="super_glue" --config="boolq" --split="validation" --label_field="label" --feature="question"
+python3 run_data_measurements.py --dataset="super_glue" --config="boolq" --split="train" --label_field="label" --feature="passage"
+python3 run_data_measurements.py --dataset="super_glue" --config="boolq" --split="validation" --label_field="label" --feature="passage"
+python3 run_data_measurements.py --dataset="super_glue" --config="cb" --split="train" --label_field="label" --feature="premise"
+python3 run_data_measurements.py --dataset="super_glue" --config="cb" --split="validation" --label_field="label" --feature="premise"
+python3 run_data_measurements.py --dataset="super_glue" --config="cb" --split="train" --label_field="label" --feature="hypothesis"
+python3 run_data_measurements.py --dataset="super_glue" --config="cb" --split="validation" --label_field="label" --feature="hypothesis"
+python3 run_data_measurements.py --dataset="super_glue" --config="copa" --split="train" --label_field="label" --feature="premise"
+python3 run_data_measurements.py --dataset="super_glue" --config="copa" --split="validation" --label_field="label" --feature="premise"
+python3 run_data_measurements.py --dataset="super_glue" --config="copa" --split="train" --label_field="label" --feature="choice1"
+python3 run_data_measurements.py --dataset="super_glue" --config="copa" --split="validation" --label_field="label" --feature="choice1"
+python3 run_data_measurements.py --dataset="super_glue" --config="copa" --split="train" --label_field="label" --feature="choice2"
+python3 run_data_measurements.py --dataset="super_glue" --config="copa" --split="validation" --label_field="label" --feature="choice2"
+python3 run_data_measurements.py --dataset="super_glue" --config="copa" --split="train" --label_field="label" --feature="question"
+python3 run_data_measurements.py --dataset="super_glue" --config="copa" --split="validation" --label_field="label" --feature="question"
+python3 run_data_measurements.py --dataset="squad" --config="plain_text" --split="train" --feature="context"
+python3 run_data_measurements.py --dataset="squad" --config="plain_text" --split="train" --feature="question"
+python3 run_data_measurements.py --dataset="squad" --config="plain_text" --split="train" --feature="title"
+python3 run_data_measurements.py --dataset="squad" --config="plain_text" --split="validation" --feature="context"
+python3 run_data_measurements.py --dataset="squad" --config="plain_text" --split="validation" --feature="question"
+python3 run_data_measurements.py --dataset="squad" --config="plain_text" --split="validation" --feature="title"
+python3 run_data_measurements.py --dataset="squad_v2" --config="squad_v2" --split="train" --feature="context"
+python3 run_data_measurements.py --dataset="squad_v2" --config="squad_v2" --split="train" --feature="question"
+python3 run_data_measurements.py --dataset="squad_v2" --config="squad_v2" --split="train" --feature="title"
+python3 run_data_measurements.py --dataset="squad_v2" --config="squad_v2" --split="validation" --feature="context"
+python3 run_data_measurements.py --dataset="squad_v2" --config="squad_v2" --split="validation" --feature="question"
+python3 run_data_measurements.py --dataset="squad_v2" --config="squad_v2" --split="validation" --feature="title"

run_data_measurements.py ADDED Viewed

	@@ -0,0 +1,296 @@

+import argparse
+import json
+import textwrap
+from os import mkdir
+from os.path import join as pjoin, isdir
+from data_measurements import dataset_statistics
+from data_measurements import dataset_utils
+def load_or_prepare_widgets(ds_args, show_embeddings=False, use_cache=False):
+    """
+    Loader specifically for the widgets used in the app -- does not compute
+    intermediate files, unless they are not there and are needed for a file
+    used in the UI.
+    Does not take specifications from user; does all widgets.
+    Args:
+        ds_args: Dataset configuration settings (config name, split, etc)
+        show_embeddings: Whether to compute embeddings (slow)
+        use_cache: Whether to grab files that have already been computed
+    Returns:
+        Saves files to disk in cache_dir, if user has not specified another dir.
+    """
+    if not isdir(ds_args["cache_dir"]):
+        print("Creating cache")
+        # We need to preprocess everything.
+        # This should eventually all go into a prepare_dataset CLI
+        mkdir(ds_args["cache_dir"])
+    dstats = dataset_statistics.DatasetStatisticsCacheClass(**ds_args,
+                                                            use_cache=use_cache)
+    # Embeddings widget
+    dstats.load_or_prepare_dataset()
+    # Header widget
+    dstats.load_or_prepare_dset_peek()
+    # General stats widget
+    dstats.load_or_prepare_general_stats()
+    # Labels widget
+    try:
+        dstats.set_label_field(ds_args['label_field'])
+        dstats.load_or_prepare_labels()
+    except:
+        pass
+    # Text lengths widget
+    dstats.load_or_prepare_text_lengths()
+    if show_embeddings:
+        # Embeddings widget
+        dstats.load_or_prepare_embeddings()
+    # Text duplicates widget
+    dstats.load_or_prepare_text_duplicates()
+    # nPMI widget
+    dstats.load_or_prepare_npmi()
+    npmi_stats = dstats.npmi_stats
+    # Handling for all pairs; in the UI, people select.
+    do_npmi(npmi_stats)
+    # Zipf widget
+    dstats.load_or_prepare_zipf()
+def load_or_prepare(dataset_args, use_cache=False):
+    """
+    Users can specify which aspects of the dataset they would like to compute.
+    This additionally computes intermediate files not used in the UI.
+    If the calculation flag is not specified by the user (-w), calculates all
+    except for embeddings, as those are quite time consuming so should be
+    specified separately.
+    Args:
+        dataset_args: Dataset configuration settings (config name, split, etc)
+        use_cache: Whether to grab files that have already been computed
+    Returns:
+        Saves files to disk in cache_dir, if user has not specified another dir.
+    """
+    all = False
+    dstats = dataset_statistics.DatasetStatisticsCacheClass(**dataset_args,
+                                                            use_cache=use_cache)
+    print("Loading dataset.")
+    dstats.load_or_prepare_dataset()
+    print("Dataset loaded.  Preparing vocab.")
+    dstats.load_or_prepare_vocab()
+    print("Vocab prepared.")
+    if not dataset_args["calculation"]:
+        all = True
+    if all or dataset_args["calculation"] == "general":
+        print("\n* Calculating general statistics.")
+        dstats.load_or_prepare_general_stats()
+        print("Done!")
+        print("Basic text statistics now available at %s." %
+              dstats.general_stats_json_fid)
+        print(
+            "Text duplicates now available at %s." % dstats.dup_counts_df_fid
+        )
+    if all or dataset_args["calculation"] == "lengths":
+        print("\n* Calculating text lengths.")
+        dstats.load_or_prepare_text_lengths()
+        print("Done!")
+    if all or dataset_args["calculation"] == "labels":
+        if not dstats.label_field:
+            print("Warning: You asked for label calculation, but didn't "
+                  "provide the labels field name.  Assuming it is 'label'...")
+            dstats.set_label_field("label")
+        else:
+            print("\n* Calculating label distribution.")
+            dstats.load_or_prepare_labels()
+            fig_label_html = pjoin(dstats.cache_path, "labels_fig.html")
+            fig_label_json = pjoin(dstats.cache_path, "labels.json")
+            dstats.fig_labels.write_html(fig_label_html)
+            with open(fig_label_json, "w+") as f:
+                json.dump(dstats.fig_labels.to_json(), f)
+            print("Done!")
+            print("Label distribution now available at %s." %
+                  dstats.label_dset_fid)
+            print("Figure saved to %s." % fig_label_html)
+    if all or dataset_args["calculation"] == "npmi":
+        print("\n* Preparing nPMI.")
+        npmi_stats = dataset_statistics.nPMIStatisticsCacheClass(
+            dstats, use_cache=use_cache
+        )
+        do_npmi(npmi_stats)
+        print("Done!")
+        print(
+            "nPMI results now available in %s for all identity terms that "
+            "occur more than 10 times and all words that "
+            "co-occur with both terms."
+            % npmi_stats.pmi_cache_path
+        )
+    if all or dataset_args["calculation"] == "zipf":
+        print("\n* Preparing Zipf.")
+        zipf_fig_fid = pjoin(dstats.cache_path, "zipf_fig.html")
+        zipf_json_fid = pjoin(dstats.cache_path, "zipf_fig.json")
+        dstats.load_or_prepare_zipf()
+        zipf_fig = dstats.zipf_fig
+        with open(zipf_json_fid, "w+") as f:
+            json.dump(zipf_fig.to_json(), f)
+        zipf_fig.write_html(zipf_fig_fid)
+        print("Done!")
+        print("Zipf results now available at %s." % dstats.zipf_fid)
+        print(
+            "Figure saved to %s, with corresponding json at %s."
+            % (zipf_fig_fid, zipf_json_fid)
+        )
+    # Don't do this one until someone specifically asks for it -- takes awhile.
+    if dataset_args["calculation"] == "embeddings":
+        print("\n* Preparing text embeddings.")
+        dstats.load_or_prepare_embeddings()
+def do_npmi(npmi_stats):
+    available_terms = npmi_stats.load_or_prepare_npmi_terms()
+    completed_pairs = {}
+    print("Iterating through terms for joint npmi.")
+    for term1 in available_terms:
+        for term2 in available_terms:
+            if term1 != term2:
+                sorted_terms = tuple(sorted([term1, term2]))
+                if sorted_terms not in completed_pairs:
+                    term1, term2 = sorted_terms
+                    print("Computing nPMI statistics for %s and %s" % (term1, term2))
+                    _ = npmi_stats.load_or_prepare_joint_npmi(sorted_terms)
+                    completed_pairs[tuple(sorted_terms)] = {}
+def get_text_label_df(
+    ds_name,
+    config_name,
+    split_name,
+    text_field,
+    label_field,
+    calculation,
+    out_dir,
+    use_cache=True,
+):
+    if not use_cache:
+        print("Not using any cache; starting afresh")
+    ds_name_to_dict = dataset_utils.get_dataset_info_dicts(ds_name)
+    if label_field:
+        label_field, label_names = (
+            ds_name_to_dict[ds_name][config_name]["features"][label_field][0]
+            if len(ds_name_to_dict[ds_name][config_name]["features"][label_field]) > 0
+            else ((), [])
+        )
+    else:
+        label_field = ()
+        label_names = []
+    dataset_args = {
+        "dset_name": ds_name,
+        "dset_config": config_name,
+        "split_name": split_name,
+        "text_field": text_field,
+        "label_field": label_field,
+        "label_names": label_names,
+        "calculation": calculation,
+        "cache_dir": out_dir,
+    }
+    load_or_prepare(dataset_args, use_cache=use_cache)
+def main():
+    # TODO: Make this the Hugging Face arg parser
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        description=textwrap.dedent(
+            """
+         Example for hate speech18 dataset:
+         python3 run_data_measurements.py --dataset="hate_speech18" --config="default" --split="train" --feature="text"
+         Example for IMDB dataset:
+         python3 run_data_measurements.py --dataset="imdb" --config="plain_text" --split="train" --label_field="label" --feature="text"
+         """
+        ),
+    )
+    parser.add_argument(
+        "-d", "--dataset", required=True, help="Name of dataset to prepare"
+    )
+    parser.add_argument(
+        "-c", "--config", required=True, help="Dataset configuration to prepare"
+    )
+    parser.add_argument(
+        "-s", "--split", required=True, type=str, help="Dataset split to prepare"
+    )
+    parser.add_argument(
+        "-f",
+        "--feature",
+        required=True,
+        type=str,
+        default="text",
+        help="Text column to prepare",
+    )
+    parser.add_argument(
+        "-w",
+        "--calculation",
+        help="""What to calculate (defaults to everything except embeddings).\n
+                                                    Options are:\n
+                                                    - `general` (for duplicate counts, missing values, length statistics.)\n
+                                                    - `lengths` for text length distribution\n
+                                                    - `labels` for label distribution\n
+                                                    - `embeddings` (Warning: Slow.)\n
+                                                    - `npmi` for word associations\n
+                                                    - `zipf` for zipfian statistics
+                                                    """,
+    )
+    parser.add_argument(
+        "-l",
+        "--label_field",
+        type=str,
+        required=False,
+        default="",
+        help="Field name for label column in dataset (Required if there is a label field that you want information about)",
+    )
+    parser.add_argument(
+        "--cached",
+        default=False,
+        required=False,
+        action="store_true",
+        help="Whether to use cached files (Optional)",
+    )
+    parser.add_argument(
+        "--do_html",
+        default=False,
+        required=False,
+        action="store_true",
+        help="Whether to write out corresponding HTML files (Optional)",
+    )
+    parser.add_argument("--out_dir", default="cache_dir", help="Where to write out to.")
+    args = parser.parse_args()
+    print("Proceeding with the following arguments:")
+    print(args)
+    # run_data_measurements.py -d hate_speech18 -c default -s train -f text -w npmi
+    get_text_label_df(args.dataset, args.config, args.split, args.feature,
+                      args.label_field, args.calculation, args.out_dir,
+                      use_cache=args.cached)
+    print()
+if __name__ == "__main__":
+    main()

temp.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff