Spaces:

huggingface
/

data-measurements-tool

Build error

App Files Files Community

meg-huggingface committed on Dec 7, 2021

Commit

e8ac901

•

1 Parent(s): 2981bb2

Merging back dataset statistics

Browse files

Files changed (1) hide show

data_measurements/dataset_statistics.py +1313 -0

data_measurements/dataset_statistics.py ADDED Viewed

	@@ -0,0 +1,1313 @@

+# Copyright 2021 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import json
+import logging
+import statistics
+import torch
+from os import mkdir
+from os.path import exists, isdir
+from os.path import join as pjoin
+import nltk
+import numpy as np
+import pandas as pd
+import plotly
+import plotly.express as px
+import plotly.figure_factory as ff
+import plotly.graph_objects as go
+import pyarrow.feather as feather
+import matplotlib.pyplot as plt
+import matplotlib.image as mpimg
+import seaborn as sns
+from datasets import load_from_disk
+from nltk.corpus import stopwords
+from sklearn.feature_extraction.text import CountVectorizer
+from .dataset_utils import (
+TOT_WORDS,
+TOT_OPEN_WORDS,
+    CNT,
+    DEDUP_TOT,
+    EMBEDDING_FIELD,
+    LENGTH_FIELD,
+    OUR_LABEL_FIELD,
+    OUR_TEXT_FIELD,
+    PROP,
+    TEXT_NAN_CNT,
+    TOKENIZED_FIELD,
+    TXT_LEN,
+    VOCAB,
+    WORD,
+    extract_field,
+    load_truncated_dataset,
+)
+from .embeddings import Embeddings
+from .npmi import nPMI
+from .zipf import Zipf
+pd.options.display.float_format = "{:,.3f}".format
+logs = logging.getLogger(__name__)
+logs.setLevel(logging.WARNING)
+logs.propagate = False
+if not logs.handlers:
+    # Logging info to log file
+    file = logging.FileHandler("./log_files/dataset_statistics.log")
+    fileformat = logging.Formatter("%(asctime)s:%(message)s")
+    file.setLevel(logging.INFO)
+    file.setFormatter(fileformat)
+    # Logging debug messages to stream
+    stream = logging.StreamHandler()
+    streamformat = logging.Formatter("[data_measurements_tool] %(message)s")
+    stream.setLevel(logging.WARNING)
+    stream.setFormatter(streamformat)
+    logs.addHandler(file)
+    logs.addHandler(stream)
+# TODO: Read this in depending on chosen language / expand beyond english
+# NOTE: this download call runs at import time.
+nltk.download("stopwords")
+# Closed-class ("stop") words: NLTK's English stopword list, extended with
+# contraction fragments that tokenization leaves behind (e.g. "don", "t",
+# "ll"), a few extra function words, and the number strings "0" through "20".
+_CLOSED_CLASS = (
+    stopwords.words("english")
+    + [
+        "t",
+        "n",
+        "ll",
+        "d",
+        "wasn",
+        "weren",
+        "won",
+        "aren",
+        "wouldn",
+        "shouldn",
+        "didn",
+        "don",
+        "hasn",
+        "ain",
+        "couldn",
+        "doesn",
+        "hadn",
+        "haven",
+        "isn",
+        "mightn",
+        "mustn",
+        "needn",
+        "shan",
+        "would",
+        "could",
+        "dont",
+        "u",
+    ]
+    + [str(i) for i in range(0, 21)]
+)
+# Default term list offered for nPMI comparisons (see nPMIStatisticsCacheClass).
+_IDENTITY_TERMS = [
+    "man",
+    "woman",
+    "non-binary",
+    "gay",
+    "lesbian",
+    "queer",
+    "trans",
+    "straight",
+    "cis",
+    "she",
+    "her",
+    "hers",
+    "he",
+    "him",
+    "his",
+    "they",
+    "them",
+    "their",
+    "theirs",
+    "himself",
+    "herself",
+]
+# treating inf values as NaN as well
+# NOTE(review): "use_inf_as_na" appears to be deprecated in recent pandas
+# releases -- confirm against the pinned pandas version.
+pd.set_option("use_inf_as_na", True)
+# Minimum number of occurrences for a word to be used in word-count-based
+# calculations (stored on the stats object as min_vocab_count).
+_MIN_VOCAB_COUNT = 10
+# Presumably parameters of the hierarchical clustering tree; they are not
+# used in the part of the file shown here -- TODO confirm.
+_TREE_DEPTH = 12
+_TREE_MIN_NODES = 250
+# as long as we're using sklearn - already pushing the resources
+_MAX_CLUSTER_EXAMPLES = 5000
+_NUM_VOCAB_BATCHES = 2000
+# Number of rows kept in the sorted top-vocabulary table.
+_TOP_N = 100
+# Shared vectorizer; its tokenizer (runs of word characters between word
+# boundaries) is what do_tokenization() uses to split text.
+_CVEC = CountVectorizer(token_pattern="(?u)\\b\\w+\\b", lowercase=True)
+class DatasetStatisticsCacheClass:
+    def __init__(
+        self,
+        cache_dir,
+        dset_name,
+        dset_config,
+        split_name,
+        text_field,
+        label_field,
+        label_names,
+        calculation=None,
+        use_cache=False,
+    ):
+        """
+        Sets up empty result holders and the cache file names for one
+        dataset / config / split / text-field combination.
+        Creates the cache directory if it does not exist yet (a single
+        mkdir call, so cache_dir itself must already exist).
+        Args:
+            cache_dir: Directory under which the per-dataset cache
+                directory is created.
+            dset_name: Name of the Hugging Face dataset.
+            dset_config: Name of the dataset config.
+            split_name: Name of the split to analyze.
+            text_field: Text field to analyze; a string, or a sequence of
+                strings that is joined with "-" for the cache path.
+            label_field: Label field(s) to analyze.
+            label_names: Names of the label classes.
+            calculation: Only used for standalone runs of a single
+                measurement; stored as-is.
+            use_cache: Whether to read previously stored results when the
+                cache files exist.
+        """
+        # This is only used for standalone runs for each kind of measurement.
+        self.calculation = calculation
+        self.our_text_field = OUR_TEXT_FIELD
+        self.our_length_field = LENGTH_FIELD
+        self.our_label_field = OUR_LABEL_FIELD
+        self.our_tokenized_field = TOKENIZED_FIELD
+        self.our_embedding_field = EMBEDDING_FIELD
+        self.cache_dir = cache_dir
+        # Use stored data if there; otherwise calculate afresh
+        self.use_cache = use_cache
+        ### What are we analyzing?
+        # name of the Hugging Face dataset
+        self.dset_name = dset_name
+        # name of the dataset config
+        self.dset_config = dset_config
+        # name of the split to analyze
+        self.split_name = split_name
+        # TODO: Should this be "feature" ?
+        # which text fields are we analysing?
+        self.text_field = text_field
+        # which label fields are we analysing?
+        self.label_field = label_field
+        # what are the names of the classes?
+        self.label_names = label_names
+        ## Hugging Face dataset objects
+        self.dset = None  # original dataset
+        # HF dataset with all of the self.text_field instances in self.dset
+        self.text_dset = None
+        self.dset_peek = None
+        # HF dataset with text embeddings in the same order as self.text_dset
+        self.embeddings_dset = None
+        # HF dataset with all of the self.label_field instances in self.dset
+        self.label_dset = None
+        ## Data frames
+        # Tokenized text
+        self.tokenized_df = None
+        # save sentence length histogram in the class so it doesn't get re-computed
+        self.length_df = None
+        self.fig_tok_length = None
+        # Data Frame version of self.label_dset
+        self.label_df = None
+        # save label pie chart in the class so it doesn't get re-computed
+        self.fig_labels = None
+        # Vocabulary with word counts in the dataset
+        self.vocab_counts_df = None
+        # Vocabulary filtered to remove stopwords
+        self.vocab_counts_filtered_df = None
+        self.sorted_top_vocab_df = None
+        ## General statistics and duplicates
+        self.total_words = 0
+        self.total_open_words = 0
+        # Number of NaN values (NOT empty strings)
+        self.text_nan_count = 0
+        # Number of text items that appear more than once in the dataset
+        self.dedup_total = 0
+        # Duplicated text items along with their number of occurrences ("count")
+        self.dup_counts_df = None
+        self.avg_length = None
+        self.std_length = None
+        self.general_stats_dict = None
+        self.num_uniq_lengths = 0
+        # clustering text by embeddings
+        # the hierarchical clustering tree is represented as a list of nodes,
+        # the first is the root
+        self.node_list = []
+        # save tree figure in the class so it doesn't get re-computed
+        self.fig_tree = None
+        # keep Embeddings object around to explore clusters
+        self.embeddings = None
+        # nPMI
+        # Holds a nPMIStatisticsCacheClass object
+        self.npmi_stats = None
+        # TODO: Have lowercase be an option for a user to set.
+        self.to_lowercase = True
+        # The minimum amount of times a word should occur to be included in
+        # word-count-based calculations (currently just relevant to nPMI)
+        self.min_vocab_count = _MIN_VOCAB_COUNT
+        # zipf
+        self.z = None
+        self.zipf_fig = None
+        self.cvec = _CVEC
+        # File definitions
+        # path to the directory used for caching
+        # Several text fields are joined with "-" to build the directory name.
+        if not isinstance(text_field, str):
+            text_field = "-".join(text_field)
+        #if isinstance(label_field, str):
+        #    label_field = label_field
+        #else:
+        #    label_field = "-".join(label_field)
+        self.cache_path = pjoin(
+            self.cache_dir,
+            f"{dset_name}_{dset_config}_{split_name}_{text_field}", #{label_field},
+        )
+        if not isdir(self.cache_path):
+            logs.warning("Creating cache directory %s." % self.cache_path)
+            mkdir(self.cache_path)
+        # Cache files not needed for UI
+        self.dset_fid = pjoin(self.cache_path, "base_dset")
+        self.tokenized_df_fid = pjoin(self.cache_path, "tokenized_df.feather")
+        self.label_dset_fid = pjoin(self.cache_path, "label_dset")
+        # Needed for UI -- embeddings
+        self.text_dset_fid = pjoin(self.cache_path, "text_dset")
+        # Needed for UI
+        self.dset_peek_json_fid = pjoin(self.cache_path, "dset_peek.json")
+        ## Label cache files.
+        # Needed for UI
+        self.fig_labels_json_fid = pjoin(self.cache_path, "fig_labels.json")
+        ## Length cache files
+        # Needed for UI
+        self.length_df_fid = pjoin(self.cache_path, "length_df.feather")
+        # Needed for UI
+        self.length_stats_json_fid = pjoin(self.cache_path, "length_stats.json")
+        self.vocab_counts_df_fid = pjoin(self.cache_path, "vocab_counts.feather")
+        # Needed for UI
+        self.dup_counts_df_fid = pjoin(self.cache_path, "dup_counts_df.feather")
+        # Needed for UI
+        self.fig_tok_length_fid = pjoin(self.cache_path, "fig_tok_length.json")
+        ## General text stats
+        # Needed for UI
+        self.general_stats_json_fid = pjoin(self.cache_path, "general_stats_dict.json")
+        # Needed for UI
+        self.sorted_top_vocab_df_fid = pjoin(self.cache_path,
+                                             "sorted_top_vocab.feather")
+        ## Zipf cache files
+        # Needed for UI
+        self.zipf_fid = pjoin(self.cache_path, "zipf_basic_stats.json")
+        # Needed for UI
+        self.zipf_fig_fid = pjoin(self.cache_path, "zipf_fig.json")
+        ## Embeddings cache files
+        # Needed for UI
+        self.node_list_fid = pjoin(self.cache_path, "node_list.th")
+        # Needed for UI
+        self.fig_tree_json_fid = pjoin(self.cache_path, "fig_tree.json")
+        self.zipf_counts = None
+        # Deployment flag; see set_deployment().
+        self.live = False
+    def set_deployment(self, live=True):
+        """
+        Function that we can hit when we deploy, so that cache files are not
+        written out/recalculated, but instead that part of the UI can be punted.
+        """
+        self.live = live
+    def get_base_dataset(self):
+        """Gets a pointer to the truncated base dataset object."""
+        if not self.dset:
+            self.dset = load_truncated_dataset(
+                self.dset_name,
+                self.dset_config,
+                self.split_name,
+                cache_name=self.dset_fid,
+                use_cache=True,
+                use_streaming=True,
+            )
+    def load_or_prepare_general_stats(self, save=True):
+        """
+        Content for expander_general_stats widget.
+        Provides statistics for total words, total open words,
+        the sorted top vocab, the NaN count, and the duplicate count.
+        Args:
+        Returns:
+        """
+        # General statistics
+        if (
+            self.use_cache
+            and exists(self.general_stats_json_fid)
+            and exists(self.dup_counts_df_fid)
+            and exists(self.sorted_top_vocab_df_fid)
+        ):
+            logs.info('Loading cached general stats')
+            self.load_general_stats()
+        else:
+            if not self.live:
+                logs.info('Preparing general stats')
+                self.prepare_general_stats()
+                if save:
+                    write_df(self.sorted_top_vocab_df, self.sorted_top_vocab_df_fid)
+                    write_df(self.dup_counts_df, self.dup_counts_df_fid)
+                    write_json(self.general_stats_dict, self.general_stats_json_fid)
+    def load_or_prepare_text_lengths(self, save=True):
+        """
+        The text length widget relies on this function, which provides
+        a figure of the text lengths, some text length statistics, and
+        a text length dataframe to peruse.
+        Args:
+            save:
+        Returns:
+        """
+        # Text length figure
+        if (self.use_cache and exists(self.fig_tok_length_fid)):
+            self.fig_tok_length_png = mpimg.imread(self.fig_tok_length_fid)
+            self.fig_tok_length = read_plotly(self.fig_tok_length_fid)
+        else:
+            if not self.live:
+                self.prepare_fig_text_lengths()
+                if save:
+                    write_plotly(self.fig_tok_length, self.fig_tok_length_fid)
+        # Text length dataframe
+        if self.use_cache and exists(self.length_df_fid):
+            self.length_df = feather.read_feather(self.length_df_fid)
+        else:
+            if not self.live:
+                self.prepare_length_df()
+                if save:
+                    write_df(self.length_df, self.length_df_fid)
+        # Text length stats.
+        if self.use_cache and exists(self.length_stats_json_fid):
+            with open(self.length_stats_json_fid, "r") as f:
+                self.length_stats_dict = json.load(f)
+            self.avg_length = self.length_stats_dict["avg length"]
+            self.std_length = self.length_stats_dict["std length"]
+            self.num_uniq_lengths = self.length_stats_dict["num lengths"]
+        else:
+            if not self.live:
+                self.prepare_text_length_stats()
+                if save:
+                    write_json(self.length_stats_dict, self.length_stats_json_fid)
+    def prepare_length_df(self):
+        if not self.live:
+            if self.tokenized_df is None:
+                self.tokenized_df = self.do_tokenization()
+            self.tokenized_df[LENGTH_FIELD] = self.tokenized_df[
+                TOKENIZED_FIELD].apply(len)
+            self.length_df = self.tokenized_df[
+                [LENGTH_FIELD, OUR_TEXT_FIELD]].sort_values(
+                by=[LENGTH_FIELD], ascending=True
+            )
+    def prepare_text_length_stats(self):
+        if not self.live:
+            if self.tokenized_df is None or LENGTH_FIELD not in self.tokenized_df.columns or self.length_df is None:
+                self.prepare_length_df()
+            avg_length = sum(self.tokenized_df[LENGTH_FIELD])/len(self.tokenized_df[LENGTH_FIELD])
+            self.avg_length = round(avg_length, 1)
+            std_length = statistics.stdev(self.tokenized_df[LENGTH_FIELD])
+            self.std_length = round(std_length, 1)
+            self.num_uniq_lengths = len(self.length_df["length"].unique())
+            self.length_stats_dict = {"avg length": self.avg_length,
+                                      "std length": self.std_length,
+                                      "num lengths": self.num_uniq_lengths}
+    def prepare_fig_text_lengths(self):
+        if not self.live:
+            if self.tokenized_df is None or LENGTH_FIELD not in self.tokenized_df.columns:
+                self.prepare_length_df()
+            self.fig_tok_length = make_fig_lengths(self.tokenized_df, LENGTH_FIELD)
+    def load_or_prepare_embeddings(self, save=True):
+        if self.use_cache and exists(self.node_list_fid) and exists(self.fig_tree_json_fid):
+            self.node_list = torch.load(self.node_list_fid)
+            self.fig_tree = read_plotly(self.fig_tree_json_fid)
+        elif self.use_cache and exists(self.node_list_fid):
+            self.node_list = torch.load(self.node_list_fid)
+            self.fig_tree = make_tree_plot(self.node_list,
+                                           self.text_dset)
+            if save:
+                write_plotly(self.fig_tree, self.fig_tree_json_fid)
+        else:
+            self.embeddings = Embeddings(self, use_cache=self.use_cache)
+            self.embeddings.make_hierarchical_clustering()
+            self.node_list = self.embeddings.node_list
+            self.fig_tree = make_tree_plot(self.node_list,
+                                           self.text_dset)
+            if save:
+                torch.save(self.node_list, self.node_list_fid)
+                write_plotly(self.fig_tree, self.fig_tree_json_fid)
+    # get vocab with word counts
+    def load_or_prepare_vocab(self, save=True):
+        """
+        Calculates the vocabulary count from the tokenized text.
+        The resulting dataframes may be used in nPMI calculations, zipf, etc.
+        :param
+        :return:
+        """
+        if (
+            self.use_cache
+            and exists(self.vocab_counts_df_fid)
+        ):
+            logs.info("Reading vocab from cache")
+            self.load_vocab()
+            self.vocab_counts_filtered_df = filter_vocab(self.vocab_counts_df)
+        else:
+            logs.info("Calculating vocab afresh")
+            if len(self.tokenized_df) == 0:
+                self.tokenized_df = self.do_tokenization()
+                if save:
+                    logs.info("Writing out.")
+                    write_df(self.tokenized_df, self.tokenized_df_fid)
+            word_count_df = count_vocab_frequencies(self.tokenized_df)
+            logs.info("Making dfs with proportion.")
+            self.vocab_counts_df = calc_p_word(word_count_df)
+            self.vocab_counts_filtered_df = filter_vocab(self.vocab_counts_df)
+            if save:
+                logs.info("Writing out.")
+                write_df(self.vocab_counts_df, self.vocab_counts_df_fid)
+        logs.info("unfiltered vocab")
+        logs.info(self.vocab_counts_df)
+        logs.info("filtered vocab")
+        logs.info(self.vocab_counts_filtered_df)
+    def load_vocab(self):
+        with open(self.vocab_counts_df_fid, "rb") as f:
+            self.vocab_counts_df = feather.read_feather(f)
+        # Handling for changes in how the index is saved.
+        self.vocab_counts_df = self._set_idx_col_names(self.vocab_counts_df)
+    def load_or_prepare_text_duplicates(self, save=True):
+        if self.use_cache and exists(self.dup_counts_df_fid):
+            with open(self.dup_counts_df_fid, "rb") as f:
+                self.dup_counts_df = feather.read_feather(f)
+        elif self.dup_counts_df is None:
+            if not self.live:
+                self.prepare_text_duplicates()
+                if save:
+                    write_df(self.dup_counts_df, self.dup_counts_df_fid)
+        else:
+            if not self.live:
+                # This happens when self.dup_counts_df is already defined;
+                # This happens when general_statistics were calculated first,
+                # since general statistics requires the number of duplicates
+                if save:
+                    write_df(self.dup_counts_df, self.dup_counts_df_fid)
+    def load_general_stats(self):
+        self.general_stats_dict = json.load(open(self.general_stats_json_fid, encoding="utf-8"))
+        with open(self.sorted_top_vocab_df_fid, "rb") as f:
+            self.sorted_top_vocab_df = feather.read_feather(f)
+        self.text_nan_count = self.general_stats_dict[TEXT_NAN_CNT]
+        self.dedup_total = self.general_stats_dict[DEDUP_TOT]
+        self.total_words = self.general_stats_dict[TOT_WORDS]
+        self.total_open_words = self.general_stats_dict[TOT_OPEN_WORDS]
+    def prepare_general_stats(self):
+        if not self.live:
+            if self.tokenized_df is None:
+                logs.warning("Tokenized dataset not yet loaded; doing so.")
+                self.load_or_prepare_dataset()
+            if self.vocab_counts_df is None:
+                logs.warning("Vocab not yet loaded; doing so.")
+                self.load_or_prepare_vocab()
+            self.sorted_top_vocab_df = self.vocab_counts_filtered_df.sort_values(
+                "count", ascending=False
+            ).head(_TOP_N)
+            self.total_words = len(self.vocab_counts_df)
+            self.total_open_words = len(self.vocab_counts_filtered_df)
+            self.text_nan_count = int(self.tokenized_df.isnull().sum().sum())
+            self.prepare_text_duplicates()
+            self.dedup_total = sum(self.dup_counts_df[CNT])
+            self.general_stats_dict = {
+                TOT_WORDS: self.total_words,
+                TOT_OPEN_WORDS: self.total_open_words,
+                TEXT_NAN_CNT: self.text_nan_count,
+                DEDUP_TOT: self.dedup_total,
+            }
+    def prepare_text_duplicates(self):
+        if not self.live:
+            if self.tokenized_df is None:
+                self.load_or_prepare_tokenized_df()
+            dup_df = self.tokenized_df[
+                self.tokenized_df.duplicated([OUR_TEXT_FIELD])]
+            self.dup_counts_df = pd.DataFrame(
+                dup_df.pivot_table(
+                    columns=[OUR_TEXT_FIELD], aggfunc="size"
+                ).sort_values(ascending=False),
+                columns=[CNT],
+            )
+            self.dup_counts_df[OUR_TEXT_FIELD] = self.dup_counts_df.index.copy()
+    def load_or_prepare_dataset(self, save=True):
+        """
+        Prepares the HF datasets and data frames containing the untokenized and
+        tokenized text as well as the label values.
+        self.tokenized_df is used further for calculating text lengths,
+        word counts, etc.
+        Args:
+            save: Store the calculated data to disk.
+        Returns:
+        """
+        logs.info("Doing text dset.")
+        self.load_or_prepare_text_dset(save)
+        logs.info("Doing tokenized dataframe")
+        self.load_or_prepare_tokenized_df(save)
+        logs.info("Doing dataset peek")
+        self.load_or_prepare_dset_peek(save)
+    def load_or_prepare_dset_peek(self, save=True):
+        if self.use_cache and exists(self.dset_peek_json_fid):
+            with open(self.dset_peek_json_fid, "r") as f:
+                self.dset_peek = json.load(f)["dset peek"]
+        else:
+            if self.dset is None:
+                self.get_base_dataset()
+            self.dset_peek = self.dset[:100]
+            if save:
+                write_json({"dset peek": self.dset_peek}, self.dset_peek_json_fid)
+    def load_or_prepare_tokenized_df(self, save=True):
+        if (self.use_cache and exists(self.tokenized_df_fid)):
+            self.tokenized_df = feather.read_feather(self.tokenized_df_fid)
+        else:
+            if not self.live:
+                # tokenize all text instances
+                self.tokenized_df = self.do_tokenization()
+                if save:
+                    logs.warning("Saving tokenized dataset to disk")
+                    # save tokenized text
+                    write_df(self.tokenized_df, self.tokenized_df_fid)
+    def load_or_prepare_text_dset(self, save=True):
+        if (self.use_cache and exists(self.text_dset_fid)):
+            # load extracted text
+            self.text_dset = load_from_disk(self.text_dset_fid)
+            logs.warning("Loaded dataset from disk")
+            logs.info(self.text_dset)
+        # ...Or load it from the server and store it anew
+        else:
+            if not self.live:
+                self.prepare_text_dset()
+                if save:
+                    # save extracted text instances
+                    logs.warning("Saving dataset to disk")
+                    self.text_dset.save_to_disk(self.text_dset_fid)
+    def prepare_text_dset(self):
+        if not self.live:
+            self.get_base_dataset()
+            # extract all text instances
+            self.text_dset = self.dset.map(
+                lambda examples: extract_field(
+                    examples, self.text_field, OUR_TEXT_FIELD
+                ),
+                batched=True,
+                remove_columns=list(self.dset.features),
+            )
+    def do_tokenization(self):
+        """
+        Tokenizes the dataset
+        :return:
+        """
+        if self.text_dset is None:
+            self.load_or_prepare_text_dset()
+        sent_tokenizer = self.cvec.build_tokenizer()
+        def tokenize_batch(examples):
+            # TODO: lowercase should be an option
+            res = {
+                TOKENIZED_FIELD: [
+                    tuple(sent_tokenizer(text.lower()))
+                    for text in examples[OUR_TEXT_FIELD]
+                ]
+            }
+            res[LENGTH_FIELD] = [len(tok_text) for tok_text in res[TOKENIZED_FIELD]]
+            return res
+        tokenized_dset = self.text_dset.map(
+            tokenize_batch,
+            batched=True,
+            # remove_columns=[OUR_TEXT_FIELD], keep around to print
+        )
+        tokenized_df = pd.DataFrame(tokenized_dset)
+        return tokenized_df
+    def set_label_field(self, label_field="label"):
+        """
+        Setter for label_field. Used in the CLI when a user asks for information
+         about labels, but does not specify the field;
+         'label' is assumed as a default.
+        """
+        self.label_field = label_field
+    def load_or_prepare_labels(self, save=True):
+        # TODO: This is in a transitory state for creating fig cache.
+        # Clean up to be caching and reading everything correctly.
+        """
+        Extracts labels from the Dataset
+        :return:
+        """
+        # extracted labels
+        if len(self.label_field) > 0:
+            if self.use_cache and exists(self.fig_labels_json_fid):
+                self.fig_labels = read_plotly(self.fig_labels_json_fid)
+            elif self.use_cache and exists(self.label_dset_fid):
+                # load extracted labels
+                self.label_dset = load_from_disk(self.label_dset_fid)
+                self.label_df = self.label_dset.to_pandas()
+                self.fig_labels = make_fig_labels(
+                    self.label_df, self.label_names, OUR_LABEL_FIELD
+                )
+                if save:
+                    write_plotly(self.fig_labels, self.fig_labels_json_fid)
+            else:
+                if not self.live:
+                    self.prepare_labels()
+                    if save:
+                        # save extracted label instances
+                        self.label_dset.save_to_disk(self.label_dset_fid)
+                        write_plotly(self.fig_labels, self.fig_labels_json_fid)
+    def prepare_labels(self):
+        if not self.live:
+            self.get_base_dataset()
+            self.label_dset = self.dset.map(
+                lambda examples: extract_field(
+                    examples, self.label_field, OUR_LABEL_FIELD
+                ),
+                batched=True,
+                remove_columns=list(self.dset.features),
+            )
+            self.label_df = self.label_dset.to_pandas()
+            self.fig_labels = make_fig_labels(
+                self.label_df, self.label_names, OUR_LABEL_FIELD
+            )
+    def load_or_prepare_npmi(self):
+        self.npmi_stats = nPMIStatisticsCacheClass(self, use_cache=self.use_cache)
+        self.npmi_stats.load_or_prepare_npmi_terms()
+    def load_or_prepare_zipf(self, save=True):
+        # TODO: Current UI only uses the fig, meaning the self.z here is irrelevant
+        # when only reading from cache. Either the UI should use it, or it should
+        # be removed when reading in cache
+        if self.use_cache and exists(self.zipf_fig_fid) and exists(self.zipf_fid):
+            with open(self.zipf_fid, "r") as f:
+                zipf_dict = json.load(f)
+            self.z = Zipf()
+            self.z.load(zipf_dict)
+            # TODO: Should this be cached?
+            self.zipf_counts = self.z.calc_zipf_counts(self.vocab_counts_df)
+            self.zipf_fig = read_plotly(self.zipf_fig_fid)
+        elif self.use_cache and exists(self.zipf_fid):
+            # TODO: Read zipf data so that the vocab is there.
+            with open(self.zipf_fid, "r") as f:
+                zipf_dict = json.load(f)
+            self.z = Zipf()
+            self.z.load(zipf_dict)
+            self.zipf_fig = make_zipf_fig(self.vocab_counts_df, self.z)
+            if save:
+                write_plotly(self.zipf_fig, self.zipf_fig_fid)
+        else:
+            self.z = Zipf(self.vocab_counts_df)
+            self.zipf_fig = make_zipf_fig(self.vocab_counts_df, self.z)
+            if save:
+                write_zipf_data(self.z, self.zipf_fid)
+                write_plotly(self.zipf_fig, self.zipf_fig_fid)
+    def _set_idx_col_names(self, input_vocab_df):
+        if input_vocab_df.index.name != VOCAB and VOCAB in input_vocab_df.columns:
+            input_vocab_df = input_vocab_df.set_index([VOCAB])
+            input_vocab_df[VOCAB] = input_vocab_df.index
+        return input_vocab_df
+class nPMIStatisticsCacheClass:
+    """ "Class to interface between the app and the nPMI class
+    by calling the nPMI class with the user's selections."""
+    def __init__(self, dataset_stats, use_cache=False):
+        self.live = dataset_stats.live
+        self.dstats = dataset_stats
+        self.pmi_cache_path = pjoin(self.dstats.cache_path, "pmi_files")
+        if not isdir(self.pmi_cache_path):
+            logs.warning("Creating pmi cache directory %s." % self.pmi_cache_path)
+            # We need to preprocess everything.
+            mkdir(self.pmi_cache_path)
+        self.joint_npmi_df_dict = {}
+        # TODO: Users ideally can type in whatever words they want.
+        self.termlist = _IDENTITY_TERMS
+        # termlist terms that are available more than _MIN_VOCAB_COUNT times
+        self.available_terms = _IDENTITY_TERMS
+        logs.info(self.termlist)
+        self.use_cache = use_cache
+        # TODO: Let users specify
+        self.open_class_only = True
+        self.min_vocab_count = self.dstats.min_vocab_count
+        self.subgroup_files = {}
+        self.npmi_terms_fid = pjoin(self.dstats.cache_path, "npmi_terms.json")
+    def load_or_prepare_npmi_terms(self):
+        """
+        Figures out what identity terms the user can select, based on whether
+        they occur more than self.min_vocab_count times
+        :return: Identity terms occurring at least self.min_vocab_count times.
+        """
+        # TODO: Add the user's ability to select subgroups.
+        # TODO: Make min_vocab_count here value selectable by the user.
+        if (
+            self.use_cache
+            and exists(self.npmi_terms_fid)
+            and json.load(open(self.npmi_terms_fid))["available terms"] != []
+        ):
+            self.available_terms = json.load(open(self.npmi_terms_fid))["available terms"]
+        else:
+            if not self.live:
+                if self.dstats.vocab_counts_df is None:
+                    self.dstats.load_or_prepare_vocab()
+                true_false = [
+                    term in self.dstats.vocab_counts_df.index for term in self.termlist
+                ]
+                word_list_tmp = [x for x, y in zip(self.termlist, true_false) if y]
+                true_false_counts = [
+                    self.dstats.vocab_counts_df.loc[word, CNT] >= self.min_vocab_count
+                    for word in word_list_tmp
+                ]
+                available_terms = [
+                    word for word, y in zip(word_list_tmp, true_false_counts) if y
+                ]
+                logs.info(available_terms)
+                with open(self.npmi_terms_fid, "w+") as f:
+                    json.dump({"available terms": available_terms}, f)
+            self.available_terms = available_terms
+        return self.available_terms
+    def load_or_prepare_joint_npmi(self, subgroup_pair, save=True):
+        """
+        Run on-the fly, while the app is already open,
+        as it depends on the subgroup terms that the user chooses
+        :param subgroup_pair:
+        :return:
+        """
+        # Canonical ordering for subgroup_list
+        subgroup_pair = sorted(subgroup_pair)
+        subgroup1 = subgroup_pair[0]
+        subgroup2 = subgroup_pair[1]
+        subgroups_str = "-".join(subgroup_pair)
+        if not isdir(self.pmi_cache_path):
+            logs.warning("Creating cache")
+            # We need to preprocess everything.
+            # This should eventually all go into a prepare_dataset CLI
+            mkdir(self.pmi_cache_path)
+        joint_npmi_fid = pjoin(self.pmi_cache_path, subgroups_str + "_npmi.csv")
+        subgroup_files = define_subgroup_files(subgroup_pair, self.pmi_cache_path)
+        # Defines the filenames for the cache files from the selected subgroups.
+        # Get as much precomputed data as we can.
+        if self.use_cache and exists(joint_npmi_fid):
+            # When everything is already computed for the selected subgroups.
+            logs.info("Loading cached joint npmi")
+            joint_npmi_df = self.load_joint_npmi_df(joint_npmi_fid)
+            npmi_display_cols = ['npmi-bias', subgroup1 + '-npmi', subgroup2 + '-npmi', subgroup1 + '-count', subgroup2 + '-count']
+            joint_npmi_df = joint_npmi_df[npmi_display_cols]
+            # When maybe some things have been computed for the selected subgroups.
+        else:
+            if not self.live:
+                logs.info("Preparing new joint npmi")
+                joint_npmi_df, subgroup_dict = self.prepare_joint_npmi_df(
+                    subgroup_pair, subgroup_files
+                )
+                if save:
+                    if joint_npmi_df is not None:
+                        # Cache new results
+                        logs.info("Writing out.")
+                        for subgroup in subgroup_pair:
+                            write_subgroup_npmi_data(subgroup, subgroup_dict, subgroup_files)
+                        with open(joint_npmi_fid, "w+") as f:
+                            joint_npmi_df.to_csv(f)
+            else:
+                joint_npmi_df = pd.DataFrame()
+        logs.info("The joint npmi df is")
+        logs.info(joint_npmi_df)
+        return joint_npmi_df
+    def load_joint_npmi_df(self, joint_npmi_fid):
+        """
+        Reads in a saved dataframe with all of the paired results.
+        :param joint_npmi_fid:
+        :return: paired results
+        """
+        with open(joint_npmi_fid, "rb") as f:
+            joint_npmi_df = pd.read_csv(f)
+        joint_npmi_df = self._set_idx_cols_from_cache(joint_npmi_df)
+        return joint_npmi_df.dropna()
+    def prepare_joint_npmi_df(self, subgroup_pair, subgroup_files):
+        """
+        Computs the npmi bias based on the given subgroups.
+        Handles cases where some of the selected subgroups have cached nPMI
+        computations, but other's don't, computing everything afresh if there
+        are not cached files.
+        :param subgroup_pair:
+        :return: Dataframe with nPMI for the words, nPMI bias between the words.
+        """
+        subgroup_dict = {}
+        # When npmi is computed for some (but not all) of subgroup_list
+        for subgroup in subgroup_pair:
+            logs.info("Load or failing...")
+            # When subgroup npmi has been computed in a prior session.
+            cached_results = self.load_or_fail_cached_npmi_scores(
+                subgroup, subgroup_files[subgroup]
+            )
+            # If the function did not return False and we did find it, use.
+            if cached_results:
+                # FYI: subgroup_cooc_df, subgroup_pmi_df, subgroup_npmi_df = cached_results
+                # Holds the previous sessions' data for use in this session.
+                subgroup_dict[subgroup] = cached_results
+        logs.info("Calculating for subgroup list")
+        joint_npmi_df, subgroup_dict = self.do_npmi(subgroup_pair, subgroup_dict)
+        return joint_npmi_df, subgroup_dict
+    # TODO: Update pairwise assumption
+    def do_npmi(self, subgroup_pair, subgroup_dict):
+        """
+        Calculates nPMI for given identity terms and the nPMI bias between.
+        :param subgroup_pair: List of identity terms to calculate the bias for
+        :return: Subset of data for the UI
+        :return: Selected identity term's co-occurrence counts with
+                 other words, pmi per word, and nPMI per word.
+        """
+        no_results = False
+        logs.info("Initializing npmi class")
+        npmi_obj = self.set_npmi_obj()
+        # Canonical ordering used
+        subgroup_pair = tuple(sorted(subgroup_pair))
+        # Calculating nPMI statistics
+        for subgroup in subgroup_pair:
+            # If the subgroup data is already computed, grab it.
+            # TODO: Should we set idx and column names similarly to
+            #  how we set them for cached files?
+            if subgroup not in subgroup_dict:
+                logs.info("Calculating statistics for %s" % subgroup)
+                vocab_cooc_df, pmi_df, npmi_df = npmi_obj.calc_metrics(subgroup)
+                if vocab_cooc_df is None:
+                    no_results = True
+                else:
+                    # Store the nPMI information for the current subgroups
+                    subgroup_dict[subgroup] = (vocab_cooc_df, pmi_df, npmi_df)
+        if no_results:
+            logs.warning("Couldn't grap the npmi files -- Under construction")
+            return None, None
+        else:
+            # Pair the subgroups together, indexed by all words that
+            # co-occur between them.
+            logs.info("Computing pairwise npmi bias")
+            paired_results = npmi_obj.calc_paired_metrics(subgroup_pair, subgroup_dict)
+            UI_results = make_npmi_fig(paired_results, subgroup_pair)
+        return UI_results.dropna(), subgroup_dict
+    def set_npmi_obj(self):
+        """
+        Initializes the nPMI class with the given words and tokenized sentences.
+        :return:
+        """
+        npmi_obj = nPMI(self.dstats.vocab_counts_df, self.dstats.tokenized_df)
+        return npmi_obj
+    def load_or_fail_cached_npmi_scores(self, subgroup, subgroup_fids):
+        """
+        Reads cached scores from the specified subgroup files
+        :param subgroup: string of the selected identity term
+        :return:
+        """
+        # TODO: Ordering of npmi, pmi, vocab triple should be consistent
+        subgroup_npmi_fid, subgroup_pmi_fid, subgroup_cooc_fid = subgroup_fids
+        if (
+            exists(subgroup_npmi_fid)
+            and exists(subgroup_pmi_fid)
+            and exists(subgroup_cooc_fid)
+        ):
+            logs.info("Reading in pmi data....")
+            with open(subgroup_cooc_fid, "rb") as f:
+                subgroup_cooc_df = pd.read_csv(f)
+            logs.info("pmi")
+            with open(subgroup_pmi_fid, "rb") as f:
+                subgroup_pmi_df = pd.read_csv(f)
+            logs.info("npmi")
+            with open(subgroup_npmi_fid, "rb") as f:
+                subgroup_npmi_df = pd.read_csv(f)
+            subgroup_cooc_df = self._set_idx_cols_from_cache(
+                subgroup_cooc_df, subgroup, "count"
+            )
+            subgroup_pmi_df = self._set_idx_cols_from_cache(
+                subgroup_pmi_df, subgroup, "pmi"
+            )
+            subgroup_npmi_df = self._set_idx_cols_from_cache(
+                subgroup_npmi_df, subgroup, "npmi"
+            )
+            return subgroup_cooc_df, subgroup_pmi_df, subgroup_npmi_df
+        return False
+    def _set_idx_cols_from_cache(self, csv_df, subgroup=None, calc_str=None):
+        """
+        Helps make sure all of the read-in files can be accessed within code
+        via standardized indices and column names.
+        :param csv_df:
+        :param subgroup:
+        :param calc_str:
+        :return:
+        """
+        # The csv saves with this column instead of the index, so that's weird.
+        if "Unnamed: 0" in csv_df.columns:
+            csv_df = csv_df.set_index("Unnamed: 0")
+            csv_df.index.name = WORD
+        elif WORD in csv_df.columns:
+            csv_df = csv_df.set_index(WORD)
+            csv_df.index.name = WORD
+        elif VOCAB in csv_df.columns:
+            csv_df = csv_df.set_index(VOCAB)
+            csv_df.index.name = WORD
+        if subgroup and calc_str:
+            csv_df.columns = [subgroup + "-" + calc_str]
+        elif subgroup:
+            csv_df.columns = [subgroup]
+        elif calc_str:
+            csv_df.columns = [calc_str]
+        return csv_df
+    def get_available_terms(self):
+        return self.load_or_prepare_npmi_terms()
+def dummy(doc):
+    return doc
+def count_vocab_frequencies(tokenized_df):
+    """
+    Based on an input pandas DataFrame with a 'text' column,
+    this function will count the occurrences of all words.
+    :return: [num_words x num_sentences] DataFrame with the rows corresponding to the
+    different vocabulary words and the column to the presence (0 or 1) of that word.
+    """
+    cvec = CountVectorizer(
+        tokenizer=dummy,
+        preprocessor=dummy,
+    )
+    # We do this to calculate per-word statistics
+    # Fast calculation of single word counts
+    logs.info("Fitting dummy tokenization to make matrix using the previous tokenization")
+    cvec.fit(tokenized_df[TOKENIZED_FIELD])
+    document_matrix = cvec.transform(tokenized_df[TOKENIZED_FIELD])
+    batches = np.linspace(0, tokenized_df.shape[0], _NUM_VOCAB_BATCHES).astype(int)
+    i = 0
+    tf = []
+    while i < len(batches) - 1:
+        logs.info("%s of %s vocab batches" % (str(i), str(len(batches))))
+        batch_result = np.sum(
+            document_matrix[batches[i] : batches[i + 1]].toarray(), axis=0
+        )
+        tf.append(batch_result)
+        i += 1
+    word_count_df = pd.DataFrame(
+        [np.sum(tf, axis=0)], columns=cvec.get_feature_names()
+    ).transpose()
+    # Now organize everything into the dataframes
+    word_count_df.columns = [CNT]
+    word_count_df.index.name = WORD
+    return word_count_df
+def calc_p_word(word_count_df):
+    # p(word)
+    word_count_df[PROP] = word_count_df[CNT] / float(sum(word_count_df[CNT]))
+    vocab_counts_df = pd.DataFrame(word_count_df.sort_values(by=CNT, ascending=False))
+    vocab_counts_df[VOCAB] = vocab_counts_df.index
+    return vocab_counts_df
+def filter_vocab(vocab_counts_df):
+    # TODO: Add warnings (which words are missing) to log file?
+    filtered_vocab_counts_df = vocab_counts_df.drop(_CLOSED_CLASS,
+                                                    errors="ignore")
+    filtered_count = filtered_vocab_counts_df[CNT]
+    filtered_count_denom = float(sum(filtered_vocab_counts_df[CNT]))
+    filtered_vocab_counts_df[PROP] = filtered_count / filtered_count_denom
+    return filtered_vocab_counts_df
+## Figures ##
+def write_plotly(fig, fid):
+    write_json(plotly.io.to_json(fig), fid)
+def read_plotly(fid):
+    fig = plotly.io.from_json(json.load(open(fid, encoding="utf-8")))
+    return fig
+def make_fig_lengths(tokenized_df, length_field):
+    fig_tok_length = px.histogram(
+        tokenized_df, x=length_field, marginal="rug", hover_data=[length_field]
+    )
+    return fig_tok_length
+def make_fig_labels(label_df, label_names, label_field):
+    labels = label_df[label_field].unique()
+    label_sums = [len(label_df[label_df[label_field] == label]) for label in labels]
+    fig_labels = px.pie(label_df, values=label_sums, names=label_names)
+    return fig_labels
+def make_zipf_fig_ranked_word_list(vocab_df, unique_counts, unique_ranks):
+    ranked_words = {}
+    for count, rank in zip(unique_counts, unique_ranks):
+        vocab_df[vocab_df[CNT] == count]["rank"] = rank
+        ranked_words[rank] = ",".join(
+            vocab_df[vocab_df[CNT] == count].index.astype(str)
+        )  # Use the hovertext kw argument for hover text
+    ranked_words_list = [wrds for rank, wrds in sorted(ranked_words.items())]
+    return ranked_words_list
+def make_npmi_fig(paired_results, subgroup_pair):
+    subgroup1, subgroup2 = subgroup_pair
+    UI_results = pd.DataFrame()
+    if "npmi-bias" in paired_results:
+        UI_results["npmi-bias"] = paired_results["npmi-bias"].astype(float)
+    UI_results[subgroup1 + "-npmi"] = paired_results["npmi"][
+        subgroup1 + "-npmi"
+    ].astype(float)
+    UI_results[subgroup1 + "-count"] = paired_results["count"][
+        subgroup1 + "-count"
+    ].astype(int)
+    if subgroup1 != subgroup2:
+        UI_results[subgroup2 + "-npmi"] = paired_results["npmi"][
+            subgroup2 + "-npmi"
+        ].astype(float)
+        UI_results[subgroup2 + "-count"] = paired_results["count"][
+            subgroup2 + "-count"
+        ].astype(int)
+    return UI_results.sort_values(by="npmi-bias", ascending=True)
+def make_zipf_fig(vocab_counts_df, z):
+    zipf_counts = z.calc_zipf_counts(vocab_counts_df)
+    unique_counts = z.uniq_counts
+    unique_ranks = z.uniq_ranks
+    ranked_words_list = make_zipf_fig_ranked_word_list(
+        vocab_counts_df, unique_counts, unique_ranks
+    )
+    zmin = z.get_xmin()
+    logs.info("zipf counts is")
+    logs.info(zipf_counts)
+    layout = go.Layout(xaxis=dict(range=[0, 100]))
+    fig = go.Figure(
+        data=[
+            go.Bar(
+                x=z.uniq_ranks,
+                y=z.uniq_counts,
+                hovertext=ranked_words_list,
+                name="Word Rank Frequency",
+            )
+        ],
+        layout=layout,
+    )
+    fig.add_trace(
+        go.Scatter(
+            x=z.uniq_ranks[zmin : len(z.uniq_ranks)],
+            y=zipf_counts[zmin : len(z.uniq_ranks)],
+            hovertext=ranked_words_list[zmin : len(z.uniq_ranks)],
+            line=go.scatter.Line(color="crimson", width=3),
+            name="Zipf Predicted Frequency",
+        )
+    )
+    # Customize aspect
+    # fig.update_traces(marker_color='limegreen',
+    #                  marker_line_width=1.5, opacity=0.6)
+    fig.update_layout(title_text="Word Counts, Observed and Predicted by Zipf")
+    fig.update_layout(xaxis_title="Word Rank")
+    fig.update_layout(yaxis_title="Frequency")
+    fig.update_layout(legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.10))
+    return fig
+def make_tree_plot(node_list, text_dset):
+    nid_map = dict([(node["nid"], nid) for nid, node in enumerate(node_list)])
+    for nid, node in enumerate(node_list):
+        node["label"] = node.get(
+            "label",
+            f"{nid:2d} - {node['weight']:5d} items <br>"
+            + "<br>".join(
+                [
+                    "> " + txt[:64] + ("..." if len(txt) >= 63 else "")
+                    for txt in list(
+                        set(text_dset.select(node["example_ids"])[OUR_TEXT_FIELD])
+                    )[:5]
+                ]
+            ),
+        )
+    # make plot nodes
+    # TODO: something more efficient than set to remove duplicates
+    labels = [node["label"] for node in node_list]
+    root = node_list[0]
+    root["X"] = 0
+    root["Y"] = 0
+    def rec_make_coordinates(node):
+        total_weight = 0
+        add_weight = len(node["example_ids"]) - sum(
+            [child["weight"] for child in node["children"]]
+        )
+        for child in node["children"]:
+            child["X"] = node["X"] + total_weight
+            child["Y"] = node["Y"] - 1
+            total_weight += child["weight"] + add_weight / len(node["children"])
+            rec_make_coordinates(child)
+    rec_make_coordinates(root)
+    E = []  # list of edges
+    Xn = []
+    Yn = []
+    Xe = []
+    Ye = []
+    for nid, node in enumerate(node_list):
+        Xn += [node["X"]]
+        Yn += [node["Y"]]
+        for child in node["children"]:
+            E += [(nid, nid_map[child["nid"]])]
+            Xe += [node["X"], child["X"], None]
+            Ye += [node["Y"], child["Y"], None]
+    # make figure
+    fig = go.Figure()
+    fig.add_trace(
+        go.Scatter(
+            x=Xe,
+            y=Ye,
+            mode="lines",
+            line=dict(color="rgb(210,210,210)", width=1),
+            hoverinfo="none",
+        )
+    )
+    fig.add_trace(
+        go.Scatter(
+            x=Xn,
+            y=Yn,
+            mode="markers",
+            name="nodes",
+            marker=dict(
+                symbol="circle-dot",
+                size=18,
+                color="#6175c1",
+                line=dict(color="rgb(50,50,50)", width=1)
+                # '#DB4551',
+            ),
+            text=labels,
+            hoverinfo="text",
+            opacity=0.8,
+        )
+    )
+    return fig
+## Input/Output ##
+def define_subgroup_files(subgroup_list, pmi_cache_path):
+    """
+    Sets the file ids for the input identity terms
+    :param subgroup_list: List of identity terms
+    :return:
+    """
+    subgroup_files = {}
+    for subgroup in subgroup_list:
+        # TODO: Should the pmi, npmi, and count just be one file?
+        subgroup_npmi_fid = pjoin(pmi_cache_path, subgroup + "_npmi.csv")
+        subgroup_pmi_fid = pjoin(pmi_cache_path, subgroup + "_pmi.csv")
+        subgroup_cooc_fid = pjoin(pmi_cache_path, subgroup + "_vocab_cooc.csv")
+        subgroup_files[subgroup] = (
+            subgroup_npmi_fid,
+            subgroup_pmi_fid,
+            subgroup_cooc_fid,
+        )
+    return subgroup_files
+## Input/Output ##
+def intersect_dfs(df_dict):
+    started = 0
+    new_df = None
+    for key, df in df_dict.items():
+        if df is None:
+            continue
+        for key2, df2 in df_dict.items():
+            if df2 is None:
+                continue
+            if key == key2:
+                continue
+            if started:
+                new_df = new_df.join(df2, how="inner", lsuffix="1", rsuffix="2")
+            else:
+                new_df = df.join(df2, how="inner", lsuffix="1", rsuffix="2")
+                started = 1
+    return new_df.copy()
+def write_df(df, df_fid):
+    feather.write_feather(df, df_fid)
+def write_json(json_dict, json_fid):
+    with open(json_fid, "w", encoding="utf-8") as f:
+        json.dump(json_dict, f)
+def write_subgroup_npmi_data(subgroup, subgroup_dict, subgroup_files):
+    """
+    Saves the calculated nPMI statistics to their output files.
+    Includes the npmi scores for each identity term, the pmi scores, and the
+    co-occurrence counts of the identity term with all the other words
+    :param subgroup: Identity term
+    :return:
+    """
+    subgroup_fids = subgroup_files[subgroup]
+    subgroup_npmi_fid, subgroup_pmi_fid, subgroup_cooc_fid = subgroup_fids
+    subgroup_dfs = subgroup_dict[subgroup]
+    subgroup_cooc_df, subgroup_pmi_df, subgroup_npmi_df = subgroup_dfs
+    with open(subgroup_npmi_fid, "w+") as f:
+        subgroup_npmi_df.to_csv(f)
+    with open(subgroup_pmi_fid, "w+") as f:
+        subgroup_pmi_df.to_csv(f)
+    with open(subgroup_cooc_fid, "w+") as f:
+        subgroup_cooc_df.to_csv(f)
+def write_zipf_data(z, zipf_fid):
+    zipf_dict = {}
+    zipf_dict["xmin"] = int(z.xmin)
+    zipf_dict["xmax"] = int(z.xmax)
+    zipf_dict["alpha"] = float(z.alpha)
+    zipf_dict["ks_distance"] = float(z.distance)
+    zipf_dict["p-value"] = float(z.ks_test.pvalue)
+    zipf_dict["uniq_counts"] = [int(count) for count in z.uniq_counts]
+    zipf_dict["uniq_ranks"] = [int(rank) for rank in z.uniq_ranks]
+    with open(zipf_fid, "w+", encoding="utf-8") as f:
+        json.dump(zipf_dict, f)