# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

# Suppress UserWarnings (raised, e.g., when we drop closed-class words);
# might be nicer to route these to the log instead.
warnings.filterwarnings(action="ignore", category=UserWarning)
# Ignore divide-by-zero warnings from taking log(0).
np.seterr(divide="ignore")

# Treat inf values as NaN as well.
pd.set_option("use_inf_as_na", True)

logs = logging.getLogger(__name__)
logs.setLevel(logging.INFO)
logs.propagate = False

if not logs.handlers:
    Path("./log_files").mkdir(exist_ok=True)

    # Logging info to log file
    file = logging.FileHandler("./log_files/npmi.log")
    fileformat = logging.Formatter("%(asctime)s:%(message)s")
    file.setLevel(logging.INFO)
    file.setFormatter(fileformat)

    # Logging debug messages to stream
    stream = logging.StreamHandler()
    streamformat = logging.Formatter("[data_measurements_tool] %(message)s")
    stream.setLevel(logging.WARNING)
    stream.setFormatter(streamformat)

    logs.addHandler(file)
    logs.addHandler(stream)

_NUM_BATCHES = 500


class nPMI:
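    """
    Computes pointwise mutual information (PMI) and normalized PMI (nPMI)
    between a subgroup term and every word in the vocabulary, based on their
    co-occurrence within sentences.

    Expected inputs:
      - vocab_counts_df: DataFrame indexed by vocabulary word, with at least
        "count" and "proportion" columns.
      - tokenized_df: DataFrame with a column of tokenized sentences (lists of
        vocab words), named by `tokenized_col_name`.
    """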
    # TODO: Expand beyond pairwise
    def __init__(
        self,
        vocab_counts_df,
        tokenized_df,
        tokenized_col_name="tokenized_text",
        num_batches=_NUM_BATCHES,
    ):
        logs.info("Initiating npmi class.")
        logs.info("vocab is")
        logs.info(vocab_counts_df)
        self.vocab_counts_df = vocab_counts_df
        logs.info("tokenized is")
        self.tokenized_df = tokenized_df
        logs.info(self.tokenized_df)
        self.tokenized_col_name = tokenized_col_name
        self.num_batches = num_batches
        # self.mlb_list holds one binarized (batch_size x vocab_size) matrix per batch.
        self.mlb_list = []

    def binarize_words_in_sentence(self):
        logs.info("Creating co-occurrence matrix for PMI calculations.")
        batches = np.linspace(0, self.tokenized_df.shape[0], self.num_batches).astype(int)
        i = 0
        # Builds a list with one binarized matrix per batch.
        while i < len(batches) - 1:
            # Makes a binary matrix (shape: # sentences in batch x # vocab words),
            # marking the occurrence of each word per sentence.
            mlb = MultiLabelBinarizer(classes=self.vocab_counts_df.index)
            logs.info(
                "%s of %s sentence binarize batches." % (str(i), str(len(batches)))
            )
            # Returns a binary array: batch size x num_words
            mlb_series = mlb.fit_transform(
                self.tokenized_df[self.tokenized_col_name][batches[i] : batches[i + 1]]
            )
            i += 1
            self.mlb_list.append(mlb_series)

    def calc_cooccurrences(self, subgroup, subgroup_idx):
        initialize = True
        coo_df = None
        # Big computation here!  Should only happen once.
        logs.info(
            "Approaching big computation! Here, we binarize all words in the sentences, making a sparse matrix of sentences."
        )
        if not self.mlb_list:
            self.binarize_words_in_sentence()
        for batch_id in range(len(self.mlb_list)):
            logs.info(
                "%s of %s co-occurrence count batches"
                % (str(batch_id), str(len(self.mlb_list)))
            )
            # Binarized (sentences x vocab) matrix for this batch
            batch_sentence_row = self.mlb_list[batch_id]
            # Dataframe of # sentences in batch x vocabulary size
            sent_batch_df = pd.DataFrame(batch_sentence_row)
            # logs.info('sent batch df is')
            # logs.info(sent_batch_df)
            # Subgroup counts per-sentence for the given batch
            subgroup_df = sent_batch_df[subgroup_idx]
            subgroup_df.columns = [subgroup]
            # Remove the sentences where the count of the subgroup is 0,
            # which reduces computation and resource needs.
            subgroup_df = subgroup_df[subgroup_df > 0]
            logs.info("Removing 0 counts, subgroup_df is")
            logs.info(subgroup_df)
            mlb_subgroup_only = sent_batch_df[sent_batch_df[subgroup_idx] > 0]
            logs.info("mlb subgroup only is")
            logs.info(mlb_subgroup_only)
            # Create cooccurrence matrix for the given subgroup and all words.
            logs.info("Now we do the T.dot approach for co-occurrences")
            batch_coo_df = pd.DataFrame(mlb_subgroup_only.T.dot(subgroup_df))

            # Add this batch's co-occurrence counts to the running total.
            if initialize:
                coo_df = batch_coo_df
            else:
                coo_df = coo_df.add(batch_coo_df, fill_value=0)
            logs.info("coo_df is")
            logs.info(coo_df)
            initialize = False
        logs.info("Returning co-occurrence matrix")
        logs.info(coo_df)
        return pd.DataFrame(coo_df)

    def calc_paired_metrics(self, subgroup_pair, subgroup_npmi_dict):
        """
        Calculates nPMI metrics between paired subgroups.
        Special handling for a subgroup paired with itself.
        :param subgroup_pair: Pair of subgroup terms (sorted into canonical order).
        :param subgroup_npmi_dict: Dict mapping each subgroup to its
            (vocab_cooc_df, pmi_df, npmi_df) tuple from calc_metrics.
        :return: Dict with "npmi-bias", "npmi", "pmi", and "count" entries for the pair.
        """
        paired_results_dict = {"npmi": {}, "pmi": {}, "count": {}}
        # Canonical ordering. This is done previously, but just in case...
        subgroup1, subgroup2 = sorted(subgroup_pair)
        vocab_cooc_df1, pmi_df1, npmi_df1 = subgroup_npmi_dict[subgroup1]
        logs.info("vocab cooc")
        logs.info(vocab_cooc_df1)
        if subgroup1 == subgroup2:
            shared_npmi_df = npmi_df1
            shared_pmi_df = pmi_df1
            shared_vocab_cooc_df = vocab_cooc_df1
        else:
            vocab_cooc_df2, pmi_df2, npmi_df2 = subgroup_npmi_dict[subgroup2]
            logs.info("vocab cooc2")
            logs.info(vocab_cooc_df2)
            # Note that lsuffix and rsuffix should not come into play.
            shared_npmi_df = npmi_df1.join(
                npmi_df2, how="inner", lsuffix="1", rsuffix="2"
            )
            shared_pmi_df = pmi_df1.join(pmi_df2, how="inner", lsuffix="1", rsuffix="2")
            shared_vocab_cooc_df = vocab_cooc_df1.join(
                vocab_cooc_df2, how="inner", lsuffix="1", rsuffix="2"
            )
            shared_vocab_cooc_df = shared_vocab_cooc_df.dropna()
            shared_vocab_cooc_df = shared_vocab_cooc_df[
                shared_vocab_cooc_df.index.notnull()
            ]
            logs.info("shared npmi df")
            logs.info(shared_npmi_df)
            logs.info("shared vocab df")
            logs.info(shared_vocab_cooc_df)
        npmi_bias = (
            shared_npmi_df[subgroup1 + "-npmi"] - shared_npmi_df[subgroup2 + "-npmi"]
        )
        paired_results_dict["npmi-bias"] = npmi_bias.dropna()
        paired_results_dict["npmi"] = shared_npmi_df.dropna()
        paired_results_dict["pmi"] = shared_pmi_df.dropna()
        paired_results_dict["count"] = shared_vocab_cooc_df.dropna()
        return paired_results_dict

    def calc_metrics(self, subgroup):
        # Index of the subgroup word in the sparse vector
        subgroup_idx = self.vocab_counts_df.index.get_loc(subgroup)
        logs.info("Calculating co-occurrences...")
        df_coo = self.calc_cooccurrences(subgroup, subgroup_idx)
        vocab_cooc_df = self.set_idx_cols(df_coo, subgroup)
        logs.info(vocab_cooc_df)
        logs.info("Calculating PMI...")
        pmi_df = self.calc_PMI(vocab_cooc_df, subgroup)
        logs.info(pmi_df)
        logs.info("Calculating nPMI...")
        npmi_df = self.calc_nPMI(pmi_df, vocab_cooc_df, subgroup)
        logs.info(npmi_df)
        return vocab_cooc_df, pmi_df, npmi_df

    def set_idx_cols(self, df_coo, subgroup):
        """
        :param df_coo: Co-occurrence counts for the subgroup; length is num_words.
        :param subgroup: The subgroup term, used to name the count column.
        :return: DataFrame indexed by vocab word with an integer "<subgroup>-count" column.
        """
        count_df = df_coo.set_index(self.vocab_counts_df.index)
        count_df.columns = [subgroup + "-count"]
        count_df[subgroup + "-count"] = count_df[subgroup + "-count"].astype(int)
        return count_df

    def calc_PMI(self, vocab_cooc_df, subgroup):
        """
        # PMI(x;y) = h(y) - h(y|x)
        #          = h(subgroup) - h(subgroup|word)
        #          = log (p(subgroup|word) / p(subgroup))
        # nPMI additionally divides by -log(p(x,y)) = -log(p(x|y)p(y))
        """
        # Calculation of p(subgroup)
        subgroup_prob = self.vocab_counts_df.loc[subgroup]["proportion"]
        # Calculation of p(subgroup|word) = count(subgroup,word) / count(word)
        # Because the indices match (the vocab words),
        # this division aligns on the index automatically.
        p_subgroup_g_word = (
            vocab_cooc_df[subgroup + "-count"] / self.vocab_counts_df["count"]
        )
        logs.info("p_subgroup_g_word is")
        logs.info(p_subgroup_g_word)
        pmi_df = pd.DataFrame()
        pmi_df[subgroup + "-pmi"] = np.log(p_subgroup_g_word / subgroup_prob)
        # Note: A potentially faster solution for adding count, npmi,
        # can be based on this zip idea:
        # df_test['size_kb'],  df_test['size_mb'], df_test['size_gb'] =
        # zip(*df_test['size'].apply(sizes))
        return pmi_df.dropna()

    def calc_nPMI(self, pmi_df, vocab_cooc_df, subgroup):
        """
        # nPMI additionally divides by -log(p(x,y)) = -log(p(x|y)p(y))
        #                                           = -log(p(word|subgroup)p(word))
        """
        p_word_g_subgroup = vocab_cooc_df[subgroup + "-count"] / sum(
            vocab_cooc_df[subgroup + "-count"]
        )
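        # p(word): overall proportion of each word, looked up from vocab_counts_df
        # by the row's vocab index.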
        p_word = pmi_df.apply(
            lambda x: self.vocab_counts_df.loc[x.name]["proportion"], axis=1
        )
        normalize_pmi = -np.log(p_word_g_subgroup * p_word)
        npmi_df = pd.DataFrame()
        npmi_df[subgroup + "-npmi"] = pmi_df[subgroup + "-pmi"] / normalize_pmi
        return npmi_df.dropna()
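

# A minimal usage sketch (not part of the original module), assuming toy data:
# it builds vocab_counts_df / tokenized_df inputs in the shape the class
# expects and runs calc_metrics / calc_paired_metrics for two illustrative
# subgroup terms. The corpus, subgroup choices, and num_batches value are
# assumptions for demonstration only.
if __name__ == "__main__":
    sentences = [
        ["she", "is", "a", "doctor"],
        ["he", "is", "a", "nurse"],
        ["she", "is", "a", "nurse"],
        ["he", "is", "a", "doctor"],
        ["she", "writes", "code"],
        ["he", "writes", "code"],
    ]
    tokenized_df = pd.DataFrame({"tokenized_text": sentences})

    # Vocabulary counts over the toy corpus, with the "count" and "proportion"
    # columns that the class reads.
    all_words = [word for sentence in sentences for word in sentence]
    counts = pd.Series(all_words).value_counts()
    vocab_counts_df = pd.DataFrame(
        {"count": counts, "proportion": counts / counts.sum()}
    )

    # Use a small number of batches for the tiny corpus.
    npmi_obj = nPMI(vocab_counts_df, tokenized_df, num_batches=3)
    subgroup_npmi_dict = {}
    for subgroup in ["she", "he"]:
        subgroup_npmi_dict[subgroup] = npmi_obj.calc_metrics(subgroup)

    paired_results = npmi_obj.calc_paired_metrics(("she", "he"), subgroup_npmi_dict)
    print(paired_results["npmi-bias"])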