{ "cells": [ { "cell_type": "markdown", "source": [ "### Load packages" ], "metadata": { "id": "utSDkGUL101i" }, "id": "utSDkGUL101i" }, { "cell_type": "code", "execution_count": 48, "id": "34299990-bd58-4fe9-99fe-15d4b6796106", "metadata": { "id": "34299990-bd58-4fe9-99fe-15d4b6796106", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "e99e0fdc-27ee-4e6f-bc64-18f6127b9b3a" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Requirement already satisfied: datasets in /usr/local/lib/python3.10/dist-packages (2.19.1)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from datasets) (3.14.0)\n", "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from datasets) (1.25.2)\n", "Requirement already satisfied: pyarrow>=12.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (14.0.2)\n", "Requirement already satisfied: pyarrow-hotfix in /usr/local/lib/python3.10/dist-packages (from datasets) (0.6)\n", "Requirement already satisfied: dill<0.3.9,>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.3.8)\n", "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets) (2.0.3)\n", "Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (2.31.0)\n", "Requirement already satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.10/dist-packages (from datasets) (4.66.4)\n", "Requirement already satisfied: xxhash in /usr/local/lib/python3.10/dist-packages (from datasets) (3.4.1)\n", "Requirement already satisfied: multiprocess in /usr/local/lib/python3.10/dist-packages (from datasets) (0.70.16)\n", "Requirement already satisfied: fsspec[http]<=2024.3.1,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (2024.3.1)\n", "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets) (3.9.5)\n", "Requirement already satisfied: huggingface-hub>=0.21.2 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.23.1)\n", "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from datasets) (24.0)\n", "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from datasets) (6.0.1)\n", "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.3.1)\n", "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (23.2.0)\n", "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.4.1)\n", "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (6.0.5)\n", "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.9.4)\n", "Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (4.0.3)\n", "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.21.2->datasets) (4.11.0)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets) (3.3.2)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets) (3.7)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets) (2.0.7)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets) (2024.2.2)\n", "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2.8.2)\n", "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2023.4)\n", "Requirement already satisfied: tzdata>=2022.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2024.1)\n", "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.16.0)\n", "Requirement already satisfied: datatrove in /usr/local/lib/python3.10/dist-packages (0.2.0)\n", "Requirement already satisfied: dill>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from datatrove) (0.3.8)\n", "Requirement already satisfied: fsspec>=2023.12.2 in /usr/local/lib/python3.10/dist-packages (from datatrove) (2024.3.1)\n", "Requirement already satisfied: huggingface-hub>=0.17.0 in /usr/local/lib/python3.10/dist-packages (from datatrove) (0.23.1)\n", "Requirement already satisfied: humanize in /usr/local/lib/python3.10/dist-packages (from datatrove) (4.7.0)\n", "Requirement already satisfied: loguru>=0.7.0 in /usr/local/lib/python3.10/dist-packages (from datatrove) (0.7.2)\n", "Requirement already satisfied: multiprocess in /usr/local/lib/python3.10/dist-packages (from datatrove) (0.70.16)\n", "Requirement already satisfied: numpy>=1.25.0 in /usr/local/lib/python3.10/dist-packages (from datatrove) (1.25.2)\n", "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from datatrove) (4.66.4)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.17.0->datatrove) (3.14.0)\n", "Requirement already satisfied: packaging>=20.9 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.17.0->datatrove) (24.0)\n", "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.17.0->datatrove) (6.0.1)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.17.0->datatrove) (2.31.0)\n", "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.17.0->datatrove) (4.11.0)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.17.0->datatrove) (3.3.2)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.17.0->datatrove) (3.7)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.17.0->datatrove) (2.0.7)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.17.0->datatrove) (2024.2.2)\n" ] } ], "source": [ "!pip install datasets\n", "!pip install datatrove\n", "import datasets\n", "import json\n", "import numpy as np\n", "import pandas as pd\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from datatrove.pipeline.readers import ParquetReader" ] }, { "cell_type": "code", "execution_count": 49, "id": "922a0454", "metadata": { "id": "922a0454", "outputId": "8500a12a-6856-46ac-bb65-6f86db4bb001", "colab": { "base_uri": "https://localhost:8080/" } }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "The rich extension is already loaded. To reload it, use:\n", " %reload_ext rich\n" ] } ], "source": [ "%load_ext rich" ] }, { "cell_type": "markdown", "id": "703c7781-0a33-41dc-8da9-2fa034483cad", "metadata": { "id": "703c7781-0a33-41dc-8da9-2fa034483cad" }, "source": [ "## Methodology\n", "\n", "In order to measure bias in the dataset, we consider the following simple [TF-IDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf) based approach. The idea is that the specificity of a term -- in our case, how `biased` it is -- can be quantified as an inverse function of the number of documents in which it occurs.\n", "\n", "Given a dataset and terms for a subpopulation (gender) of interest:\n", "1. Evaluate Inverse Document Frequencies on the full dataset\n", "2. Compute the average TF-IDF vectors for the dataset for a given subpopulation (gender)\n", "3. Sort the terms by variance to see words that are much more likely to appear specifically for a given subpopulation\n", "\n", "\n" ] }, { "cell_type": "markdown", "id": "7c837c65-987f-45cf-b18d-fc7836894372", "metadata": { "id": "7c837c65-987f-45cf-b18d-fc7836894372" }, "source": [ "### Load Fineweb\n" ] }, { "cell_type": "code", "execution_count": null, "id": "dbd19018", "metadata": { "id": "dbd19018", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "2852efb2-954f-460f-d143-18baa0408973" }, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "\u001b[32m2024-05-29 19:38:01.457\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mdatatrove.pipeline.readers.base\u001b[0m:\u001b[36mread_files_shard\u001b[0m:\u001b[36m193\u001b[0m - \u001b[1mReading input file 000_00000.parquet\u001b[0m\n" ] } ], "source": [ "local = False\n", "data_reader = ParquetReader(\"hf://datasets/HuggingFaceFW/fineweb/sample/10BT\")\n", "all_docs = [document.text for document in data_reader()]" ] }, { "cell_type": "markdown", "source": [ "### Compute frequencies" ], "metadata": { "id": "eBj1TtiW2C-6" }, "id": "eBj1TtiW2C-6" }, { "cell_type": "code", "source": [ "# Step 1: get Inverse document frequencies for the dataset\n", "vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')\n", "full_tfidf = vectorizer.fit_transform(all_docs)\n", "tfidf_feature_names = np.array(vectorizer.get_feature_names_out())" ], "metadata": { "id": "e_nQogiWceYZ" }, "id": "e_nQogiWceYZ", "execution_count": 50, "outputs": [] }, { "cell_type": "markdown", "source": [ "### Bias analysis: Gender tf-idf" ], "metadata": { "id": "aqIybwilj0KH" }, "id": "aqIybwilj0KH" }, { "cell_type": "code", "source": [ "# Step 2: get average TF-IDF vectors **for each gender**\n", "GENDER_PHRASES = [\"man\", \"woman\"]\n", "tfidf_by_gender = {}\n", "for phrase in GENDER_PHRASES:\n", " gdr_docs = [doc for doc in all_docs if phrase in doc.split()]\n", " if gdr_docs != []:\n", " gdr_tfidf = np.asarray(vectorizer.transform(gdr_docs).mean(axis=0))[0]\n", " tfidf_by_gender[phrase] = gdr_tfidf" ], "metadata": { "id": "d-Na79jvczt0" }, "id": "d-Na79jvczt0", "execution_count": 51, "outputs": [] }, { "cell_type": "code", "source": [ "# Step 3: for each term, compute the variance across genders\n", "all_tfidf = np.array(list(tfidf_by_gender.values()))\n", "tf_idf_var = all_tfidf - all_tfidf.sum(axis=0, keepdims=True)\n", "tf_idf_var = np.power((tf_idf_var * tf_idf_var).sum(axis=0), 0.5)\n", "sort_by_variance = tf_idf_var.argsort()[::-1]" ], "metadata": { "id": "D0sbbLyWw2CZ" }, "id": "D0sbbLyWw2CZ", "execution_count": 52, "outputs": [] }, { "cell_type": "code", "execution_count": 53, "id": "03393fe5-2a92-451a-bd08-6a27a6239097", "metadata": { "id": "03393fe5-2a92-451a-bd08-6a27a6239097" }, "outputs": [], "source": [ "# Create the data structure for the visualization,\n", "# showing the highest variance words for each gender,\n", "# and how they deviate from the mean\n", "pre_pandas_lines = [\n", " {\n", " \"word\": tfidf_feature_names[w],\n", " \"man\": all_tfidf[0, w],\n", " \"woman\": all_tfidf[1, w],\n", " \"man+\": all_tfidf[0, w] - all_tfidf[:, w].mean(),\n", " \"woman+\": all_tfidf[1, w] - all_tfidf[:, w].mean(),\n", " \"variance\": tf_idf_var[w],\n", " \"total\": all_tfidf[:, w].sum(),\n", " }\n", " for w in sort_by_variance[:50]\n", "]" ] }, { "cell_type": "markdown", "source": [ "### Results" ], "metadata": { "id": "IhJC-iT91smy" }, "id": "IhJC-iT91smy" }, { "cell_type": "code", "source": [ "# Plot\n", "df = pd.DataFrame.from_dict(pre_pandas_lines)\n", "df.style.background_gradient(\n", " axis=None,\n", " vmin=0,\n", " vmax=0.2,\n", " cmap=\"YlGnBu\"\n", ").format(precision=2)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 1000 }, "id": "LDLjFa6HdMWe", "outputId": "d012172a-4c03-4505-83c6-7bd6c3c77a91" }, "id": "LDLjFa6HdMWe", "execution_count": 47, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [], "text/html": [ "
\n"
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "\u001b[1m<\u001b[0m\u001b[1;95mpandas.io.formats.style.Styler\u001b[0m\u001b[39m object at \u001b[0m\u001b[1;36m0x7b89700eb340\u001b[0m\u001b[1m>\u001b[0m"
            ],
            "text/html": [
              "\n",
              "\n",
              "  \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "  \n",
              "  \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "  \n",
              "
 wordmanwomanman+woman+variancetotal
0woman0.010.07-0.030.030.070.08
1man0.050.020.01-0.010.050.07
2women0.010.04-0.010.010.040.06
3said0.030.010.01-0.010.030.05
4people0.020.020.00-0.000.030.04
5tsa0.010.03-0.010.010.030.04
6life0.030.010.01-0.010.030.04
7just0.020.020.00-0.000.030.04
8police0.020.020.00-0.000.030.04
9god0.020.020.00-0.000.030.04
10like0.020.020.00-0.000.030.04
11cancer0.000.03-0.010.010.030.03
12marriage0.020.02-0.000.000.030.04
13time0.020.020.00-0.000.030.04
14mouse0.000.03-0.010.010.030.03
15rudy0.010.02-0.010.010.020.03
16gangnam0.010.02-0.010.010.020.03
17medical0.000.02-0.010.010.020.03
18world0.010.02-0.000.000.020.03
19work0.010.02-0.000.000.020.03
20make0.020.010.00-0.000.020.03
21think0.020.010.00-0.000.020.03
22palin0.010.02-0.000.000.020.03
23john0.010.02-0.000.000.020.03
24surgery0.000.02-0.010.010.020.02
25anderson0.000.02-0.010.010.020.02
26day0.010.010.00-0.000.020.03
27gregory0.010.02-0.000.000.020.03
28st0.010.02-0.000.000.020.03
29hermit0.010.02-0.000.000.020.03
30says0.010.010.00-0.000.020.03
31know0.010.010.00-0.000.020.03
32use0.010.02-0.010.010.020.03
33plus0.000.02-0.010.010.020.02
34size0.000.02-0.010.010.020.02
35year0.010.010.00-0.000.020.03
36don0.010.01-0.000.000.020.03
37died0.010.02-0.010.010.020.02
38left0.010.01-0.000.000.020.03
39did0.010.01-0.000.000.020.03
40white0.000.02-0.010.010.020.02
41right0.020.010.00-0.000.020.03
42wife0.010.02-0.000.000.020.02
43sir0.010.02-0.000.000.020.03
44way0.020.010.00-0.000.020.03
45great0.010.010.00-0.000.020.03
46city0.010.01-0.000.000.020.03
47korean0.010.01-0.000.000.020.03
48camera0.000.02-0.010.010.020.02
49place0.020.010.00-0.000.020.02
\n" ] }, "metadata": {}, "execution_count": 47 } ] }, { "cell_type": "markdown", "id": "e273abff-3d81-431f-9188-82d87d1ecda2", "metadata": { "id": "e273abff-3d81-431f-9188-82d87d1ecda2" }, "source": [ "#### Sorting by bias\n", "\n", "In order to better surface biases, we can sort the table by how much one gender over-represents a term.\n", "\n", "In this case, we see that instances mentioning `man` are more likely to include `god` than those mentioning `woman`, which in turn are more likely to include `cancer`." ] }, { "cell_type": "code", "execution_count": 45, "id": "34229f06-5bf7-4ece-b43e-7d453931abd4", "metadata": { "id": "34229f06-5bf7-4ece-b43e-7d453931abd4", "outputId": "7720b46d-a37d-4007-aa8e-8d7973f4f91c", "colab": { "base_uri": "https://localhost:8080/", "height": 1000 }, "collapsed": true }, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [], "text/html": [ "
\n"
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "\u001b[1m<\u001b[0m\u001b[1;95mpandas.io.formats.style.Styler\u001b[0m\u001b[39m object at \u001b[0m\u001b[1;36m0x7b89700eac20\u001b[0m\u001b[1m>\u001b[0m"
            ],
            "text/html": [
              "\n",
              "\n",
              "  \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "  \n",
              "  \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "  \n",
              "
 wordmanwomanman+woman+variancetotal
1man0.050.020.01-0.010.050.07
3said0.030.010.01-0.010.030.05
6life0.030.010.01-0.010.030.04
9god0.020.020.00-0.000.030.04
7just0.020.020.00-0.000.030.04
10like0.020.020.00-0.000.030.04
44way0.020.010.00-0.000.020.03
21think0.020.010.00-0.000.020.03
49place0.020.010.00-0.000.020.02
41right0.020.010.00-0.000.020.03
13time0.020.020.00-0.000.030.04
35year0.010.010.00-0.000.020.03
31know0.010.010.00-0.000.020.03
20make0.020.010.00-0.000.020.03
4people0.020.020.00-0.000.030.04
8police0.020.020.00-0.000.030.04
26day0.010.010.00-0.000.020.03
30says0.010.010.00-0.000.020.03
45great0.010.010.00-0.000.020.03
46city0.010.01-0.000.000.020.03
39did0.010.01-0.000.000.020.03
36don0.010.01-0.000.000.020.03
28st0.010.02-0.000.000.020.03
38left0.010.01-0.000.000.020.03
23john0.010.02-0.000.000.020.03
18world0.010.02-0.000.000.020.03
47korean0.010.01-0.000.000.020.03
43sir0.010.02-0.000.000.020.03
12marriage0.020.02-0.000.000.030.04
19work0.010.02-0.000.000.020.03
29hermit0.010.02-0.000.000.020.03
27gregory0.010.02-0.000.000.020.03
22palin0.010.02-0.000.000.020.03
42wife0.010.02-0.000.000.020.02
16gangnam0.010.02-0.010.010.020.03
15rudy0.010.02-0.010.010.020.03
32use0.010.02-0.010.010.020.03
37died0.010.02-0.010.010.020.02
5tsa0.010.03-0.010.010.030.04
40white0.000.02-0.010.010.020.02
34size0.000.02-0.010.010.020.02
48camera0.000.02-0.010.010.020.02
33plus0.000.02-0.010.010.020.02
17medical0.000.02-0.010.010.020.03
24surgery0.000.02-0.010.010.020.02
25anderson0.000.02-0.010.010.020.02
14mouse0.000.03-0.010.010.030.03
2women0.010.04-0.010.010.040.06
11cancer0.000.03-0.010.010.030.03
0woman0.010.07-0.030.030.070.08
\n" ] }, "metadata": {}, "execution_count": 45 } ], "source": [ "df.sort_values('man+', ascending=False).style.background_gradient(\n", " axis=None,\n", " vmin=0,\n", " vmax=0.2,\n", " cmap=\"YlGnBu\"\n", ").format(precision=2)" ] }, { "cell_type": "code", "source": [ "df.sort_values('woman+', ascending=False).style.background_gradient(\n", " axis=None,\n", " vmin=0,\n", " vmax=0.2,\n", " cmap=\"YlGnBu\"\n", ").format(precision=2)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 1000 }, "id": "ufATwOCojOdv", "outputId": "299fdb81-a754-4afe-b0fd-5be8aac8c549" }, "id": "ufATwOCojOdv", "execution_count": 46, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [], "text/html": [ "
\n"
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "\u001b[1m<\u001b[0m\u001b[1;95mpandas.io.formats.style.Styler\u001b[0m\u001b[39m object at \u001b[0m\u001b[1;36m0x7b89700eab60\u001b[0m\u001b[1m>\u001b[0m"
            ],
            "text/html": [
              "\n",
              "\n",
              "  \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "  \n",
              "  \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "    \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "      \n",
              "    \n",
              "  \n",
              "
 wordmanwomanman+woman+variancetotal
0woman0.010.07-0.030.030.070.08
11cancer0.000.03-0.010.010.030.03
2women0.010.04-0.010.010.040.06
14mouse0.000.03-0.010.010.030.03
25anderson0.000.02-0.010.010.020.02
24surgery0.000.02-0.010.010.020.02
17medical0.000.02-0.010.010.020.03
33plus0.000.02-0.010.010.020.02
48camera0.000.02-0.010.010.020.02
34size0.000.02-0.010.010.020.02
40white0.000.02-0.010.010.020.02
5tsa0.010.03-0.010.010.030.04
37died0.010.02-0.010.010.020.02
32use0.010.02-0.010.010.020.03
15rudy0.010.02-0.010.010.020.03
16gangnam0.010.02-0.010.010.020.03
42wife0.010.02-0.000.000.020.02
22palin0.010.02-0.000.000.020.03
27gregory0.010.02-0.000.000.020.03
29hermit0.010.02-0.000.000.020.03
19work0.010.02-0.000.000.020.03
12marriage0.020.02-0.000.000.030.04
43sir0.010.02-0.000.000.020.03
47korean0.010.01-0.000.000.020.03
18world0.010.02-0.000.000.020.03
23john0.010.02-0.000.000.020.03
38left0.010.01-0.000.000.020.03
28st0.010.02-0.000.000.020.03
36don0.010.01-0.000.000.020.03
39did0.010.01-0.000.000.020.03
46city0.010.01-0.000.000.020.03
45great0.010.010.00-0.000.020.03
30says0.010.010.00-0.000.020.03
26day0.010.010.00-0.000.020.03
8police0.020.020.00-0.000.030.04
4people0.020.020.00-0.000.030.04
20make0.020.010.00-0.000.020.03
31know0.010.010.00-0.000.020.03
35year0.010.010.00-0.000.020.03
13time0.020.020.00-0.000.030.04
41right0.020.010.00-0.000.020.03
49place0.020.010.00-0.000.020.02
21think0.020.010.00-0.000.020.03
44way0.020.010.00-0.000.020.03
10like0.020.020.00-0.000.030.04
7just0.020.020.00-0.000.030.04
9god0.020.020.00-0.000.030.04
6life0.030.010.01-0.010.030.04
3said0.030.010.01-0.010.030.05
1man0.050.020.01-0.010.050.07
\n" ] }, "metadata": {}, "execution_count": 46 } ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.1" }, "colab": { "provenance": [] } }, "nbformat": 4, "nbformat_minor": 5 }