diff --git "a/Final file.ipynb" "b/Final file.ipynb" new file mode 100644--- /dev/null +++ "b/Final file.ipynb" @@ -0,0 +1,1452 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "gczeIWL7Yqml", + "outputId": "353b3804-fb2b-4ead-d190-69cc7ef11ea6" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Collecting datasets\n", + " Downloading datasets-2.18.0-py3-none-any.whl (510 kB)\n", + "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/510.5 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[91m━━━━\u001b[0m\u001b[91m╸\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m61.4/510.5 kB\u001b[0m \u001b[31m1.7 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[90m╺\u001b[0m \u001b[32m501.8/510.5 kB\u001b[0m \u001b[31m7.3 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m510.5/510.5 kB\u001b[0m \u001b[31m5.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from datasets) (3.13.1)\n", + "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from datasets) (1.25.2)\n", + "Requirement already satisfied: pyarrow>=12.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (14.0.2)\n", + "Requirement already satisfied: pyarrow-hotfix in /usr/local/lib/python3.10/dist-packages (from datasets) (0.6)\n", + "Collecting dill<0.3.9,>=0.3.0 (from datasets)\n", + " Downloading dill-0.3.8-py3-none-any.whl (116 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m7.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets) (1.5.3)\n", + "Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (2.31.0)\n", + "Requirement already satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.10/dist-packages (from datasets) (4.66.2)\n", + "Collecting xxhash (from datasets)\n", + " Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m7.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting multiprocess (from datasets)\n", + " Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m4.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: fsspec[http]<=2024.2.0,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (2023.6.0)\n", + "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets) (3.9.3)\n", + "Requirement already satisfied: huggingface-hub>=0.19.4 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.20.3)\n", + "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from datasets) (24.0)\n", + "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from datasets) (6.0.1)\n", + "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.3.1)\n", + "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (23.2.0)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.4.1)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (6.0.5)\n", + "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.9.4)\n", + "Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (4.0.3)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.19.4->datasets) (4.10.0)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets) (3.3.2)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets) (3.6)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets) (2.0.7)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets) (2024.2.2)\n", + "Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2023.4)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.1->pandas->datasets) (1.16.0)\n", + "Installing collected packages: xxhash, dill, multiprocess, datasets\n", + "Successfully installed datasets-2.18.0 dill-0.3.8 multiprocess-0.70.16 xxhash-3.4.1\n", + "--2024-03-16 03:52:00-- https://raw.githubusercontent.com/sighsmile/conlleval/master/conlleval.py\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 7502 (7.3K) [text/plain]\n", + "Saving to: ‘conlleval.py’\n", + "\n", + "conlleval.py 100%[===================>] 7.33K --.-KB/s in 0s \n", + "\n", + "2024-03-16 03:52:00 (96.5 MB/s) - ‘conlleval.py’ saved [7502/7502]\n", + "\n" + ] + } + ], + "source": [ + "!pip3 install datasets\n", + "!wget https://raw.githubusercontent.com/sighsmile/conlleval/master/conlleval.py\n" + ] + }, + { + "cell_type": "code", + "source": [ + "!pip install presidio-analyzer" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "x5eLSkVqlhh6", + "outputId": "9cf46693-5e60-425d-8693-22a5df24fea0" + }, + "execution_count": 36, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Collecting presidio-analyzer\n", + " Downloading presidio_analyzer-2.2.353-py3-none-any.whl (85 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m85.7/85.7 kB\u001b[0m \u001b[31m2.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: spacy<4.0.0,>=3.4.4 in /usr/local/lib/python3.10/dist-packages (from presidio-analyzer) (3.7.4)\n", + "Requirement already satisfied: regex in /usr/local/lib/python3.10/dist-packages (from presidio-analyzer) (2023.12.25)\n", + "Collecting tldextract (from presidio-analyzer)\n", + " Downloading tldextract-5.1.1-py3-none-any.whl (97 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m97.7/97.7 kB\u001b[0m \u001b[31m9.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from presidio-analyzer) (6.0.1)\n", + "Collecting phonenumbers<9.0.0,>=8.12 (from presidio-analyzer)\n", + " Downloading phonenumbers-8.13.32-py2.py3-none-any.whl (2.6 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.6/2.6 MB\u001b[0m \u001b[31m39.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.4.4->presidio-analyzer) (3.0.12)\n", + "Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.4.4->presidio-analyzer) (1.0.5)\n", + "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.4.4->presidio-analyzer) (1.0.10)\n", + "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.4.4->presidio-analyzer) (2.0.8)\n", + "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.4.4->presidio-analyzer) (3.0.9)\n", + "Requirement already satisfied: thinc<8.3.0,>=8.2.2 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.4.4->presidio-analyzer) (8.2.3)\n", + "Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.4.4->presidio-analyzer) (1.1.2)\n", + "Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.4.4->presidio-analyzer) (2.4.8)\n", + "Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.4.4->presidio-analyzer) (2.0.10)\n", + "Requirement already satisfied: weasel<0.4.0,>=0.1.0 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.4.4->presidio-analyzer) (0.3.4)\n", + "Requirement already satisfied: typer<0.10.0,>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.4.4->presidio-analyzer) (0.9.0)\n", + "Requirement already satisfied: smart-open<7.0.0,>=5.2.1 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.4.4->presidio-analyzer) (6.4.0)\n", + "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.4.4->presidio-analyzer) (4.66.2)\n", + "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.4.4->presidio-analyzer) (2.31.0)\n", + "Requirement already satisfied: pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.4.4->presidio-analyzer) (2.6.4)\n", + "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.4.4->presidio-analyzer) (3.1.3)\n", + "Requirement already satisfied: setuptools in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.4.4->presidio-analyzer) (67.7.2)\n", + "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.4.4->presidio-analyzer) (24.0)\n", + "Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.4.4->presidio-analyzer) (3.3.0)\n", + "Requirement already satisfied: numpy>=1.19.0 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.4.4->presidio-analyzer) (1.25.2)\n", + "Requirement already satisfied: idna in /usr/local/lib/python3.10/dist-packages (from tldextract->presidio-analyzer) (3.6)\n", + "Collecting requests-file>=1.4 (from tldextract->presidio-analyzer)\n", + " Downloading requests_file-2.0.0-py2.py3-none-any.whl (4.2 kB)\n", + "Requirement already satisfied: filelock>=3.0.8 in /usr/local/lib/python3.10/dist-packages (from tldextract->presidio-analyzer) (3.13.1)\n", + "Requirement already satisfied: annotated-types>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<4.0.0,>=3.4.4->presidio-analyzer) (0.6.0)\n", + "Requirement already satisfied: pydantic-core==2.16.3 in /usr/local/lib/python3.10/dist-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<4.0.0,>=3.4.4->presidio-analyzer) (2.16.3)\n", + "Requirement already satisfied: typing-extensions>=4.6.1 in /usr/local/lib/python3.10/dist-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<4.0.0,>=3.4.4->presidio-analyzer) (4.10.0)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests<3.0.0,>=2.13.0->spacy<4.0.0,>=3.4.4->presidio-analyzer) (3.3.2)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3.0.0,>=2.13.0->spacy<4.0.0,>=3.4.4->presidio-analyzer) (2.0.7)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests<3.0.0,>=2.13.0->spacy<4.0.0,>=3.4.4->presidio-analyzer) (2024.2.2)\n", + "Requirement already satisfied: blis<0.8.0,>=0.7.8 in /usr/local/lib/python3.10/dist-packages (from thinc<8.3.0,>=8.2.2->spacy<4.0.0,>=3.4.4->presidio-analyzer) (0.7.11)\n", + "Requirement already satisfied: confection<1.0.0,>=0.0.1 in /usr/local/lib/python3.10/dist-packages (from thinc<8.3.0,>=8.2.2->spacy<4.0.0,>=3.4.4->presidio-analyzer) (0.1.4)\n", + "Requirement already satisfied: click<9.0.0,>=7.1.1 in /usr/local/lib/python3.10/dist-packages (from typer<0.10.0,>=0.3.0->spacy<4.0.0,>=3.4.4->presidio-analyzer) (8.1.7)\n", + "Requirement already satisfied: cloudpathlib<0.17.0,>=0.7.0 in /usr/local/lib/python3.10/dist-packages (from weasel<0.4.0,>=0.1.0->spacy<4.0.0,>=3.4.4->presidio-analyzer) (0.16.0)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->spacy<4.0.0,>=3.4.4->presidio-analyzer) (2.1.5)\n", + "Installing collected packages: phonenumbers, requests-file, tldextract, presidio-analyzer\n", + "Successfully installed phonenumbers-8.13.32 presidio-analyzer-2.2.353 requests-file-2.0.0 tldextract-5.1.1\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "!pip install flair" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "XWdmM-gGmHV-", + "outputId": "42e0e840-89df-4a99-a3d7-8d78fc7beff0" + }, + "execution_count": 38, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Collecting flair\n", + " Downloading flair-0.13.1-py3-none-any.whl (388 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m388.3/388.3 kB\u001b[0m \u001b[31m3.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting boto3>=1.20.27 (from flair)\n", + " Downloading boto3-1.34.64-py3-none-any.whl (139 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m139.3/139.3 kB\u001b[0m \u001b[31m10.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting bpemb>=0.3.2 (from flair)\n", + " Downloading bpemb-0.3.4-py3-none-any.whl (19 kB)\n", + "Collecting conllu>=4.0 (from flair)\n", + " Downloading conllu-4.5.3-py2.py3-none-any.whl (16 kB)\n", + "Collecting deprecated>=1.2.13 (from flair)\n", + " Downloading Deprecated-1.2.14-py2.py3-none-any.whl (9.6 kB)\n", + "Collecting ftfy>=6.1.0 (from flair)\n", + " Downloading ftfy-6.2.0-py3-none-any.whl (54 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m54.4/54.4 kB\u001b[0m \u001b[31m9.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: gdown>=4.4.0 in /usr/local/lib/python3.10/dist-packages (from flair) (4.7.3)\n", + "Requirement already satisfied: gensim>=4.2.0 in /usr/local/lib/python3.10/dist-packages (from flair) (4.3.2)\n", + "Requirement already satisfied: huggingface-hub>=0.10.0 in /usr/local/lib/python3.10/dist-packages (from flair) (0.20.3)\n", + "Collecting janome>=0.4.2 (from flair)\n", + " Downloading Janome-0.5.0-py2.py3-none-any.whl (19.7 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m19.7/19.7 MB\u001b[0m \u001b[31m46.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting langdetect>=1.0.9 (from flair)\n", + " Downloading langdetect-1.0.9.tar.gz (981 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m981.5/981.5 kB\u001b[0m \u001b[31m67.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "Requirement already satisfied: lxml>=4.8.0 in /usr/local/lib/python3.10/dist-packages (from flair) (4.9.4)\n", + "Requirement already satisfied: matplotlib>=2.2.3 in /usr/local/lib/python3.10/dist-packages (from flair) (3.7.1)\n", + "Requirement already satisfied: more-itertools>=8.13.0 in /usr/local/lib/python3.10/dist-packages (from flair) (10.1.0)\n", + "Collecting mpld3>=0.3 (from flair)\n", + " Downloading mpld3-0.5.10-py3-none-any.whl (202 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m202.6/202.6 kB\u001b[0m \u001b[31m23.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting pptree>=3.1 (from flair)\n", + " Downloading pptree-3.1.tar.gz (3.0 kB)\n", + " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from flair) (2.8.2)\n", + "Collecting pytorch-revgrad>=0.2.0 (from flair)\n", + " Downloading pytorch_revgrad-0.2.0-py3-none-any.whl (4.6 kB)\n", + "Requirement already satisfied: regex>=2022.1.18 in /usr/local/lib/python3.10/dist-packages (from flair) (2023.12.25)\n", + "Requirement already satisfied: scikit-learn>=1.0.2 in /usr/local/lib/python3.10/dist-packages (from flair) (1.2.2)\n", + "Collecting segtok>=1.5.11 (from flair)\n", + " Downloading segtok-1.5.11-py3-none-any.whl (24 kB)\n", + "Collecting sqlitedict>=2.0.0 (from flair)\n", + " Downloading sqlitedict-2.1.0.tar.gz (21 kB)\n", + " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "Requirement already satisfied: tabulate>=0.8.10 in /usr/local/lib/python3.10/dist-packages (from flair) (0.9.0)\n", + "Requirement already satisfied: torch!=1.8,>=1.5.0 in /usr/local/lib/python3.10/dist-packages (from flair) (2.2.1+cu121)\n", + "Requirement already satisfied: tqdm>=4.63.0 in /usr/local/lib/python3.10/dist-packages (from flair) (4.66.2)\n", + "Collecting transformer-smaller-training-vocab>=0.2.3 (from flair)\n", + " Downloading transformer_smaller_training_vocab-0.3.3-py3-none-any.whl (14 kB)\n", + "Requirement already satisfied: transformers[sentencepiece]<5.0.0,>=4.18.0 in /usr/local/lib/python3.10/dist-packages (from flair) (4.38.2)\n", + "Collecting urllib3<2.0.0,>=1.0.0 (from flair)\n", + " Downloading urllib3-1.26.18-py2.py3-none-any.whl (143 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m143.8/143.8 kB\u001b[0m \u001b[31m23.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting wikipedia-api>=0.5.7 (from flair)\n", + " Downloading Wikipedia_API-0.6.0-py3-none-any.whl (14 kB)\n", + "Collecting semver<4.0.0,>=3.0.0 (from flair)\n", + " Downloading semver-3.0.2-py3-none-any.whl (17 kB)\n", + "Collecting botocore<1.35.0,>=1.34.64 (from boto3>=1.20.27->flair)\n", + " Downloading botocore-1.34.64-py3-none-any.whl (12.0 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.0/12.0 MB\u001b[0m \u001b[31m68.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting jmespath<2.0.0,>=0.7.1 (from boto3>=1.20.27->flair)\n", + " Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)\n", + "Collecting s3transfer<0.11.0,>=0.10.0 (from boto3>=1.20.27->flair)\n", + " Downloading s3transfer-0.10.1-py3-none-any.whl (82 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m82.2/82.2 kB\u001b[0m \u001b[31m13.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from bpemb>=0.3.2->flair) (1.25.2)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from bpemb>=0.3.2->flair) (2.31.0)\n", + "Requirement already satisfied: sentencepiece in /usr/local/lib/python3.10/dist-packages (from bpemb>=0.3.2->flair) (0.1.99)\n", + "Requirement already satisfied: wrapt<2,>=1.10 in /usr/local/lib/python3.10/dist-packages (from deprecated>=1.2.13->flair) (1.14.1)\n", + "Requirement already satisfied: wcwidth<0.3.0,>=0.2.12 in /usr/local/lib/python3.10/dist-packages (from ftfy>=6.1.0->flair) (0.2.13)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from gdown>=4.4.0->flair) (3.13.1)\n", + "Requirement already satisfied: six in /usr/local/lib/python3.10/dist-packages (from gdown>=4.4.0->flair) (1.16.0)\n", + "Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.10/dist-packages (from gdown>=4.4.0->flair) (4.12.3)\n", + "Requirement already satisfied: scipy>=1.7.0 in /usr/local/lib/python3.10/dist-packages (from gensim>=4.2.0->flair) (1.11.4)\n", + "Requirement already satisfied: smart-open>=1.8.1 in /usr/local/lib/python3.10/dist-packages (from gensim>=4.2.0->flair) (6.4.0)\n", + "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.10.0->flair) (2023.6.0)\n", + "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.10.0->flair) (6.0.1)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.10.0->flair) (4.10.0)\n", + "Requirement already satisfied: packaging>=20.9 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.10.0->flair) (24.0)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=2.2.3->flair) (1.2.0)\n", + "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=2.2.3->flair) (0.12.1)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=2.2.3->flair) (4.49.0)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=2.2.3->flair) (1.4.5)\n", + "Requirement already satisfied: pillow>=6.2.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=2.2.3->flair) (9.4.0)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=2.2.3->flair) (3.1.2)\n", + "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from mpld3>=0.3->flair) (3.1.3)\n", + "Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=1.0.2->flair) (1.3.2)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=1.0.2->flair) (3.3.0)\n", + "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch!=1.8,>=1.5.0->flair) (1.12)\n", + "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch!=1.8,>=1.5.0->flair) (3.2.1)\n", + "Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch!=1.8,>=1.5.0->flair)\n", + " Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m23.7/23.7 MB\u001b[0m \u001b[31m25.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch!=1.8,>=1.5.0->flair)\n", + " Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m823.6/823.6 kB\u001b[0m \u001b[31m45.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting nvidia-cuda-cupti-cu12==12.1.105 (from torch!=1.8,>=1.5.0->flair)\n", + " Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m14.1/14.1 MB\u001b[0m \u001b[31m72.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting nvidia-cudnn-cu12==8.9.2.26 (from torch!=1.8,>=1.5.0->flair)\n", + " Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m731.7/731.7 MB\u001b[0m \u001b[31m906.7 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting nvidia-cublas-cu12==12.1.3.1 (from torch!=1.8,>=1.5.0->flair)\n", + " Downloading nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m410.6/410.6 MB\u001b[0m \u001b[31m3.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting nvidia-cufft-cu12==11.0.2.54 (from torch!=1.8,>=1.5.0->flair)\n", + " Downloading nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m121.6/121.6 MB\u001b[0m \u001b[31m8.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting nvidia-curand-cu12==10.3.2.106 (from torch!=1.8,>=1.5.0->flair)\n", + " Downloading nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━��━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m56.5/56.5 MB\u001b[0m \u001b[31m10.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting nvidia-cusolver-cu12==11.4.5.107 (from torch!=1.8,>=1.5.0->flair)\n", + " Downloading nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl (124.2 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m124.2/124.2 MB\u001b[0m \u001b[31m8.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting nvidia-cusparse-cu12==12.1.0.106 (from torch!=1.8,>=1.5.0->flair)\n", + " Downloading nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl (196.0 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m196.0/196.0 MB\u001b[0m \u001b[31m2.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting nvidia-nccl-cu12==2.19.3 (from torch!=1.8,>=1.5.0->flair)\n", + " Downloading nvidia_nccl_cu12-2.19.3-py3-none-manylinux1_x86_64.whl (166.0 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m166.0/166.0 MB\u001b[0m \u001b[31m2.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting nvidia-nvtx-cu12==12.1.105 (from torch!=1.8,>=1.5.0->flair)\n", + " Downloading nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (99 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m99.1/99.1 kB\u001b[0m \u001b[31m15.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: triton==2.2.0 in /usr/local/lib/python3.10/dist-packages (from torch!=1.8,>=1.5.0->flair) (2.2.0)\n", + "Collecting nvidia-nvjitlink-cu12 (from nvidia-cusolver-cu12==11.4.5.107->torch!=1.8,>=1.5.0->flair)\n", + " Downloading nvidia_nvjitlink_cu12-12.4.99-py3-none-manylinux2014_x86_64.whl (21.1 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m21.1/21.1 MB\u001b[0m \u001b[31m70.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: tokenizers<0.19,>=0.14 in /usr/local/lib/python3.10/dist-packages (from transformers[sentencepiece]<5.0.0,>=4.18.0->flair) (0.15.2)\n", + "Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.10/dist-packages (from transformers[sentencepiece]<5.0.0,>=4.18.0->flair) (0.4.2)\n", + "Requirement already satisfied: protobuf in /usr/local/lib/python3.10/dist-packages (from transformers[sentencepiece]<5.0.0,>=4.18.0->flair) (3.20.3)\n", + "Collecting accelerate>=0.21.0 (from transformers[sentencepiece]<5.0.0,>=4.18.0->flair)\n", + " Downloading accelerate-0.28.0-py3-none-any.whl (290 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m290.1/290.1 kB\u001b[0m \u001b[31m32.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.10/dist-packages (from beautifulsoup4->gdown>=4.4.0->flair) (2.5)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->mpld3>=0.3->flair) (2.1.5)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->bpemb>=0.3.2->flair) (3.3.2)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->bpemb>=0.3.2->flair) (3.6)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->bpemb>=0.3.2->flair) (2024.2.2)\n", + "Requirement already satisfied: PySocks!=1.5.7,>=1.5.6 in /usr/local/lib/python3.10/dist-packages (from requests->bpemb>=0.3.2->flair) (1.7.1)\n", + "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch!=1.8,>=1.5.0->flair) (1.3.0)\n", + "Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->transformers[sentencepiece]<5.0.0,>=4.18.0->flair) (5.9.5)\n", + "Building wheels for collected packages: langdetect, pptree, sqlitedict\n", + " Building wheel for langdetect (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993227 sha256=9da87eaaff56d6d1421c337ae61089e29874a2de99038123b209c2cf6ffe4791\n", + " Stored in directory: /root/.cache/pip/wheels/95/03/7d/59ea870c70ce4e5a370638b5462a7711ab78fba2f655d05106\n", + " Building wheel for pptree (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for pptree: filename=pptree-3.1-py3-none-any.whl size=4609 sha256=be019012224ff0981466d5ef57193c243fbbc1542c10b46f3ed8f17e84f74b0e\n", + " Stored in directory: /root/.cache/pip/wheels/9f/b6/0e/6f26eb9e6eb53ff2107a7888d72b5a6a597593956113037828\n", + " Building wheel for sqlitedict (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for sqlitedict: filename=sqlitedict-2.1.0-py3-none-any.whl size=16862 sha256=056a323511a15e5bdbb990ad53b061cdb301623fdbf4a77ead3f71402b27bf97\n", + " Stored in directory: /root/.cache/pip/wheels/79/d6/e7/304e0e6cb2221022c26d8161f7c23cd4f259a9e41e8bbcfabd\n", + "Successfully built langdetect pptree sqlitedict\n", + "Installing collected packages: sqlitedict, pptree, janome, urllib3, semver, segtok, nvidia-nvtx-cu12, nvidia-nvjitlink-cu12, nvidia-nccl-cu12, nvidia-curand-cu12, nvidia-cufft-cu12, nvidia-cuda-runtime-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-cupti-cu12, nvidia-cublas-cu12, langdetect, jmespath, ftfy, deprecated, conllu, nvidia-cusparse-cu12, nvidia-cudnn-cu12, botocore, wikipedia-api, s3transfer, nvidia-cusolver-cu12, mpld3, bpemb, boto3, pytorch-revgrad, accelerate, transformer-smaller-training-vocab, flair\n", + " Attempting uninstall: urllib3\n", + " Found existing installation: urllib3 2.0.7\n", + " Uninstalling urllib3-2.0.7:\n", + " Successfully uninstalled urllib3-2.0.7\n", + "Successfully installed accelerate-0.28.0 boto3-1.34.64 botocore-1.34.64 bpemb-0.3.4 conllu-4.5.3 deprecated-1.2.14 flair-0.13.1 ftfy-6.2.0 janome-0.5.0 jmespath-1.0.1 langdetect-1.0.9 mpld3-0.5.10 nvidia-cublas-cu12-12.1.3.1 nvidia-cuda-cupti-cu12-12.1.105 nvidia-cuda-nvrtc-cu12-12.1.105 nvidia-cuda-runtime-cu12-12.1.105 nvidia-cudnn-cu12-8.9.2.26 nvidia-cufft-cu12-11.0.2.54 nvidia-curand-cu12-10.3.2.106 nvidia-cusolver-cu12-11.4.5.107 nvidia-cusparse-cu12-12.1.0.106 nvidia-nccl-cu12-2.19.3 nvidia-nvjitlink-cu12-12.4.99 nvidia-nvtx-cu12-12.1.105 pptree-3.1 pytorch-revgrad-0.2.0 s3transfer-0.10.1 segtok-1.5.11 semver-3.0.2 sqlitedict-2.1.0 transformer-smaller-training-vocab-0.3.3 urllib3-1.26.18 wikipedia-api-0.6.0\n" + ] + }, + { + "output_type": "display_data", + "data": { + "application/vnd.colab-display-data+json": { + "pip_warning": { + "packages": [ + "urllib3" + ] + }, + "id": "dfdc4a89fa71429587bc109f13908415" + } + }, + "metadata": {} + } + ] + }, + { + "cell_type": "code", + "source": [ + "import os\n", + "\n", + "os.environ[\"KERAS_BACKEND\"] = \"tensorflow\"\n", + "\n", + "import os\n", + "import keras\n", + "import numpy as np\n", + "import tensorflow as tf\n", + "from keras import layers\n", + "from datasets import load_dataset\n", + "from collections import Counter\n", + "from conlleval import evaluate\n", + "\n", + "import pandas as pd\n", + "from google.colab import files\n", + "import matplotlib.pyplot as plt\n", + "\n", + "from transformers import AutoModel, AutoTokenizer\n", + "\n", + "import logging\n", + "from typing import Optional, List, Tuple, Set\n", + "from presidio_analyzer import (\n", + " RecognizerResult,\n", + " EntityRecognizer,\n", + " AnalysisExplanation,\n", + ")\n", + "from presidio_analyzer.nlp_engine import NlpArtifacts\n", + "\n", + "from flair.data import Sentence\n", + "from flair.models import SequenceTagger\n" + ], + "metadata": { + "id": "9FxNt5pZY0e2" + }, + "execution_count": 19, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "class TransformerBlock(layers.Layer):\n", + " def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):\n", + " super().__init__()\n", + " self.att = keras.layers.MultiHeadAttention(\n", + " num_heads=num_heads, key_dim=embed_dim\n", + " )\n", + " self.ffn = keras.Sequential(\n", + " [\n", + " keras.layers.Dense(ff_dim, activation=\"relu\"),\n", + " keras.layers.Dense(embed_dim),\n", + " ]\n", + " )\n", + " self.layernorm1 = keras.layers.LayerNormalization(epsilon=1e-6)\n", + " self.layernorm2 = keras.layers.LayerNormalization(epsilon=1e-6)\n", + " self.dropout1 = keras.layers.Dropout(rate)\n", + " self.dropout2 = keras.layers.Dropout(rate)\n", + "\n", + " def call(self, inputs, training=False):\n", + " attn_output = self.att(inputs, inputs)\n", + " attn_output = self.dropout1(attn_output, training=training)\n", + " out1 = self.layernorm1(inputs + attn_output)\n", + " ffn_output = self.ffn(out1)\n", + " ffn_output = self.dropout2(ffn_output, training=training)\n", + " return self.layernorm2(out1 + ffn_output)\n" + ], + "metadata": { + "id": "a2ro_nntY-FC" + }, + "execution_count": 4, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "class TokenAndPositionEmbedding(layers.Layer):\n", + " def __init__(self, maxlen, vocab_size, embed_dim):\n", + " super().__init__()\n", + " self.token_emb = keras.layers.Embedding(\n", + " input_dim=vocab_size, output_dim=embed_dim\n", + " )\n", + " self.pos_emb = keras.layers.Embedding(input_dim=maxlen, output_dim=embed_dim)\n", + "\n", + " def call(self, inputs):\n", + " maxlen = tf.shape(inputs)[-1]\n", + " positions = tf.range(start=0, limit=maxlen, delta=1)\n", + " position_embeddings = self.pos_emb(positions)\n", + " token_embeddings = self.token_emb(inputs)\n", + " return token_embeddings + position_embeddings" + ], + "metadata": { + "id": "jg0WkejPZBn8" + }, + "execution_count": 5, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "class NERModel(keras.Model):\n", + " def __init__(\n", + " self, num_tags, vocab_size, maxlen=128, embed_dim=32, num_heads=2, ff_dim=32\n", + " ):\n", + " super().__init__()\n", + " self.embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)\n", + " self.transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)\n", + " self.dropout1 = layers.Dropout(0.1)\n", + " self.ff = layers.Dense(ff_dim, activation=\"relu\")\n", + " self.dropout2 = layers.Dropout(0.1)\n", + " self.ff_final = layers.Dense(num_tags, activation=\"softmax\")\n", + "\n", + " def call(self, inputs, training=False):\n", + " x = self.embedding_layer(inputs)\n", + " x = self.transformer_block(x)\n", + " x = self.dropout1(x, training=training)\n", + " x = self.ff(x)\n", + " x = self.dropout2(x, training=training)\n", + " x = self.ff_final(x)\n", + " return x" + ], + "metadata": { + "id": "HeMvk_zKZFXy" + }, + "execution_count": 6, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "conll_data = load_dataset(\"conll2003\")\n" + ], + "metadata": { + "id": "weGmhigxZMaT" + }, + "execution_count": 7, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "def dataset_to_dataframe(dataset):\n", + " data_dict = {key: dataset[key] for key in dataset.features}\n", + " return pd.DataFrame(data_dict)\n", + "\n", + "# Combine all splits (train, validation, test) into a single DataFrame\n", + "conll_df = pd.concat([dataset_to_dataframe(conll_data[split]) for split in conll_data.keys()])" + ], + "metadata": { + "id": "SEvvIFAgdcAF" + }, + "execution_count": 8, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "csv_file_path = \"conll_data.csv\"\n", + "conll_df.to_csv(csv_file_path, index=False)\n", + "\n", + "# Download the CSV file to local machine\n", + "\n", + "files.download(csv_file_path)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 17 + }, + "id": "UejgBp-Ng_l_", + "outputId": "98b45e90-3b08-4857-f7eb-42e9a319eb29" + }, + "execution_count": 7, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "application/javascript": [ + "\n", + " async function download(id, filename, size) {\n", + " if (!google.colab.kernel.accessAllowed) {\n", + " return;\n", + " }\n", + " const div = document.createElement('div');\n", + " const label = document.createElement('label');\n", + " label.textContent = `Downloading \"${filename}\": `;\n", + " div.appendChild(label);\n", + " const progress = document.createElement('progress');\n", + " progress.max = size;\n", + " div.appendChild(progress);\n", + " document.body.appendChild(div);\n", + "\n", + " const buffers = [];\n", + " let downloaded = 0;\n", + "\n", + " const channel = await google.colab.kernel.comms.open(id);\n", + " // Send a message to notify the kernel that we're ready.\n", + " channel.send({})\n", + "\n", + " for await (const message of channel.messages) {\n", + " // Send a message to notify the kernel that we're ready.\n", + " channel.send({})\n", + " if (message.buffers) {\n", + " for (const buffer of message.buffers) {\n", + " buffers.push(buffer);\n", + " downloaded += buffer.byteLength;\n", + " progress.value = downloaded;\n", + " }\n", + " }\n", + " }\n", + " const blob = new Blob(buffers, {type: 'application/binary'});\n", + " const a = document.createElement('a');\n", + " a.href = window.URL.createObjectURL(blob);\n", + " a.download = filename;\n", + " div.appendChild(a);\n", + " a.click();\n", + " div.remove();\n", + " }\n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "application/javascript": [ + "download(\"download_e9dfb994-0d94-46a0-a16d-296a01070e4a\", \"conll_data.csv\", 6111395)" + ] + }, + "metadata": {} + } + ] + }, + { + "cell_type": "code", + "source": [ + "print(conll_df.head())" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "NRyX6MExgwi7", + "outputId": "78702706-75b5-4d4e-9cf7-08f21bb99dcb" + }, + "execution_count": 8, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + " id tokens \\\n", + "0 0 [EU, rejects, German, call, to, boycott, Briti... \n", + "1 1 [Peter, Blackburn] \n", + "2 2 [BRUSSELS, 1996-08-22] \n", + "3 3 [The, European, Commission, said, on, Thursday... \n", + "4 4 [Germany, 's, representative, to, the, Europea... \n", + "\n", + " pos_tags \\\n", + "0 [22, 42, 16, 21, 35, 37, 16, 21, 7] \n", + "1 [22, 22] \n", + "2 [22, 11] \n", + "3 [12, 22, 22, 38, 15, 22, 28, 38, 15, 16, 21, 3... \n", + "4 [22, 27, 21, 35, 12, 22, 22, 27, 16, 21, 22, 2... \n", + "\n", + " chunk_tags \\\n", + "0 [11, 21, 11, 12, 21, 22, 11, 12, 0] \n", + "1 [11, 12] \n", + "2 [11, 12] \n", + "3 [11, 12, 12, 21, 13, 11, 11, 21, 13, 11, 12, 1... \n", + "4 [11, 11, 12, 13, 11, 12, 12, 11, 12, 12, 12, 1... \n", + "\n", + " ner_tags \n", + "0 [3, 0, 7, 0, 0, 0, 7, 0, 0] \n", + "1 [1, 2] \n", + "2 [5, 0] \n", + "3 [0, 3, 4, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, ... \n", + "4 [5, 0, 0, 0, 0, 3, 4, 0, 0, 0, 1, 2, 0, 0, 0, ... \n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "print(conll_df.describe())" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "LAiHg17QhO-2", + "outputId": "065d13c3-c8ea-40f3-f84a-2fc4f2332ff2" + }, + "execution_count": 10, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + " id tokens pos_tags chunk_tags ner_tags\n", + "count 20744 20744 20744 20744 20744\n", + "unique 14041 18731 13126 11282 8047\n", + "top 0 [Scorers, :] [22, 11] [11, 12] [5, 0]\n", + "freq 3 30 611 1290 955\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "print(conll_df.dtypes)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "9LwwJ8zbhVlk", + "outputId": "c32dde53-bf78-4f94-aa32-1cdef388099e" + }, + "execution_count": 11, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "id object\n", + "tokens object\n", + "pos_tags object\n", + "chunk_tags object\n", + "ner_tags object\n", + "dtype: object\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "print(conll_df.isnull().sum())" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "njbG34F3hl5D", + "outputId": "81cb8929-f9a0-4a07-d1f6-f306d2ffc7c0" + }, + "execution_count": 12, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "id 0\n", + "tokens 0\n", + "pos_tags 0\n", + "chunk_tags 0\n", + "ner_tags 0\n", + "dtype: int64\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "label_counts = conll_df['ner_tags'].value_counts()\n", + "print(label_counts)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "98pX56RShpgR", + "outputId": "18ef0b75-727b-4f2b-8ae9-6b4415e8e17a" + }, + "execution_count": 13, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[5, 0] 955\n", + "[3, 0, 0, 0, 0, 0, 0, 0] 663\n", + "[0, 1, 2, 0, 5, 0, 0] 582\n", + "[0, 0] 409\n", + "[3, 0, 3, 0] 352\n", + " ... \n", + "[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0] 1\n", + "[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 1\n", + "[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 1\n", + "[0, 0, 0, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 1, 2, 0] 1\n", + "[0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 8, 0, 0, 0, 0, 0, 0, 0, 1, 0] 1\n", + "Name: ner_tags, Length: 8047, dtype: int64\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "top_10_labels = label_counts.head(10)\n", + "\n", + "# Plot the distribution of the top 10 NER tags\n", + "plt.figure(figsize=(10, 6))\n", + "top_10_labels.plot(kind='bar')\n", + "plt.title('Top 10 Most Common NER Tags')\n", + "plt.xlabel('NER Tag')\n", + "plt.ylabel('Count')\n", + "plt.show()\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 710 + }, + "id": "Yd71HpRQhuoZ", + "outputId": "066c4b14-3edf-4139-e665-cdbf95dac172" + }, + "execution_count": 14, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "\n" + }, + "metadata": {} + } + ] + }, + { + "cell_type": "code", + "source": [ + "def export_to_file(export_file_path, data):\n", + " with open(export_file_path, \"w\") as f:\n", + " for record in data:\n", + " ner_tags = record[\"ner_tags\"]\n", + " tokens = record[\"tokens\"]\n", + " if len(tokens) > 0:\n", + " f.write(\n", + " str(len(tokens))\n", + " + \"\\t\"\n", + " + \"\\t\".join(tokens)\n", + " + \"\\t\"\n", + " + \"\\t\".join(map(str, ner_tags))\n", + " + \"\\n\"\n", + " )\n", + "\n", + "\n", + "os.makedirs(\"data\", exist_ok=True)\n", + "export_to_file(\"./data/conll_train.txt\", conll_data[\"train\"])\n", + "export_to_file(\"./data/conll_val.txt\", conll_data[\"validation\"])" + ], + "metadata": { + "id": "EQgmkV1fZRhI" + }, + "execution_count": 9, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "def make_tag_lookup_table():\n", + " iob_labels = [\"B\", \"I\"]\n", + " ner_labels = [\"PER\", \"ORG\", \"LOC\", \"MISC\"]\n", + " all_labels = [(label1, label2) for label2 in ner_labels for label1 in iob_labels]\n", + " all_labels = [\"-\".join([a, b]) for a, b in all_labels]\n", + " all_labels = [\"[PAD]\", \"O\"] + all_labels\n", + " return dict(zip(range(0, len(all_labels) + 1), all_labels))\n", + "\n", + "\n", + "mapping = make_tag_lookup_table()\n", + "print(mapping)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "OdufhIrEZRs2", + "outputId": "09e10fc1-6fdf-4281-ac81-973d32dad3a5" + }, + "execution_count": 10, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "{0: '[PAD]', 1: 'O', 2: 'B-PER', 3: 'I-PER', 4: 'B-ORG', 5: 'I-ORG', 6: 'B-LOC', 7: 'I-LOC', 8: 'B-MISC', 9: 'I-MISC'}\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "all_tokens = sum(conll_data[\"train\"][\"tokens\"], [])\n", + "all_tokens_array = np.array(list(map(str.lower, all_tokens)))\n", + "\n", + "counter = Counter(all_tokens_array)\n", + "print(len(counter))\n", + "\n", + "num_tags = len(mapping)\n", + "vocab_size = 20000\n", + "\n", + "# We only take (vocab_size - 2) most commons words from the training data since\n", + "# the `StringLookup` class uses 2 additional tokens - one denoting an unknown\n", + "# token and another one denoting a masking token\n", + "vocabulary = [token for token, count in counter.most_common(vocab_size - 2)]\n", + "\n", + "# The StringLook class will convert tokens to token IDs\n", + "lookup_layer = keras.layers.StringLookup(vocabulary=vocabulary)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "a7T9RCZ3ZSKB", + "outputId": "c2dae2fc-b812-4d64-b3eb-23e2d38710c3" + }, + "execution_count": 11, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "21009\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "train_data = tf.data.TextLineDataset(\"./data/conll_train.txt\")\n", + "val_data = tf.data.TextLineDataset(\"./data/conll_val.txt\")" + ], + "metadata": { + "id": "vdcDo5IJZfjl" + }, + "execution_count": 12, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "print(list(train_data.take(1).as_numpy_iterator()))\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "8fXqLG3FZfmx", + "outputId": "42354174-a397-4b9e-eda0-4b1d5ed62665" + }, + "execution_count": 13, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[b'9\\tEU\\trejects\\tGerman\\tcall\\tto\\tboycott\\tBritish\\tlamb\\t.\\t3\\t0\\t7\\t0\\t0\\t0\\t7\\t0\\t0']\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "def map_record_to_training_data(record):\n", + " record = tf.strings.split(record, sep=\"\\t\")\n", + " length = tf.strings.to_number(record[0], out_type=tf.int32)\n", + " tokens = record[1 : length + 1]\n", + " tags = record[length + 1 :]\n", + " tags = tf.strings.to_number(tags, out_type=tf.int64)\n", + " tags += 1\n", + " return tokens, tags\n", + "\n", + "\n", + "def lowercase_and_convert_to_ids(tokens):\n", + " tokens = tf.strings.lower(tokens)\n", + " return lookup_layer(tokens)\n", + "\n", + "\n", + "# We use `padded_batch` here because each record in the dataset has a\n", + "# different length.\n", + "batch_size = 32\n", + "train_dataset = (\n", + " train_data.map(map_record_to_training_data)\n", + " .map(lambda x, y: (lowercase_and_convert_to_ids(x), y))\n", + " .padded_batch(batch_size)\n", + ")\n", + "val_dataset = (\n", + " val_data.map(map_record_to_training_data)\n", + " .map(lambda x, y: (lowercase_and_convert_to_ids(x), y))\n", + " .padded_batch(batch_size)\n", + ")\n", + "\n", + "ner_model = NERModel(num_tags, vocab_size, embed_dim=32, num_heads=4, ff_dim=64)" + ], + "metadata": { + "id": "jtt-G6ezZto5" + }, + "execution_count": 14, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "class CustomNonPaddingTokenLoss(keras.losses.Loss):\n", + " def __init__(self, name=\"custom_ner_loss\"):\n", + " super().__init__(name=name)\n", + "\n", + " def call(self, y_true, y_pred):\n", + " loss_fn = keras.losses.SparseCategoricalCrossentropy(\n", + " from_logits=False, reduction= 'none'\n", + " )\n", + " loss = loss_fn(y_true, y_pred)\n", + " mask = tf.cast((y_true > 0), dtype=tf.float32)\n", + " loss = loss * mask\n", + " return tf.reduce_sum(loss) / tf.reduce_sum(mask)\n", + "\n", + "\n", + "loss = CustomNonPaddingTokenLoss()" + ], + "metadata": { + "id": "uqCmpwqgZtrs" + }, + "execution_count": 15, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "ner_model.compile(optimizer=\"adam\", loss=loss)\n", + "ner_model.fit(train_dataset, epochs=10)\n", + "\n", + "\n", + "def tokenize_and_convert_to_ids(text):\n", + " tokens = text.split()\n", + " return lowercase_and_convert_to_ids(tokens)\n", + "\n", + "\n", + "# Sample inference using the trained model\n", + "sample_input = tokenize_and_convert_to_ids(\n", + " \"eu rejects german call to boycott british lamb\"\n", + ")\n", + "sample_input = tf.reshape(sample_input, shape=[1, -1])\n", + "print(sample_input)\n", + "\n", + "output = ner_model.predict(sample_input)\n", + "prediction = np.argmax(output, axis=-1)[0]\n", + "prediction = [mapping[i] for i in prediction]\n", + "\n", + "# eu -> B-ORG, german -> B-MISC, british -> B-MISC\n", + "print(prediction)\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "TQDGyN4gZtuC", + "outputId": "5b743bb3-2112-47b2-e4f7-0db45991f93d" + }, + "execution_count": 16, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 1/10\n", + "439/439 [==============================] - 20s 38ms/step - loss: 0.6150\n", + "Epoch 2/10\n", + "439/439 [==============================] - 17s 38ms/step - loss: 0.2667\n", + "Epoch 3/10\n", + "439/439 [==============================] - 14s 33ms/step - loss: 0.1617\n", + "Epoch 4/10\n", + "439/439 [==============================] - 15s 33ms/step - loss: 0.1254\n", + "Epoch 5/10\n", + "439/439 [==============================] - 14s 33ms/step - loss: 0.1015\n", + "Epoch 6/10\n", + "439/439 [==============================] - 14s 32ms/step - loss: 0.0837\n", + "Epoch 7/10\n", + "439/439 [==============================] - 15s 35ms/step - loss: 0.0697\n", + "Epoch 8/10\n", + "439/439 [==============================] - 14s 32ms/step - loss: 0.0604\n", + "Epoch 9/10\n", + "439/439 [==============================] - 15s 33ms/step - loss: 0.0526\n", + "Epoch 10/10\n", + "439/439 [==============================] - 16s 35ms/step - loss: 0.0456\n", + "tf.Tensor([[ 988 10950 204 628 6 3938 215 5773]], shape=(1, 8), dtype=int64)\n", + "1/1 [==============================] - 0s 261ms/step\n", + "['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O']\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "def calculate_metrics(dataset):\n", + " all_true_tag_ids, all_predicted_tag_ids = [], []\n", + "\n", + " for x, y in dataset:\n", + " output = ner_model.predict(x, verbose=0)\n", + " predictions = np.argmax(output, axis=-1)\n", + " predictions = np.reshape(predictions, [-1])\n", + "\n", + " true_tag_ids = np.reshape(y, [-1])\n", + "\n", + " mask = (true_tag_ids > 0) & (predictions > 0)\n", + " true_tag_ids = true_tag_ids[mask]\n", + " predicted_tag_ids = predictions[mask]\n", + "\n", + " all_true_tag_ids.append(true_tag_ids)\n", + " all_predicted_tag_ids.append(predicted_tag_ids)\n", + "\n", + " all_true_tag_ids = np.concatenate(all_true_tag_ids)\n", + " all_predicted_tag_ids = np.concatenate(all_predicted_tag_ids)\n", + "\n", + " predicted_tags = [mapping[tag] for tag in all_predicted_tag_ids]\n", + " real_tags = [mapping[tag] for tag in all_true_tag_ids]\n", + "\n", + " evaluate(real_tags, predicted_tags)\n", + "\n", + "\n", + "calculate_metrics(val_dataset)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "vPPszQFIcEKi", + "outputId": "22d8a103-b1d1-402b-b401-f5662fdaca00" + }, + "execution_count": 17, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "processed 51362 tokens with 5942 phrases; found: 5194 phrases; correct: 3847.\n", + "accuracy: 62.20%; (non-O)\n", + "accuracy: 93.33%; precision: 74.07%; recall: 64.74%; FB1: 69.09\n", + " LOC: precision: 85.18%; recall: 79.48%; FB1: 82.23 1714\n", + " MISC: precision: 75.61%; recall: 63.88%; FB1: 69.25 779\n", + " ORG: precision: 63.88%; recall: 60.92%; FB1: 62.37 1279\n", + " PER: precision: 68.99%; recall: 53.26%; FB1: 60.11 1422\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "def test_model_with_input(ner_model, mapping):\n", + " # Get input sentence from user\n", + " input_sentence = input(\"Enter a sentence: \")\n", + "\n", + " # Tokenize and convert input sentence to IDs\n", + " sample_input = tokenize_and_convert_to_ids(input_sentence)\n", + " sample_input = tf.reshape(sample_input, shape=[1, -1])\n", + "\n", + " # Predict tags using the trained model\n", + " output = ner_model.predict(sample_input)\n", + " predictions = np.argmax(output, axis=-1)[0]\n", + " predicted_tags = [mapping[i] for i in predictions]\n", + "\n", + " # Print the predicted tags for each token in the input sentence\n", + " print(\"Input sentence:\", input_sentence)\n", + " print(\"Predicted tags:\", predicted_tags)\n", + "\n", + "# Test the model with user input\n", + "test_model_with_input(ner_model, mapping)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "BX6jui33cEiJ", + "outputId": "91207f20-c00e-46ab-ae91-9bc1dfc8d804" + }, + "execution_count": 18, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Enter a sentence: My Name is Karishma. I live in Canada. Canada I am from India\n", + "1/1 [==============================] - 0s 20ms/step\n", + "Input sentence: My Name is Karishma. I live in Canada. Canada I am from India\n", + "Predicted tags: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'B-LOC']\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "logger = logging.getLogger(\"presidio-analyzer\")\n", + "\n", + "\n", + "class FlairRecognizer(EntityRecognizer):\n", + " \"\"\"\n", + " Wrapper for a flair model, if needed to be used within Presidio Analyzer.\n", + " :example:\n", + " >from presidio_analyzer import AnalyzerEngine, RecognizerRegistry\n", + " >flair_recognizer = FlairRecognizer()\n", + " >registry = RecognizerRegistry()\n", + " >registry.add_recognizer(flair_recognizer)\n", + " >analyzer = AnalyzerEngine(registry=registry)\n", + " >results = analyzer.analyze(\n", + " > \"My name is Christopher and I live in Irbid.\",\n", + " > language=\"en\",\n", + " > return_decision_process=True,\n", + " >)\n", + " >for result in results:\n", + " > print(result)\n", + " > print(result.analysis_explanation)\n", + " \"\"\"\n", + "\n", + " ENTITIES = [\n", + " \"LOCATION\",\n", + " \"PERSON\",\n", + " \"ORGANIZATION\",\n", + " # \"MISCELLANEOUS\" # - There are no direct correlation with Presidio entities.\n", + " ]\n", + "\n", + " DEFAULT_EXPLANATION = \"Identified as {} by Flair's Named Entity Recognition\"\n", + "\n", + " CHECK_LABEL_GROUPS = [\n", + " ({\"LOCATION\"}, {\"LOC\", \"LOCATION\"}),\n", + " ({\"PERSON\"}, {\"PER\", \"PERSON\"}),\n", + " ({\"ORGANIZATION\"}, {\"ORG\"}),\n", + " # ({\"MISCELLANEOUS\"}, {\"MISC\"}), # Probably not PII\n", + " ]\n", + "\n", + " MODEL_LANGUAGES = {\"en\": \"flair/ner-english-large\"}\n", + "\n", + " PRESIDIO_EQUIVALENCES = {\n", + " \"PER\": \"PERSON\",\n", + " \"LOC\": \"LOCATION\",\n", + " \"ORG\": \"ORGANIZATION\",\n", + " # 'MISC': 'MISCELLANEOUS' # - Probably not PII\n", + " }\n", + "\n", + " def __init__(\n", + " self,\n", + " supported_language: str = \"en\",\n", + " supported_entities: Optional[List[str]] = None,\n", + " check_label_groups: Optional[Tuple[Set, Set]] = None,\n", + " model: SequenceTagger = None,\n", + " model_path: Optional[str] = None,\n", + " ):\n", + " self.check_label_groups = (\n", + " check_label_groups if check_label_groups else self.CHECK_LABEL_GROUPS\n", + " )\n", + "\n", + " supported_entities = supported_entities if supported_entities else self.ENTITIES\n", + "\n", + " if model and model_path:\n", + " raise ValueError(\"Only one of model or model_path should be provided.\")\n", + " elif model and not model_path:\n", + " self.model = model\n", + " elif not model and model_path:\n", + " print(f\"Loading model from {model_path}\")\n", + " self.model = SequenceTagger.load(model_path)\n", + " else:\n", + " print(f\"Loading model for language {supported_language}\")\n", + " self.model = SequenceTagger.load(\n", + " self.MODEL_LANGUAGES.get(supported_language)\n", + " )\n", + "\n", + " super().__init__(\n", + " supported_entities=supported_entities,\n", + " supported_language=supported_language,\n", + " name=\"Flair Analytics\",\n", + " )\n", + "\n", + " def load(self) -> None:\n", + " \"\"\"Load the model, not used. Model is loaded during initialization.\"\"\"\n", + " pass\n", + "\n", + " def get_supported_entities(self) -> List[str]:\n", + " \"\"\"\n", + " Return supported entities by this model.\n", + " :return: List of the supported entities.\n", + " \"\"\"\n", + " return self.supported_entities\n", + "\n", + " # Class to use Flair with Presidio as an external recognizer.\n", + " def analyze(\n", + " self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts = None\n", + " ) -> List[RecognizerResult]:\n", + " \"\"\"\n", + " Analyze text using Text Analytics.\n", + " :param text: The text for analysis.\n", + " :param entities: Not working properly for this recognizer.\n", + " :param nlp_artifacts: Not used by this recognizer.\n", + " :param language: Text language. Supported languages in MODEL_LANGUAGES\n", + " :return: The list of Presidio RecognizerResult constructed from the recognized\n", + " Flair detections.\n", + " \"\"\"\n", + "\n", + " results = []\n", + "\n", + " sentences = Sentence(text)\n", + " self.model.predict(sentences)\n", + "\n", + " # If there are no specific list of entities, we will look for all of it.\n", + " if not entities:\n", + " entities = self.supported_entities\n", + "\n", + " for entity in entities:\n", + " if entity not in self.supported_entities:\n", + " continue\n", + "\n", + " for ent in sentences.get_spans(\"ner\"):\n", + " if not self.__check_label(\n", + " entity, ent.labels[0].value, self.check_label_groups\n", + " ):\n", + " continue\n", + " textual_explanation = self.DEFAULT_EXPLANATION.format(\n", + " ent.labels[0].value\n", + " )\n", + " explanation = self.build_flair_explanation(\n", + " round(ent.score, 2), textual_explanation\n", + " )\n", + " flair_result = self._convert_to_recognizer_result(ent, explanation)\n", + "\n", + " results.append(flair_result)\n", + "\n", + " return results\n", + "\n", + " def _convert_to_recognizer_result(self, entity, explanation) -> RecognizerResult:\n", + " entity_type = self.PRESIDIO_EQUIVALENCES.get(entity.tag, entity.tag)\n", + " flair_score = round(entity.score, 2)\n", + "\n", + " flair_results = RecognizerResult(\n", + " entity_type=entity_type,\n", + " start=entity.start_position,\n", + " end=entity.end_position,\n", + " score=flair_score,\n", + " analysis_explanation=explanation,\n", + " )\n", + "\n", + " return flair_results\n", + "\n", + " def build_flair_explanation(\n", + " self, original_score: float, explanation: str\n", + " ) -> AnalysisExplanation:\n", + " \"\"\"\n", + " Create explanation for why this result was detected.\n", + " :param original_score: Score given by this recognizer\n", + " :param explanation: Explanation string\n", + " :return:\n", + " \"\"\"\n", + " explanation = AnalysisExplanation(\n", + " recognizer=self.__class__.__name__,\n", + " original_score=original_score,\n", + " textual_explanation=explanation,\n", + " )\n", + " return explanation\n", + "\n", + " @staticmethod\n", + " def __check_label(\n", + " entity: str, label: str, check_label_groups: Tuple[Set, Set]\n", + " ) -> bool:\n", + " return any(\n", + " [entity in egrp and label in lgrp for egrp, lgrp in check_label_groups]\n", + " )" + ], + "metadata": { + "id": "OWwGi143lCVF" + }, + "execution_count": 20, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "from transformers import AutoModel, AutoTokenizer\n", + "\n", + "\n", + "if __name__ == \"__main__\":\n", + " from flair.data import Sentence\n", + " from flair.models import SequenceTagger\n", + "\n", + " # load tagger\n", + " tagger = SequenceTagger.load(\"flair/ner-english-large\")\n", + "\n", + " # make example sentence\n", + " sentence = Sentence(\"My name is Karishma Shirsath. I live in Toronto Canada.\")\n", + "\n", + " # predict NER tags\n", + " tagger.predict(sentence)\n", + "\n", + " # print sentence\n", + " print(sentence)\n", + "\n", + " # print predicted NER spans\n", + " print(\"The following NER tags are found:\")\n", + " # iterate over entities and print\n", + " for entity in sentence.get_spans(\"ner\"):\n", + " print(entity)\n", + "\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "LT92Kk44lgAV", + "outputId": "0fc28bdc-4a3a-4e68-8617-27cdcedbc3ce" + }, + "execution_count": 21, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "2024-03-16 05:24:49,993 SequenceTagger predicts: Dictionary with 20 tags: , O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, , \n", + "Sentence[12]: \"My name is Karishma Shirsath. I live in Toronto Canada.\" → [\"Karishma Shirsath\"/PER, \"Toronto\"/LOC, \"Canada\"/LOC]\n", + "The following NER tags are found:\n", + "Span[3:5]: \"Karishma Shirsath\" → PER (1.0)\n", + "Span[9:10]: \"Toronto\" → LOC (1.0)\n", + "Span[10:11]: \"Canada\" → LOC (1.0)\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "if __name__ == \"__main__\":\n", + " from flair.data import Sentence\n", + " from flair.models import SequenceTagger\n", + "\n", + " # load tagger\n", + " tagger = SequenceTagger.load(\"flair/ner-english-large\")\n", + "\n", + " # make example sentence\n", + " sentence = Sentence(\"My name is Karishma Shirsath. I live in Toronto Canada.\")\n", + "\n", + " # predict NER tags\n", + " tagger.predict(sentence)\n", + "\n", + " # print sentence\n", + " print(sentence)\n", + "\n", + " # Anonymize identified named entities\n", + " anonymized_sentence = str(sentence)\n", + " for entity in sentence.get_spans(\"ner\"):\n", + " entity_text = entity.text\n", + " anonymized_text = \"*\" * len(entity_text)\n", + " anonymized_sentence = anonymized_sentence.replace(entity_text, anonymized_text)\n", + "\n", + " # print anonymized sentence\n", + " print(\"Anonymized sentence:\")\n", + " print(anonymized_sentence)\n", + "\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "lgYJJVilwbVF", + "outputId": "20e52cfd-0e6e-4906-bcb0-3c403160293d" + }, + "execution_count": 33, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "2024-03-16 05:39:00,757 SequenceTagger predicts: Dictionary with 20 tags: , O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, , \n", + "Sentence[12]: \"My name is Karishma Shirsath. I live in Toronto Canada.\" → [\"Karishma Shirsath\"/PER, \"Toronto\"/LOC, \"Canada\"/LOC]\n", + "Anonymized sentence:\n", + "Sentence[12]: \"My name is *****************. I live in ******* ******.\" → [\"*****************\"/PER, \"*******\"/LOC, \"******\"/LOC]\n" + ] + } + ] + } + ] +} \ No newline at end of file