diff --git "a/Final file.ipynb" "b/Final file.ipynb" new file mode 100644--- /dev/null +++ "b/Final file.ipynb" @@ -0,0 +1,1452 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "gczeIWL7Yqml", + "outputId": "353b3804-fb2b-4ead-d190-69cc7ef11ea6" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Collecting datasets\n", + " Downloading datasets-2.18.0-py3-none-any.whl (510 kB)\n", + "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/510.5 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[91m━━━━\u001b[0m\u001b[91m╸\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m61.4/510.5 kB\u001b[0m \u001b[31m1.7 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[90m╺\u001b[0m \u001b[32m501.8/510.5 kB\u001b[0m \u001b[31m7.3 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m510.5/510.5 kB\u001b[0m \u001b[31m5.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from datasets) (3.13.1)\n", + "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from datasets) (1.25.2)\n", + "Requirement already satisfied: pyarrow>=12.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (14.0.2)\n", + "Requirement already satisfied: pyarrow-hotfix in /usr/local/lib/python3.10/dist-packages (from datasets) (0.6)\n", + "Collecting dill<0.3.9,>=0.3.0 (from datasets)\n", + " Downloading 
dill-0.3.8-py3-none-any.whl (116 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m7.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets) (1.5.3)\n", + "Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (2.31.0)\n", + "Requirement already satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.10/dist-packages (from datasets) (4.66.2)\n", + "Collecting xxhash (from datasets)\n", + " Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m7.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting multiprocess (from datasets)\n", + " Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m4.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: fsspec[http]<=2024.2.0,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (2023.6.0)\n", + "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets) (3.9.3)\n", + "Requirement already satisfied: huggingface-hub>=0.19.4 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.20.3)\n", + "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from datasets) (24.0)\n", + "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from datasets) (6.0.1)\n", + "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.3.1)\n", + "Requirement already satisfied: attrs>=17.3.0 in 
/usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (23.2.0)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.4.1)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (6.0.5)\n", + "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.9.4)\n", + "Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (4.0.3)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.19.4->datasets) (4.10.0)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets) (3.3.2)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets) (3.6)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets) (2.0.7)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets) (2024.2.2)\n", + "Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2023.4)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.1->pandas->datasets) (1.16.0)\n", + "Installing collected packages: xxhash, dill, multiprocess, datasets\n", + "Successfully installed datasets-2.18.0 dill-0.3.8 multiprocess-0.70.16 xxhash-3.4.1\n", + "--2024-03-16 03:52:00-- 
https://raw.githubusercontent.com/sighsmile/conlleval/master/conlleval.py\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 7502 (7.3K) [text/plain]\n", + "Saving to: ‘conlleval.py’\n", + "\n", + "conlleval.py 100%[===================>] 7.33K --.-KB/s in 0s \n", + "\n", + "2024-03-16 03:52:00 (96.5 MB/s) - ‘conlleval.py’ saved [7502/7502]\n", + "\n" + ] + } + ], + "source": [ + "!pip3 install datasets\n", + "!wget https://raw.githubusercontent.com/sighsmile/conlleval/master/conlleval.py\n" + ] + }, + { + "cell_type": "code", + "source": [ + "!pip install presidio-analyzer" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "x5eLSkVqlhh6", + "outputId": "9cf46693-5e60-425d-8693-22a5df24fea0" + }, + "execution_count": 36, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Collecting presidio-analyzer\n", + " Downloading presidio_analyzer-2.2.353-py3-none-any.whl (85 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m85.7/85.7 kB\u001b[0m \u001b[31m2.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: spacy<4.0.0,>=3.4.4 in /usr/local/lib/python3.10/dist-packages (from presidio-analyzer) (3.7.4)\n", + "Requirement already satisfied: regex in /usr/local/lib/python3.10/dist-packages (from presidio-analyzer) (2023.12.25)\n", + "Collecting tldextract (from presidio-analyzer)\n", + " Downloading tldextract-5.1.1-py3-none-any.whl (97 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m97.7/97.7 kB\u001b[0m \u001b[31m9.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: pyyaml in 
/usr/local/lib/python3.10/dist-packages (from presidio-analyzer) (6.0.1)\n", + "Collecting phonenumbers<9.0.0,>=8.12 (from presidio-analyzer)\n", + " Downloading phonenumbers-8.13.32-py2.py3-none-any.whl (2.6 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.6/2.6 MB\u001b[0m \u001b[31m39.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.4.4->presidio-analyzer) (3.0.12)\n", + "Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.4.4->presidio-analyzer) (1.0.5)\n", + "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.4.4->presidio-analyzer) (1.0.10)\n", + "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.4.4->presidio-analyzer) (2.0.8)\n", + "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.4.4->presidio-analyzer) (3.0.9)\n", + "Requirement already satisfied: thinc<8.3.0,>=8.2.2 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.4.4->presidio-analyzer) (8.2.3)\n", + "Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.4.4->presidio-analyzer) (1.1.2)\n", + "Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.4.4->presidio-analyzer) (2.4.8)\n", + "Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.4.4->presidio-analyzer) (2.0.10)\n", + "Requirement already satisfied: weasel<0.4.0,>=0.1.0 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.4.4->presidio-analyzer) (0.3.4)\n", + "Requirement 
already satisfied: typer<0.10.0,>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.4.4->presidio-analyzer) (0.9.0)\n", + "Requirement already satisfied: smart-open<7.0.0,>=5.2.1 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.4.4->presidio-analyzer) (6.4.0)\n", + "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.4.4->presidio-analyzer) (4.66.2)\n", + "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.4.4->presidio-analyzer) (2.31.0)\n", + "Requirement already satisfied: pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.4.4->presidio-analyzer) (2.6.4)\n", + "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.4.4->presidio-analyzer) (3.1.3)\n", + "Requirement already satisfied: setuptools in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.4.4->presidio-analyzer) (67.7.2)\n", + "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.4.4->presidio-analyzer) (24.0)\n", + "Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.4.4->presidio-analyzer) (3.3.0)\n", + "Requirement already satisfied: numpy>=1.19.0 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.4.4->presidio-analyzer) (1.25.2)\n", + "Requirement already satisfied: idna in /usr/local/lib/python3.10/dist-packages (from tldextract->presidio-analyzer) (3.6)\n", + "Collecting requests-file>=1.4 (from tldextract->presidio-analyzer)\n", + " Downloading requests_file-2.0.0-py2.py3-none-any.whl (4.2 kB)\n", + "Requirement already satisfied: filelock>=3.0.8 in /usr/local/lib/python3.10/dist-packages (from tldextract->presidio-analyzer) (3.13.1)\n", + "Requirement already satisfied: 
annotated-types>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<4.0.0,>=3.4.4->presidio-analyzer) (0.6.0)\n", + "Requirement already satisfied: pydantic-core==2.16.3 in /usr/local/lib/python3.10/dist-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<4.0.0,>=3.4.4->presidio-analyzer) (2.16.3)\n", + "Requirement already satisfied: typing-extensions>=4.6.1 in /usr/local/lib/python3.10/dist-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<4.0.0,>=3.4.4->presidio-analyzer) (4.10.0)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests<3.0.0,>=2.13.0->spacy<4.0.0,>=3.4.4->presidio-analyzer) (3.3.2)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3.0.0,>=2.13.0->spacy<4.0.0,>=3.4.4->presidio-analyzer) (2.0.7)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests<3.0.0,>=2.13.0->spacy<4.0.0,>=3.4.4->presidio-analyzer) (2024.2.2)\n", + "Requirement already satisfied: blis<0.8.0,>=0.7.8 in /usr/local/lib/python3.10/dist-packages (from thinc<8.3.0,>=8.2.2->spacy<4.0.0,>=3.4.4->presidio-analyzer) (0.7.11)\n", + "Requirement already satisfied: confection<1.0.0,>=0.0.1 in /usr/local/lib/python3.10/dist-packages (from thinc<8.3.0,>=8.2.2->spacy<4.0.0,>=3.4.4->presidio-analyzer) (0.1.4)\n", + "Requirement already satisfied: click<9.0.0,>=7.1.1 in /usr/local/lib/python3.10/dist-packages (from typer<0.10.0,>=0.3.0->spacy<4.0.0,>=3.4.4->presidio-analyzer) (8.1.7)\n", + "Requirement already satisfied: cloudpathlib<0.17.0,>=0.7.0 in /usr/local/lib/python3.10/dist-packages (from weasel<0.4.0,>=0.1.0->spacy<4.0.0,>=3.4.4->presidio-analyzer) (0.16.0)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->spacy<4.0.0,>=3.4.4->presidio-analyzer) (2.1.5)\n", + "Installing 
collected packages: phonenumbers, requests-file, tldextract, presidio-analyzer\n", + "Successfully installed phonenumbers-8.13.32 presidio-analyzer-2.2.353 requests-file-2.0.0 tldextract-5.1.1\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "!pip install flair" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "XWdmM-gGmHV-", + "outputId": "42e0e840-89df-4a99-a3d7-8d78fc7beff0" + }, + "execution_count": 38, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Collecting flair\n", + " Downloading flair-0.13.1-py3-none-any.whl (388 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m388.3/388.3 kB\u001b[0m \u001b[31m3.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting boto3>=1.20.27 (from flair)\n", + " Downloading boto3-1.34.64-py3-none-any.whl (139 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m139.3/139.3 kB\u001b[0m \u001b[31m10.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting bpemb>=0.3.2 (from flair)\n", + " Downloading bpemb-0.3.4-py3-none-any.whl (19 kB)\n", + "Collecting conllu>=4.0 (from flair)\n", + " Downloading conllu-4.5.3-py2.py3-none-any.whl (16 kB)\n", + "Collecting deprecated>=1.2.13 (from flair)\n", + " Downloading Deprecated-1.2.14-py2.py3-none-any.whl (9.6 kB)\n", + "Collecting ftfy>=6.1.0 (from flair)\n", + " Downloading ftfy-6.2.0-py3-none-any.whl (54 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m54.4/54.4 kB\u001b[0m \u001b[31m9.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: gdown>=4.4.0 in /usr/local/lib/python3.10/dist-packages (from flair) (4.7.3)\n", + "Requirement already satisfied: gensim>=4.2.0 in /usr/local/lib/python3.10/dist-packages (from flair) (4.3.2)\n", + "Requirement already satisfied: 
huggingface-hub>=0.10.0 in /usr/local/lib/python3.10/dist-packages (from flair) (0.20.3)\n", + "Collecting janome>=0.4.2 (from flair)\n", + " Downloading Janome-0.5.0-py2.py3-none-any.whl (19.7 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m19.7/19.7 MB\u001b[0m \u001b[31m46.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting langdetect>=1.0.9 (from flair)\n", + " Downloading langdetect-1.0.9.tar.gz (981 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m981.5/981.5 kB\u001b[0m \u001b[31m67.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "Requirement already satisfied: lxml>=4.8.0 in /usr/local/lib/python3.10/dist-packages (from flair) (4.9.4)\n", + "Requirement already satisfied: matplotlib>=2.2.3 in /usr/local/lib/python3.10/dist-packages (from flair) (3.7.1)\n", + "Requirement already satisfied: more-itertools>=8.13.0 in /usr/local/lib/python3.10/dist-packages (from flair) (10.1.0)\n", + "Collecting mpld3>=0.3 (from flair)\n", + " Downloading mpld3-0.5.10-py3-none-any.whl (202 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m202.6/202.6 kB\u001b[0m \u001b[31m23.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting pptree>=3.1 (from flair)\n", + " Downloading pptree-3.1.tar.gz (3.0 kB)\n", + " Preparing metadata (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from flair) (2.8.2)\n", + "Collecting pytorch-revgrad>=0.2.0 (from flair)\n", + " Downloading pytorch_revgrad-0.2.0-py3-none-any.whl (4.6 kB)\n", + "Requirement already satisfied: regex>=2022.1.18 in /usr/local/lib/python3.10/dist-packages (from flair) (2023.12.25)\n", + "Requirement already satisfied: scikit-learn>=1.0.2 in /usr/local/lib/python3.10/dist-packages (from flair) (1.2.2)\n", + "Collecting segtok>=1.5.11 (from flair)\n", + " Downloading segtok-1.5.11-py3-none-any.whl (24 kB)\n", + "Collecting sqlitedict>=2.0.0 (from flair)\n", + " Downloading sqlitedict-2.1.0.tar.gz (21 kB)\n", + " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "Requirement already satisfied: tabulate>=0.8.10 in /usr/local/lib/python3.10/dist-packages (from flair) (0.9.0)\n", + "Requirement already satisfied: torch!=1.8,>=1.5.0 in /usr/local/lib/python3.10/dist-packages (from flair) (2.2.1+cu121)\n", + "Requirement already satisfied: tqdm>=4.63.0 in /usr/local/lib/python3.10/dist-packages (from flair) (4.66.2)\n", + "Collecting transformer-smaller-training-vocab>=0.2.3 (from flair)\n", + " Downloading transformer_smaller_training_vocab-0.3.3-py3-none-any.whl (14 kB)\n", + "Requirement already satisfied: transformers[sentencepiece]<5.0.0,>=4.18.0 in /usr/local/lib/python3.10/dist-packages (from flair) (4.38.2)\n", + "Collecting urllib3<2.0.0,>=1.0.0 (from flair)\n", + " Downloading urllib3-1.26.18-py2.py3-none-any.whl (143 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m143.8/143.8 kB\u001b[0m \u001b[31m23.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting wikipedia-api>=0.5.7 (from flair)\n", + " Downloading Wikipedia_API-0.6.0-py3-none-any.whl (14 kB)\n", + "Collecting semver<4.0.0,>=3.0.0 (from flair)\n", + " Downloading semver-3.0.2-py3-none-any.whl (17 kB)\n", + 
"Collecting botocore<1.35.0,>=1.34.64 (from boto3>=1.20.27->flair)\n", + " Downloading botocore-1.34.64-py3-none-any.whl (12.0 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.0/12.0 MB\u001b[0m \u001b[31m68.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting jmespath<2.0.0,>=0.7.1 (from boto3>=1.20.27->flair)\n", + " Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)\n", + "Collecting s3transfer<0.11.0,>=0.10.0 (from boto3>=1.20.27->flair)\n", + " Downloading s3transfer-0.10.1-py3-none-any.whl (82 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m82.2/82.2 kB\u001b[0m \u001b[31m13.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from bpemb>=0.3.2->flair) (1.25.2)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from bpemb>=0.3.2->flair) (2.31.0)\n", + "Requirement already satisfied: sentencepiece in /usr/local/lib/python3.10/dist-packages (from bpemb>=0.3.2->flair) (0.1.99)\n", + "Requirement already satisfied: wrapt<2,>=1.10 in /usr/local/lib/python3.10/dist-packages (from deprecated>=1.2.13->flair) (1.14.1)\n", + "Requirement already satisfied: wcwidth<0.3.0,>=0.2.12 in /usr/local/lib/python3.10/dist-packages (from ftfy>=6.1.0->flair) (0.2.13)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from gdown>=4.4.0->flair) (3.13.1)\n", + "Requirement already satisfied: six in /usr/local/lib/python3.10/dist-packages (from gdown>=4.4.0->flair) (1.16.0)\n", + "Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.10/dist-packages (from gdown>=4.4.0->flair) (4.12.3)\n", + "Requirement already satisfied: scipy>=1.7.0 in /usr/local/lib/python3.10/dist-packages (from gensim>=4.2.0->flair) (1.11.4)\n", + "Requirement already satisfied: smart-open>=1.8.1 in 
/usr/local/lib/python3.10/dist-packages (from gensim>=4.2.0->flair) (6.4.0)\n", + "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.10.0->flair) (2023.6.0)\n", + "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.10.0->flair) (6.0.1)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.10.0->flair) (4.10.0)\n", + "Requirement already satisfied: packaging>=20.9 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.10.0->flair) (24.0)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=2.2.3->flair) (1.2.0)\n", + "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=2.2.3->flair) (0.12.1)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=2.2.3->flair) (4.49.0)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=2.2.3->flair) (1.4.5)\n", + "Requirement already satisfied: pillow>=6.2.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=2.2.3->flair) (9.4.0)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=2.2.3->flair) (3.1.2)\n", + "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from mpld3>=0.3->flair) (3.1.3)\n", + "Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=1.0.2->flair) (1.3.2)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=1.0.2->flair) (3.3.0)\n", + "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from 
torch!=1.8,>=1.5.0->flair) (1.12)\n", + "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch!=1.8,>=1.5.0->flair) (3.2.1)\n", + "Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch!=1.8,>=1.5.0->flair)\n", + " Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m23.7/23.7 MB\u001b[0m \u001b[31m25.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch!=1.8,>=1.5.0->flair)\n", + " Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m823.6/823.6 kB\u001b[0m \u001b[31m45.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting nvidia-cuda-cupti-cu12==12.1.105 (from torch!=1.8,>=1.5.0->flair)\n", + " Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m14.1/14.1 MB\u001b[0m \u001b[31m72.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting nvidia-cudnn-cu12==8.9.2.26 (from torch!=1.8,>=1.5.0->flair)\n", + " Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m731.7/731.7 MB\u001b[0m \u001b[31m906.7 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting nvidia-cublas-cu12==12.1.3.1 (from torch!=1.8,>=1.5.0->flair)\n", + " Downloading nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m410.6/410.6 MB\u001b[0m \u001b[31m3.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting nvidia-cufft-cu12==11.0.2.54 (from 
torch!=1.8,>=1.5.0->flair)\n", + " Downloading nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m121.6/121.6 MB\u001b[0m \u001b[31m8.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting nvidia-curand-cu12==10.3.2.106 (from torch!=1.8,>=1.5.0->flair)\n", + " Downloading nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━��━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m56.5/56.5 MB\u001b[0m \u001b[31m10.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting nvidia-cusolver-cu12==11.4.5.107 (from torch!=1.8,>=1.5.0->flair)\n", + " Downloading nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl (124.2 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m124.2/124.2 MB\u001b[0m \u001b[31m8.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting nvidia-cusparse-cu12==12.1.0.106 (from torch!=1.8,>=1.5.0->flair)\n", + " Downloading nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl (196.0 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m196.0/196.0 MB\u001b[0m \u001b[31m2.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting nvidia-nccl-cu12==2.19.3 (from torch!=1.8,>=1.5.0->flair)\n", + " Downloading nvidia_nccl_cu12-2.19.3-py3-none-manylinux1_x86_64.whl (166.0 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m166.0/166.0 MB\u001b[0m \u001b[31m2.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting nvidia-nvtx-cu12==12.1.105 (from torch!=1.8,>=1.5.0->flair)\n", + " Downloading nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (99 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m99.1/99.1 kB\u001b[0m \u001b[31m15.4 
MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: triton==2.2.0 in /usr/local/lib/python3.10/dist-packages (from torch!=1.8,>=1.5.0->flair) (2.2.0)\n", + "Collecting nvidia-nvjitlink-cu12 (from nvidia-cusolver-cu12==11.4.5.107->torch!=1.8,>=1.5.0->flair)\n", + " Downloading nvidia_nvjitlink_cu12-12.4.99-py3-none-manylinux2014_x86_64.whl (21.1 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m21.1/21.1 MB\u001b[0m \u001b[31m70.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: tokenizers<0.19,>=0.14 in /usr/local/lib/python3.10/dist-packages (from transformers[sentencepiece]<5.0.0,>=4.18.0->flair) (0.15.2)\n", + "Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.10/dist-packages (from transformers[sentencepiece]<5.0.0,>=4.18.0->flair) (0.4.2)\n", + "Requirement already satisfied: protobuf in /usr/local/lib/python3.10/dist-packages (from transformers[sentencepiece]<5.0.0,>=4.18.0->flair) (3.20.3)\n", + "Collecting accelerate>=0.21.0 (from transformers[sentencepiece]<5.0.0,>=4.18.0->flair)\n", + " Downloading accelerate-0.28.0-py3-none-any.whl (290 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m290.1/290.1 kB\u001b[0m \u001b[31m32.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.10/dist-packages (from beautifulsoup4->gdown>=4.4.0->flair) (2.5)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->mpld3>=0.3->flair) (2.1.5)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->bpemb>=0.3.2->flair) (3.3.2)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->bpemb>=0.3.2->flair) (3.6)\n", + "Requirement 
already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->bpemb>=0.3.2->flair) (2024.2.2)\n", + "Requirement already satisfied: PySocks!=1.5.7,>=1.5.6 in /usr/local/lib/python3.10/dist-packages (from requests->bpemb>=0.3.2->flair) (1.7.1)\n", + "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch!=1.8,>=1.5.0->flair) (1.3.0)\n", + "Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->transformers[sentencepiece]<5.0.0,>=4.18.0->flair) (5.9.5)\n", + "Building wheels for collected packages: langdetect, pptree, sqlitedict\n", + " Building wheel for langdetect (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993227 sha256=9da87eaaff56d6d1421c337ae61089e29874a2de99038123b209c2cf6ffe4791\n", + " Stored in directory: /root/.cache/pip/wheels/95/03/7d/59ea870c70ce4e5a370638b5462a7711ab78fba2f655d05106\n", + " Building wheel for pptree (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for pptree: filename=pptree-3.1-py3-none-any.whl size=4609 sha256=be019012224ff0981466d5ef57193c243fbbc1542c10b46f3ed8f17e84f74b0e\n", + " Stored in directory: /root/.cache/pip/wheels/9f/b6/0e/6f26eb9e6eb53ff2107a7888d72b5a6a597593956113037828\n", + " Building wheel for sqlitedict (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + " Created wheel for sqlitedict: filename=sqlitedict-2.1.0-py3-none-any.whl size=16862 sha256=056a323511a15e5bdbb990ad53b061cdb301623fdbf4a77ead3f71402b27bf97\n", + " Stored in directory: /root/.cache/pip/wheels/79/d6/e7/304e0e6cb2221022c26d8161f7c23cd4f259a9e41e8bbcfabd\n", + "Successfully built langdetect pptree sqlitedict\n", + "Installing collected packages: sqlitedict, pptree, janome, urllib3, semver, segtok, nvidia-nvtx-cu12, nvidia-nvjitlink-cu12, nvidia-nccl-cu12, nvidia-curand-cu12, nvidia-cufft-cu12, nvidia-cuda-runtime-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-cupti-cu12, nvidia-cublas-cu12, langdetect, jmespath, ftfy, deprecated, conllu, nvidia-cusparse-cu12, nvidia-cudnn-cu12, botocore, wikipedia-api, s3transfer, nvidia-cusolver-cu12, mpld3, bpemb, boto3, pytorch-revgrad, accelerate, transformer-smaller-training-vocab, flair\n", + " Attempting uninstall: urllib3\n", + " Found existing installation: urllib3 2.0.7\n", + " Uninstalling urllib3-2.0.7:\n", + " Successfully uninstalled urllib3-2.0.7\n", + "Successfully installed accelerate-0.28.0 boto3-1.34.64 botocore-1.34.64 bpemb-0.3.4 conllu-4.5.3 deprecated-1.2.14 flair-0.13.1 ftfy-6.2.0 janome-0.5.0 jmespath-1.0.1 langdetect-1.0.9 mpld3-0.5.10 nvidia-cublas-cu12-12.1.3.1 nvidia-cuda-cupti-cu12-12.1.105 nvidia-cuda-nvrtc-cu12-12.1.105 nvidia-cuda-runtime-cu12-12.1.105 nvidia-cudnn-cu12-8.9.2.26 nvidia-cufft-cu12-11.0.2.54 nvidia-curand-cu12-10.3.2.106 nvidia-cusolver-cu12-11.4.5.107 nvidia-cusparse-cu12-12.1.0.106 nvidia-nccl-cu12-2.19.3 nvidia-nvjitlink-cu12-12.4.99 nvidia-nvtx-cu12-12.1.105 pptree-3.1 pytorch-revgrad-0.2.0 s3transfer-0.10.1 segtok-1.5.11 semver-3.0.2 sqlitedict-2.1.0 transformer-smaller-training-vocab-0.3.3 urllib3-1.26.18 wikipedia-api-0.6.0\n" + ] + }, + { + "output_type": "display_data", + "data": { + "application/vnd.colab-display-data+json": { + "pip_warning": { + "packages": [ + "urllib3" + ] + }, + "id": "dfdc4a89fa71429587bc109f13908415" + } + 
# %% Imports and backend configuration
import os

# Keep this assignment above `import keras`; the original cell sets the backend
# variable before the import as well.
os.environ["KERAS_BACKEND"] = "tensorflow"

import keras
import numpy as np
import tensorflow as tf
from keras import layers
from datasets import load_dataset
from collections import Counter
from conlleval import evaluate

import pandas as pd
from google.colab import files
import matplotlib.pyplot as plt

from transformers import AutoModel, AutoTokenizer

import logging
from typing import Optional, List, Tuple, Set
from presidio_analyzer import (
    RecognizerResult,
    EntityRecognizer,
    AnalysisExplanation,
)
from presidio_analyzer.nlp_engine import NlpArtifacts

from flair.data import Sentence
from flair.models import SequenceTagger


# %% Transformer encoder block
class TransformerBlock(layers.Layer):
    """Self-attention followed by a two-layer feed-forward network.

    Each of the two sub-layers is followed by dropout, a residual addition and
    a layer normalization.

    Args:
        embed_dim: Used as the per-head ``key_dim`` of the attention layer and
            as the output width of the feed-forward network. Presumably equal
            to the last axis of the input so the residual additions line up;
            confirm against the caller.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden (ReLU) layer of the feed-forward network.
        rate: Dropout rate used by both dropout layers.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        # Sub-layers are created in the same order as before so that
        # auto-generated layer names and weight ordering do not change.
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [
                layers.Dense(ff_dim, activation="relu"),
                layers.Dense(embed_dim),
            ]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=False):
        """Run the block on ``inputs``.

        Args:
            inputs: Tensor used as both query and value of the attention layer.
            training: Forwarded to the attention layer and both dropout layers.

        Returns:
            Output of the second layer normalization.
        """
        # `training` is passed explicitly instead of relying on implicit
        # propagation from the enclosing layer.
        attended = self.att(inputs, inputs, training=training)
        attended = self.dropout1(attended, training=training)
        normed_attention = self.layernorm1(inputs + attended)
        transformed = self.ffn(normed_attention)
        transformed = self.dropout2(transformed, training=training)
        return self.layernorm2(normed_attention + transformed)


# %% Token + position embedding
class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        maxlen: Number of rows in the position-embedding table.
        vocab_size: Number of rows in the token-embedding table.
        embed_dim: Output width of both embedding tables.
    """

    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, inputs):
        """Embed ``inputs`` and add the embedding of each position index.

        Position indices run from 0 up to the size of the last axis of
        ``inputs``.
        NOTE(review): nothing here guards against that size exceeding
        ``maxlen`` -- confirm inputs are padded/truncated upstream.
        """
        seq_len = tf.shape(inputs)[-1]
        position_ids = tf.range(start=0, limit=seq_len, delta=1)
        position_embeddings = self.pos_emb(position_ids)
        token_embeddings = self.token_emb(inputs)
        return token_embeddings + position_embeddings


# %% NER model
class NERModel(keras.Model):
    """Token tagger: embeddings -> one Transformer block -> dense softmax head.

    Args:
        num_tags: Number of output classes produced for every position.
        vocab_size: Passed to ``TokenAndPositionEmbedding``.
        maxlen: Passed to ``TokenAndPositionEmbedding``.
        embed_dim: Embedding width, shared with the Transformer block.
        num_heads: Number of attention heads in the Transformer block.
        ff_dim: Hidden width of the Transformer feed-forward network and of
            the dense layer that precedes the softmax output.
    """

    def __init__(
        self, num_tags, vocab_size, maxlen=128, embed_dim=32, num_heads=2, ff_dim=32
    ):
        super().__init__()
        self.embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
        self.transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
        self.dropout1 = layers.Dropout(0.1)
        self.ff = layers.Dense(ff_dim, activation="relu")
        self.dropout2 = layers.Dropout(0.1)
        self.ff_final = layers.Dense(num_tags, activation="softmax")

    def call(self, inputs, training=False):
        """Return softmax scores over ``num_tags`` for every input position.

        Args:
            inputs: Token ids (presumably shaped (batch, seq_len) -- confirm
                against the data pipeline).
            training: Forwarded to the Transformer block and dropout layers.
        """
        embedded = self.embedding_layer(inputs)
        encoded = self.transformer_block(embedded, training=training)
        encoded = self.dropout1(encoded, training=training)
        hidden = self.ff(encoded)
        hidden = self.dropout2(hidden, training=training)
        return self.ff_final(hidden)


# %% Load CoNLL-2003
conll_data = load_dataset("conll2003")


# %% Flatten every split into one DataFrame
def dataset_to_dataframe(dataset):
    """Convert one dataset split into a pandas DataFrame.

    Args:
        dataset: Object whose ``features`` iterates over column names and that
            returns a column's values via ``dataset[name]``.

    Returns:
        DataFrame with one column per entry of ``dataset.features``.
    """
    columns = {name: dataset[name] for name in dataset.features}
    return pd.DataFrame(columns)


# Combine all splits (train, validation, test) into a single DataFrame.
# ignore_index=True renumbers the rows; without it every split contributes its
# own 0..n-1 index and the combined frame carries duplicate row labels.
conll_df = pd.concat(
    [dataset_to_dataframe(conll_data[split]) for split in conll_data.keys()],
    ignore_index=True,
)

# %% Export to CSV and download
csv_file_path = "conll_data.csv"
conll_df.to_csv(csv_file_path, index=False)

# Download the CSV file to local machine
files.download(csv_file_path)
# %% First rows of the combined frame
print(conll_df.head())

# %% Summary statistics
print(conll_df.describe())

# %% Column dtypes
print(conll_df.dtypes)

# %% Missing values per column
print(conll_df.isnull().sum())

# %% NER tag frequencies
# Each row of `ner_tags` holds the tag ids of one whole sentence. Counting the
# raw column ranks sentence-level tag *sequences* (e.g. "[5, 0]"), which is not
# what the chart below is labelled as, so the column is exploded to one tag id
# per row before counting.
label_counts = conll_df["ner_tags"].explode().value_counts()
print(label_counts)

# %% Bar chart of the most frequent tags
top_10_labels = label_counts.head(10)

# Plot the distribution of the top 10 NER tags
plt.figure(figsize=(10, 6))
top_10_labels.plot(kind='bar')
plt.title('Top 10 Most Common NER Tags')
plt.xlabel('NER Tag')
plt.ylabel('Count')
plt.show()
" + ], + "image/png": "iVBORw0KGgoAAAANSUhEUgAAA1sAAAK1CAYAAAA+BfRDAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAABf+0lEQVR4nO3deVxV1f7/8fcBlUEFnJjKiXIih0xTsa6RcsW0rna7eS3ralmUqTmmeW/OlWmaZVkOZdo3Lctvg0OChpqWKESaRmqDlKaBclEwSRnO/v3Rz/P1BBTnyOYMvJ6PB4+HZ6/FOZ+zXBvOm7332hbDMAwBAAAAACqVj6sLAAAAAABvRNgCAAAAABMQtgAAAADABIQtAAAAADABYQsAAAAATEDYAgAAAAATELYAAAAAwASELQAAAAAwAWELAAAAAExA2AIAAAAAExC2AKACLBZLhb62b99uei2vvPKK7rzzTjVp0kQWi0VDhw4tt++ZM2eUkJCgRo0aqXbt2rr55pv1xRdfVOh1YmNjZbFY1KJFizLbt2zZYnvfa9eudeat/KmPPvpI06dPd/j73n//fd1yyy1q2LChatWqpcjISA0cOFBbt26t/CK9yMX/89tuu61U2w8//CCLxaJ58+bZtm3fvv0P94e3337b1rdZs2Z2bbVr11aXLl30xhtv/GldQ4cOrdD+90f7AgC4Qg1XFwAAnuB//ud/7B6/8cYb2rJlS6ntbdq0Mb2WOXPm6OzZs+rSpYt+/vnncvtZrVb169dPX375pR577DE1bNhQL7/8smJjY5Wenl5uiLqUv7+/vvvuO6WmpqpLly52batWrZK/v7/Onz9/2e+pPB999JEWLVpU4cBlGIbuv/9+rVixQh07dtS4ceMUHh6un3/+We+//7569eqlzz77TN27dzetZm+wYcMGpaenq1OnThXq/+ijj+r6668vtT0mJsbu8bXXXqvx48dLkn7++We9+uqrGjJkiC5cuKAHH3yw3Od/6KGHFBcXZ3ucmZmpqVOnKiEhQX/5y19s26+66qoK1QsAVcYAADhsxIgRhqt+hP7www+G1Wo1DMMwateubQwZMqTMfmvWrDEkGe+++65t28mTJ42QkBDjrrvu+tPXuemmm4xrrrnGaNWqlTFmzBi7tl9//dUICgoy7rjjjlKvUZkcHednn33WkGSMGTPGNkaXeuONN4w9e/ZUZole5aabbjKaNGli1KtXz7jtttvs2jIzMw1JxrPPPmvbtm3btgr//zdt2tTo16+f3baTJ08aderUMdq0aeNQnWlpaYYk4/XXX3fo+wCgqnEaIQBUknPnzmn8+PFq3Lix/Pz81KpVK82bN0+GYdj1s1gsGjlypFatWqVWrVrJ399fnTp10o4dOyr0Ok2bNpXFYvnTfmvXrlVYWJj+/ve/27Y1atRIAwcO1IcffqgLFy5U6PXuuusurVmzRlar1bZt/fr1Kigo0MCBA8v8nr179+qWW25RUFCQ6tSpo169emn37t12fYqKijRjxgy1aNFC/v7+atCggW688UZt2bJF0m+nji1atEiS/Wmc5fn11181e/ZstW7dWvPmzSuz77333mt3hO7IkSO68847Vb9+fQUGBqpbt27auHGj3fdcPFXunXfe0YwZM3TFFVeobt26+sc//qG8vDxduHBBY8aMUWhoqOrUqaP77ruv1Nhe/D9/9913FR0drYCAAMXExOjAgQOSpCVLlujqq6+Wv7+/YmNj9cMPP5Sq/d1331WnTp0UEBCghg0b6p577tHx48ft+gwdOlR16tTR8ePHNWDAANWpU0eNGjXShAkTVFJSUu7YXapu3boaO3as1q9fX+FTTp3VqFEjtW7dWt9///1lP9f+/fs1dOhQRUVFyd/fX+Hh4br//vv13//+t1Tf7du3q3PnzvL399dVV12lJUuWaPr06aXmzJYtW3TjjTcqJCREderUUatWrfTvf//7smsFU
H1wGiEAVALDMPS3v/1N27Zt07Bhw3TttdcqKSlJjz32mI4fP64FCxbY9f/kk0+0Zs0aPfroo/Lz89PLL7+sPn36KDU1VW3btq2Umvbu3avrrrtOPj72f1fr0qWLli5dqm+++Ubt2rX70+e5++67NX36dG3fvl09e/aUJK1evVq9evVSaGhoqf4ZGRn6y1/+oqCgIE2cOFE1a9bUkiVLFBsbq08++URdu3aVJE2fPl2zZ8/WAw88oC5duig/P1+ff/65vvjiC/31r3/VQw89pBMnTpR5umZZPv30U+Xm5mrMmDHy9fX90/7Z2dnq3r27CgoK9Oijj6pBgwZauXKl/va3v2nt2rW6/fbb7frPnj1bAQEBevzxx/Xdd9/pxRdfVM2aNeXj46PTp09r+vTp2r17t1asWKHmzZtr6tSpdt+/c+dOrVu3TiNGjLA936233qqJEyfq5Zdf1iOPPKLTp09r7ty5uv/+++2uL1uxYoXuu+8+XX/99Zo9e7ays7P1wgsv6LPPPtPevXsVEhJi61tSUqL4+Hh17dpV8+bN08cff6z58+frqquu0vDhw/90XCRp9OjRWrBggaZPn65169b9af+zZ88qJyen1PYGDRr8YUAuLi7WTz/9pHr16lWorj+yZcsWHTlyRPfdd5/Cw8OVkZGhpUuXKiMjQ7t377bVsXfvXvXp00cRERGaMWOGSkpKNHPmTDVq1Mju+TIyMnTrrbeqffv2mjlzpvz8/PTdd9/ps88+u+xaAVQjrj60BgCe6Pent33wwQeGJOPJJ5+06/ePf/zDsFgsxnfffWfbJsmQZHz++ee2bT/++KPh7+9v3H777Q7V8UenEdauXdu4//77S23fuHGjIclITEz8w+e+eBqhYRhG586djWHDhhmGYRinT582atWqZaxcubLM08gGDBhg1KpVy/j+++9t206cOGHUrVvX6NGjh21bhw4dSp1W9nuOnEb4wgsvGJKM999/v0L9x4wZY0gydu7cadt29uxZo3nz5kazZs2MkpISwzD+71S5tm3bGoWFhba+d911l2GxWIxbbrnF7nljYmKMpk2b2m2TZPj5+RmZmZm2bUuWLDEkGeHh4UZ+fr5t++TJkw1Jtr6FhYVGaGio0bZtW+PXX3+19duwYYMhyZg6dapt25AhQwxJxsyZM+1ev2PHjkanTp3+dEwu/T+fMWOGIclIT083DOOPTyMs7+vnn3+29W3atKnRu3dv49SpU8apU6eMAwcOGPfee68hyRgxYsSf1napsk4jLCgoKNXvrbfeMiQZO3bssG277bbbjMDAQOP48eO2bd9++61Ro0YNu7m2YMECQ5Jx6tQph2oDgEtxGiEAVIKPPvpIvr6+evTRR+22jx8/XoZhaNOmTXbbY2Ji7BYfaNKkifr376+kpKQKn+71Z3799Vf5+fmV2u7v729rr6i7775b7733ngoLC7V27Vr5+vqWOvIj/XZUZfPmzRowYICioqJs2yMiInT33Xfr008/VX5+viQpJCREGRkZ+vbbbx19a2W6+Lx169atUP+PPvpIXbp00Y033mjbVqdOHSUkJOiHH37Q119/bdf/X//6l2rWrGl73LVrV9uCHJfq2rWrjh07puLiYrvtvXr1UrNmzez6SdIdd9xhV/PF7UeOHJEkff755zp58qQeeeQR2/+dJPXr10+tW7cuddqjJD388MN2j//yl7/Ynq+iRo8erXr16mnGjBl/2nfq1KnasmVLqa/69evb9du8ebMaNWqkRo0aqV27dvqf//kf3XfffXr22Wcdqq0sAQEBtn+fP39eOTk56tatmyTZTocsKSnRxx9/rAEDBigyMtLW/+qrr9Ytt9xi93wXjxZ++OGHdqfQAoAjCFsAUAl+/PFHRUZGlvqgf3F1wh9//NFue1krAbZs2VIFBQU6depUpdQUEBBQ5nVZF1cPvPTD6Z8ZNGiQ8vLytGnTJq1atUq33nprmaHm1KlTKigoUKtWrUq1tWnTR
larVceOHZMkzZw5U2fOnFHLli3Vrl07PfbYY9q/f3+Fa/q9oKAgSb+d0lYRP/74Y7l1Xmy/VJMmTeweBwcHS5IaN25carvValVeXp7T3y9Jp0+ftqujrFpbt25dqk5/f/9Sp8TVq1fP9nwVFRwcrDFjxmjdunXau3fvH/Zt166d4uLiSn3VqlXLrl/Xrl21ZcsWJSYmat68eQoJCdHp06dL9XNGbm6uRo8erbCwMAUEBKhRo0Zq3ry5JNn+L06ePKlff/1VV199danv//22f/7zn7rhhhv0wAMPKCwsTIMGDdI777xD8ALgEMIWAHipiIiIMpeGv7jt0r/sV+S5YmNjNX/+fO3YsUN33333ZdfXo0cPff/991q+fLnatm2rV199Vdddd51effVVp56vdevWkmRbdKKylXcdWHnbjd8tjHK5319RFbleraJGjx6tkJCQCh3dqoiGDRsqLi5O8fHxGj9+vN5880198MEHeuGFFy77uQcOHKhly5bp4Ycf1nvvvafNmzcrMTFRkpwKSAEBAdqxY4c+/vhj3Xvvvdq/f7/++c9/6q9//WulHX0G4P0IWwBQCZo2baoTJ06UOqpy6NAhW/ulyjp17ptvvlFgYGCpoxLOuvbaa/XFF1+U+qC5Z88eBQYGqmXLlg493913362dO3cqKChIffv2LbNPo0aNFBgYqMOHD5dqO3TokHx8fOyO5NSvX1/33Xef3nrrLR07dkzt27e3u6dWRVZdvOjGG29UvXr19NZbb1Xow3DTpk3LrfNiuzu4WEdZtR4+fNjUOi8e3frwww//9OiWM/r166ebbrpJTz/9tM6dO+f085w+fVrJycl6/PHHNWPGDN1+++3661//ancqqySFhoba7h33e2Vt8/HxUa9evfTcc8/p66+/1lNPPaWtW7dq27ZtTtcKoHohbAFAJejbt69KSkr00ksv2W1fsGCBLBZLqetBUlJS7JbVPnbsmD788EP17t270o5M/OMf/1B2drbee+8927acnBy9++67uu2228q8nuvPnm/atGl6+eWXyz3ty9fXV71799aHH35ot3x5dna2Vq9erRtvvNF2ut/vl+SuU6eOrr76artTH2vXri1JOnPmzJ/WFxgYqEmTJungwYOaNGlSmUeG3nzzTaWmpkr67f8sNTVVKSkptvZz585p6dKlatasmaKjo//0NatC586dFRoaqsWLF9uNzaZNm3Tw4EH169fP1NcfM2aMQkJCNHPmTFOef9KkSfrvf/+rZcuWOf0cF/eZ3/+fP//886X6xcXF6YMPPtCJEyds27/77rtS11Xm5uaWep1rr71Wkip82wQAYOl3AKgEt912m26++Wb95z//0Q8//KAOHTpo8+bN+vDDDzVmzBhdddVVdv3btm2r+Ph4u6XfJVXodK3169fryy+/lPTbvar279+vJ598UpL0t7/9Te3bt5f0Wzjq1q2b7rvvPn399ddq2LChXn75ZZWUlDh1WlhwcLDdUafyPPnkk7b7Ez3yyCOqUaOGlixZogsXLmju3Lm2ftHR0YqNjVWnTp1Uv359ff7551q7dq1Gjhxp63NxEZFHH31U8fHx8vX11aBBg8p97ccee0wZGRmaP3++tm3bpn/84x8KDw9XVlaWPvjgA6WmpmrXrl2SpMcff1xvvfWWbrnlFj366KOqX7++Vq5cqczMTP3v//5vqSXzXaVmzZqaM2eO7rvvPt1000266667bEu/N2vWTGPHjjX19YODgzV69Og/nDM7d+60XQt4qfbt29vmY3luueUWtW3bVs8995xGjBhhtwhJRQUFBalHjx6aO3euioqKdMUVV2jz5s3KzMws1Xf69OnavHmzbrjhBg0fPtz2R5K2bdtq3759tn4zZ87Ujh071K9fPzVt2lQnT57Uyy+/rCuvvNJuURUA+EOuXAoRADxVWUuSnz171hg7dqwRGRlp1KxZ02jRooXx7LPPGlar1a6f/v9S12+++abRokULw8/Pz
+jYsaOxbdu2Cr32xeW9y/q6dClswzCM3NxcY9iwYUaDBg2MwMBA46abbjLS0tIq9DqXLgNenrKWfjcMw/jiiy+M+Ph4o06dOkZgYKBx8803G7t27bLr8+STTxpdunQxQkJCjICAAKN169bGU089Zbe8enFxsTFq1CijUaNGhsViqfAy8GvXrjV69+5t1K9f36hRo4YRERFh/POf/zS2b99u1+/77783/vGPfxghISGGv7+/0aVLF2PDhg0Veo+vv/66IanUeE6bNq3UkuEqY3nzspZS/6PXW7NmjdGxY0fDz8/PqF+/vjF48GDjp59+suszZMgQo3bt2qXG42JNf6a8//PTp08bwcHBDi/9Pm3aNFvfpk2blrvU/4oVK8qcv+Upa+n3n376ybj99tuNkJAQIzg42LjzzjuNEydOlKrDMAwjOTnZ6Nixo1GrVi3jqquuMl599VVj/Pjxhr+/v12f/v37G5GRkUatWrWMyMhI46677jK++eabCtUIAIZhGBbDcPIKXACAUywWi0aMGFHqlEMArjNgwIBKvRUBAEhcswUAAKqZ399j7ttvv9VHH32k2NhY1xQEwGtxzRYAAKhWoqKiNHToUEVFRenHH3/UK6+8olq1amnixImuLg2AlyFsAQCAaqVPnz566623lJWVJT8/P8XExOjpp58u82bjAHA5uGYLAAAAAEzg0mu2duzYodtuu02RkZGyWCz64IMP7NoNw9DUqVMVERGhgIAAxcXFlbpwNTc3V4MHD1ZQUJBCQkI0bNgw/fLLL3Z99u/fr7/85S/y9/dX48aN7ZYeBgAAAAAzuDRsnTt3Th06dNCiRYvKbJ87d64WLlyoxYsXa8+ePapdu7bi4+Pt7uUxePBgZWRkaMuWLdqwYYN27NihhIQEW3t+fr569+6tpk2bKj09Xc8++6ymT5+upUuXmv7+AAAAAFRfbnMaocVi0fvvv68BAwZI+u2oVmRkpMaPH68JEyZIkvLy8hQWFqYVK1Zo0KBBOnjwoKKjo5WWlqbOnTtLkhITE9W3b1/99NNPioyM1CuvvKL//Oc/ysrKUq1atST9diPLDz74QIcOHapQbVarVSdOnFDdunVlsVgq/80DAAAA8AiGYejs2bOKjIyUj88fH7ty2wUyMjMzlZWVpbi4ONu24OBgde3aVSkpKRo0aJBSUlIUEhJiC1qSFBcXJx8fH+3Zs0e33367UlJS1KNHD1vQkqT4+HjNmTNHp0+fVr169Uq99oULF3ThwgXb4+PHjys6OtqkdwoAAADA0xw7dkxXXnnlH/Zx27CVlZUlSQoLC7PbHhYWZmvLyspSaGioXXuNGjVUv359uz7Nmzcv9RwX28oKW7Nnz9aMGTNKbT927JiCgoKcfEcAAAAAPF1+fr4aN26sunXr/mlftw1brjR58mSNGzfO9vjigAYFBRG2AAAAAFTo8iKXLpDxR8LDwyVJ2dnZdtuzs7NtbeHh4Tp58qRde3FxsXJzc+36lPUcl77G7/n5+dmCFQELAAAAgDPcNmw1b95c4eHhSk5Otm3Lz8/Xnj17FBMTI0mKiYnRmTNnlJ6ebuuzdetWWa1Wde3a1dZnx44dKioqsvXZsmWLWrVqVeYphAAAAABQGVwatn755Rft27dP+/btk/Tbohj79u3T0aNHZbFYNGbMGD355JNat26dDhw4oH/961+KjIy0rVjYpk0b9enTRw8++KBSU1P12WefaeTIkRo0aJAiIyMlSXfffbdq1aqlYcOGKSMjQ2vWrNELL7xgd5ogAAAAAFQ2ly79vn37dt18882ltg8ZMkQrVqyQYRiaNm2ali5dqjNnzujGG2/Uyy+/rJYtW9r65ubmauTIkVq/fr18fHx0xx13aOHChapTp46tz/79+zVixAilpaWpYcOGGjVqlCZNmlThOvPz8xUcHKy8vDxOKQQAAACqMUeygdvcZ8udEbYAAAAASI5lA7e9ZgsAAAAAPBlhCwAAAABMQNgCAAAAA
BMQtgAAAADABIQtAAAAADABYQsAAAAATEDYAgAAAAATELYAAAAAwASELQAAAAAwAWELAAAAAExA2AIAAAAAExC2AAAAAMAEhC0AAAAAMAFhCwAAAABMUMPVBVRXzR7f6OoSyvXDM/1cXQIAAADg8TiyBQAAAAAmIGwBAAAAgAkIWwAAAABgAsIWAAAAAJiAsAUAAAAAJiBsAQAAAIAJCFsAAAAAYALCFgAAAACYgLAFAAAAACYgbAEAAACACQhbAAAAAGACwhYAAAAAmICwBQAAAAAmIGwBAAAAgAkIWwAAAABgAsIWAAAAAJiAsAUAAAAAJiBsAQAAAIAJCFsAAAAAYALCFgAAAACYgLAFAAAAACYgbAEAAACACQhbAAAAAGACwhYAAAAAmICwBQAAAAAmIGwBAAAAgAkIWwAAAABgAsIWAAAAAJiAsAUAAAAAJiBsAQAAAIAJCFsAAAAAYALCFgAAAACYgLAFAAAAACYgbAEAAACACQhbAAAAAGACwhYAAAAAmICwBQAAAAAmIGwBAAAAgAkIWwAAAABgAsIWAAAAAJiAsAUAAAAAJiBsAQAAAIAJCFsAAAAAYALCFgAAAACYgLAFAAAAACYgbAEAAACACQhbAAAAAGACwhYAAAAAmICwBQAAAAAmIGwBAAAAgAkIWwAAAABgAsIWAAAAAJiAsAUAAAAAJiBsAQAAAIAJCFsAAAAAYALCFgAAAACYgLAFAAAAACYgbAEAAACACQhbAAAAAGACwhYAAAAAmICwBQAAAAAmIGwBAAAAgAkIWwAAAABgAsIWAAAAAJiAsAUAAAAAJiBsAQAAAIAJ3DpslZSUaMqUKWrevLkCAgJ01VVXadasWTIMw9bHMAxNnTpVERERCggIUFxcnL799lu758nNzdXgwYMVFBSkkJAQDRs2TL/88ktVvx0AAAAA1Yhbh605c+bolVde0UsvvaSDBw9qzpw5mjt3rl588UVbn7lz52rhwoVavHix9uzZo9q1ays+Pl7nz5+39Rk8eLAyMjK0ZcsWbdiwQTt27FBCQoIr3hIAAACAasJiXHqYyM3ceuutCgsL02uvvWbbdscddyggIEBvvvmmDMNQZGSkxo8frwkTJkiS8vLyFBYWphUrVmjQoEE6ePCgoqOjlZaWps6dO0uSEhMT1bdvX/3000+KjIz80zry8/MVHBysvLw8BQUFVcp7a/b4xkp5HjP88Ew/V5cAAAAAuCVHsoFbH9nq3r27kpOT9c0330iSvvzyS3366ae65ZZbJEmZmZnKyspSXFyc7XuCg4PVtWtXpaSkSJJSUlIUEhJiC1qSFBcXJx8fH+3Zs6fM171w4YLy8/PtvgAAAADAETVcXcAfefzxx5Wfn6/WrVvL19dXJSUleuqppzR48GBJUlZWliQpLCzM7vvCwsJsbVlZWQoNDbVrr1GjhurXr2/r83uzZ8/WjBkzKvvtAAAAAKhG3PrI1jvvvKNVq1Zp9erV+uKLL7Ry5UrNmzdPK1euNPV1J0+erLy8PNvXsWPHTH09AAAAAN7HrY9sPfbYY3r88cc1aNAgSVK7du30448/avbs2RoyZIjCw8MlSdnZ2YqIiLB9X3Z2tq699lpJUnh4uE6ePGn3vMXFxcrNzbV9/+/5+fnJz8/PhHcEAAAAoLpw6yNbBQUF8vGxL9HX11dWq1WS1Lx5c4WHhys5OdnWnp+frz179igmJkaSFBMTozNnzig9Pd3WZ+vWrbJareratWsVvAsAAAAA1ZFbH9m67bbb9NRTT6lJkya65pprtHfvXj333HO6//77JUkWi0VjxozRk08+qRYtWqh58+aaMmWKIiMjNWDAAElSmzZt1KdPHz344INavHixioqKNHLkSA0aNKhCKxECAAAAgDPcOmy9+OKLmjJlih555BGdPHlSkZGReuihhzR16lRbn4kTJ+rcuXNKSEjQmTNndOONNyoxMVH+/v62PqtWrdLIkSPVq1cv+fj46I477tDCh
Qtd8ZYAAAAAVBNufZ8td8F9tgAAAABIXnSfLQAAAADwVIQtAAAAADABYQsAAAAATEDYAgAAAAATELYAAAAAwASELQAAAAAwAWELAAAAAExA2AIAAAAAExC2AAAAAMAEhC0AAAAAMAFhCwAAAABMQNgCAAAAABMQtgAAAADABIQtAAAAADABYQsAAAAATEDYAgAAAAATELYAAAAAwASELQAAAAAwAWELAAAAAExA2AIAAAAAExC2AAAAAMAEhC0AAAAAMAFhCwAAAABMQNgCAAAAABMQtgAAAADABIQtAAAAADABYQsAAAAATEDYAgAAAAATELYAAAAAwASELQAAAAAwAWELAAAAAExA2AIAAAAAExC2AAAAAMAEhC0AAAAAMAFhCwAAAABMQNgCAAAAABMQtgAAAADABIQtAAAAADABYQsAAAAATEDYAgAAAAATELYAAAAAwASELQAAAAAwAWELAAAAAExA2AIAAAAAExC2AAAAAMAEhC0AAAAAMAFhCwAAAABMQNgCAAAAABMQtgAAAADABIQtAAAAADABYQsAAAAATEDYAgAAAAATELYAAAAAwASELQAAAAAwAWELAAAAAExA2AIAAAAAExC2AAAAAMAENVxdAFBRzR7f6OoSyvXDM/1cXQIAAADcDEe2AAAAAMAEhC0AAAAAMAFhCwAAAABMQNgCAAAAABMQtgAAAADABIQtAAAAADABYQsAAAAATEDYAgAAAAATELYAAAAAwASELQAAAAAwAWELAAAAAExA2AIAAAAAExC2AAAAAMAEhC0AAAAAMAFhCwAAAABMQNgCAAAAABMQtgAAAADABIQtAAAAADABYQsAAAAATEDYAgAAAAATELYAAAAAwASELQAAAAAwAWELAAAAAEzg9mHr+PHjuueee9SgQQMFBASoXbt2+vzzz23thmFo6tSpioiIUEBAgOLi4vTtt9/aPUdubq4GDx6soKAghYSEaNiwYfrll1+q+q0AAAAAqEbcOmydPn1aN9xwg2rWrKlNmzbp66+/1vz581WvXj1bn7lz52rhwoVavHix9uzZo9q1ays+Pl7nz5+39Rk8eLAyMjK0ZcsWbdiwQTt27FBCQoIr3hIAAACAaqKGqwv4I3PmzFHjxo31+uuv27Y1b97c9m/DMPT888/riSeeUP/+/SVJb7zxhsLCwvTBBx9o0KBBOnjwoBITE5WWlqbOnTtLkl588UX17dtX8+bNU2RkZNW+KQAAAADVglsf2Vq3bp06d+6sO++8U6GhoerYsaOWLVtma8/MzFRWVpbi4uJs24KDg9W1a1elpKRIklJSUhQSEmILWpIUFxcnHx8f7dmzp+reDAAAAIBqxa3D1pEjR/TKK6+oRYsWSkpK0vDhw/Xoo49q5cqVkqSsrCxJUlhYmN33hYWF2dqysrIUGhpq116jRg3Vr1/f1uf3Lly4oPz8fLsvAAAAAHCEW59GaLVa1blzZz399NOSpI4dO+qrr77S4sWLNWTIENNed/bs2ZoxY4Zpzw8AAADA+7n1ka2IiAhFR0fbbWvTpo2OHj0qSQoPD5ckZWdn2/XJzs62tYWHh+vkyZN27cXFxcrNzbX1+b3JkycrLy/P9nXs2LFKeT8AAAAAqg+3Dls33HCDDh8+bLftm2++UdOmTSX9tlhGeHi4kpOTbe35+fnas2ePYmJiJEkxMTE6c+aM0tPTbX22bt0qq9Wqrl27lvm6fn5+CgoKsvsCAAAAAEe49WmEY8eOVffu3fX0009r4MCBSk1N1dKlS7V06VJJksVi0ZgxY/Tkk0+qRYsWat68uaZMmaLIyEgNGDBA0m9Hwvr06aMHH3xQixcvVlFRkUaOHKlBgwaxEiEAAAAA07h12Lr++uv1/vvva/LkyZo5c6aaN2+u559/XoMHD7b1mThxos6dO6eEhASdOXNGN954oxITE+Xv72/rs2rVKo0cOVK9evWSj4+P7rjjDi1cuNAVbwkAAABANWExDMNwdRHuL
j8/X8HBwcrLy6u0UwqbPb6xUp7HDD8808/VJZSJMQMAAICrOZIN3PqaLQAAAADwVIQtAAAAADCBW1+zBeDycfolAACAa3BkCwAAAABMQNgCAAAAABMQtgAAAADABIQtAAAAADABYQsAAAAATEDYAgAAAAATELYAAAAAwASELQAAAAAwAWELAAAAAExA2AIAAAAAExC2AAAAAMAEhC0AAAAAMAFhCwAAAABMQNgCAAAAABMQtgAAAADABIQtAAAAADABYQsAAAAATEDYAgAAAAATOBW2oqKi9N///rfU9jNnzigqKuqyiwIAAAAAT+dU2Prhhx9UUlJSavuFCxd0/Pjxyy4KAAAAADxdDUc6r1u3zvbvpKQkBQcH2x6XlJQoOTlZzZo1q7TiAAAAAMBTORS2BgwYIEmyWCwaMmSIXVvNmjXVrFkzzZ8/v9KKAwAAAABP5VDYslqtkqTmzZsrLS1NDRs2NKUoAAAAAPB0DoWtizIzMyu7DgAAAADwKk6FLUlKTk5WcnKyTp48aTviddHy5csvuzAAAAAA8GROha0ZM2Zo5syZ6ty5syIiImSxWCq7LgAAAADwaE6FrcWLF2vFihW69957K7seAAAAAPAKTt1nq7CwUN27d6/sWgAAAADAazgVth544AGtXr26smsBAAAAAK/h1GmE58+f19KlS/Xxxx+rffv2qlmzpl37c889VynFAQAAAICncips7d+/X9dee60k6auvvrJrY7EMAAAAAHAybG3btq2y6wAAAAAAr+LUNVsAAAAAgD/m1JGtm2+++Q9PF9y6davTBQEAAACAN3AqbF28XuuioqIi7du3T1999ZWGDBlSGXUBAAAAgEdzKmwtWLCgzO3Tp0/XL7/8clkFAQAAAIA3qNRrtu655x4tX768Mp8SAAAAADxSpYatlJQU+fv7V+ZTAgAAAIBHcuo0wr///e92jw3D0M8//6zPP/9cU6ZMqZTCAAAAAMCTORW2goOD7R77+PioVatWmjlzpnr37l0phQEAAACAJ3MqbL3++uuVXQcAAAAAeBWnwtZF6enpOnjwoCTpmmuuUceOHSulKAAAAADwdE6FrZMnT2rQoEHavn27QkJCJElnzpzRzTffrLfffluNGjWqzBoBAAAAwOM4tRrhqFGjdPbsWWVkZCg3N1e5ubn66quvlJ+fr0cffbSyawQAAAAAj+PUka3ExER9/PHHatOmjW1bdHS0Fi1axAIZAAAAACAnj2xZrVbVrFmz1PaaNWvKarVedlEAAAAA4OmcCls9e/bU6NGjdeLECdu248ePa+zYserVq1elFQcAAAAAnsqpsPXSSy8pPz9fzZo101VXXaWrrrpKzZs3V35+vl588cXKrhEAAAAAPI5T12w1btxYX3zxhT7++GMdOnRIktSmTRvFxcVVanEAAAAA4KkcOrK1detWRUdHKz8/XxaLRX/96181atQojRo1Stdff72uueYa7dy506xaAQAAAMBjOBS2nn/+eT344IMKCgoq1RYcHKyHHnpIzz33XKUVBwAAAACeyqGw9eWXX6pPnz7ltvfu3Vvp6emXXRQAAAAAeDqHwlZ2dnaZS75fVKNGDZ06deqyiwIAAAAAT+dQ2Lriiiv01Vdfldu+f/9+RUREXHZRAAAAAODpHApbffv21ZQpU3T+/PlSbb/++qumTZumW2+9tdKKAwAAAABP5dDS70888YTee+89tWzZUiNHjlSrVq0kSYcOHdKiRYtUUlKi//znP6YUCgAAAACexKGwFRYWpl27dmn48OGaPHmyDMOQJFksFsXHx2vRokUKCwszpVAAAAAA8CQO39S4adOm+uijj3T69Gl99913MgxDLVq0UL169cyoDwAAAAA8ksNh66J69erp+uuvr8xaAAAAAMBrOLRABgAAAACgYghbAAAAAGACwhYAAAAAmICwBQAAAAAmIGwBAAAAgAkIWwAAAABgAsIWAAAAAJiAsAUAAAAAJiBsAQAAAIAJC
FsAAAAAYALCFgAAAACYgLAFAAAAACYgbAEAAACACQhbAAAAAGACwhYAAAAAmICwBQAAAAAmIGwBAAAAgAkIWwAAAABgAsIWAAAAAJiAsAUAAAAAJqjh6gIc8cwzz2jy5MkaPXq0nn/+eUnS+fPnNX78eL399tu6cOGC4uPj9fLLLyssLMz2fUePHtXw4cO1bds21alTR0OGDNHs2bNVo4ZHvX0AVaTZ4xtdXUK5fnimn6tLAAAAFeQxR7bS0tK0ZMkStW/f3m772LFjtX79er377rv65JNPdOLECf3973+3tZeUlKhfv34qLCzUrl27tHLlSq1YsUJTp06t6rcAAAAAoBrxiLD1yy+/aPDgwVq2bJnq1atn256Xl6fXXntNzz33nHr27KlOnTrp9ddf165du7R7925J0ubNm/X111/rzTff1LXXXqtbbrlFs2bN0qJFi1RYWOiqtwQAAADAy3lE2BoxYoT69eunuLg4u+3p6ekqKiqy2966dWs1adJEKSkpkqSUlBS1a9fO7rTC+Ph45efnKyMjo8zXu3DhgvLz8+2+AAAAAMARbn/R0ttvv60vvvhCaWlppdqysrJUq1YthYSE2G0PCwtTVlaWrc+lQeti+8W2ssyePVszZsyohOoBAAAAVFdufWTr2LFjGj16tFatWiV/f/8qe93JkycrLy/P9nXs2LEqe20AAAAA3sGtw1Z6erpOnjyp6667TjVq1FCNGjX0ySefaOHChapRo4bCwsJUWFioM2fO2H1fdna2wsPDJUnh4eHKzs4u1X6xrSx+fn4KCgqy+wIAAAAAR7h12OrVq5cOHDigffv22b46d+6swYMH2/5ds2ZNJScn277n8OHDOnr0qGJiYiRJMTExOnDggE6ePGnrs2XLFgUFBSk6OrrK3xMAAACA6sGtr9mqW7eu2rZta7etdu3aatCggW37sGHDNG7cONWvX19BQUEaNWqUYmJi1K1bN0lS7969FR0drXvvvVdz585VVlaWnnjiCY0YMUJ+fn5V/p4AAAAAVA9uHbYqYsGCBfLx8dEdd9xhd1Pji3x9fbVhwwYNHz5cMTExql27toYMGaKZM2e6sGoAAAAA3s7jwtb27dvtHvv7+2vRokVatGhRud/TtGlTffTRRyZXBgAAAAD/x62v2QIAAAAAT0XYAgAAAAATELYAAAAAwASELQAAAAAwAWELAAAAAExA2AIAAAAAExC2AAAAAMAEhC0AAAAAMAFhCwAAAABMQNgCAAAAABMQtgAAAADABIQtAAAAADABYQsAAAAATEDYAgAAAAATELYAAAAAwAQ1XF0AAMA7NHt8o6tLKNcPz/RzdQllYswAwLtxZAsAAAAATEDYAgAAAAATELYAAAAAwASELQAAAAAwAWELAAAAAExA2AIAAAAAExC2AAAAAMAE3GcLAAB4FO5PBsBTcGQLAAAAAExA2AIAAAAAExC2AAAAAMAEhC0AAAAAMAFhCwAAAABMQNgCAAAAABMQtgAAAADABNxnCwAAwMtxbzLANTiyBQAAAAAmIGwBAAAAgAkIWwAAAABgAsIWAAAAAJiAsAUAAAAAJiBsAQAAAIAJCFsAAAAAYALCFgAAAACYgLAFAAAAACYgbAEAAACACQhbAAAAAGACwhYAAAAAmICwBQAAAAAmIGwBAAAAgAkIWwAAAABgAsIWAAAAAJiAsAUAAAAAJiBsAQAAAIAJCFsAAAAAYALCFgAAAACYgLAFAAAAACao4eoCAAAAAHfU7PGNri6hXD8808/VJaACCFsAAAAAKgUB1R6nEQIAAACACQhbAAAAAGACwhYAAAAAmICwBQAAAAAmIGwBAAAAgAkIWwAAAABgAsIWAAAAAJiAsAUAAAAAJiBsAQAAAIAJCFsAAAAAYALCFgAAAACYgLAFAAAAACYgbAEAAACACQhbAAAAAGACwhYAAAAAmICwBQAAAAAmIGwBAAAAgAkIWwAAAABgAsIWAAAAAJiAsAUAAAAAJ
iBsAQAAAIAJCFsAAAAAYALCFgAAAACYgLAFAAAAACYgbAEAAACACQhbAAAAAGACtw5bs2fP1vXXX6+6desqNDRUAwYM0OHDh+36nD9/XiNGjFCDBg1Up04d3XHHHcrOzrbrc/ToUfXr10+BgYEKDQ3VY489puLi4qp8KwAAAACqGbcOW5988olGjBih3bt3a8uWLSoqKlLv3r117tw5W5+xY8dq/fr1evfdd/XJJ5/oxIkT+vvf/25rLykpUb9+/VRYWKhdu3Zp5cqVWrFihaZOneqKtwQAAACgmqjh6gL+SGJiot3jFStWKDQ0VOnp6erRo4fy8vL02muvafXq1erZs6ck6fXXX1ebNm20e/dudevWTZs3b9bXX3+tjz/+WGFhYbr22ms1a9YsTZo0SdOnT1etWrVc8dYAAAAAeDm3PrL1e3l5eZKk+vXrS5LS09NVVFSkuLg4W5/WrVurSZMmSklJkSSlpKSoXbt2CgsLs/WJj49Xfn6+MjIyynydCxcuKD8/3+4LAAAAABzhMWHLarVqzJgxuuGGG9S2bVtJUlZWlmrVqqWQkBC7vmFhYcrKyrL1uTRoXWy/2FaW2bNnKzg42PbVuHHjSn43AAAAALydx4StESNG6KuvvtLbb79t+mtNnjxZeXl5tq9jx46Z/poAAAAAvItbX7N10ciRI7Vhwwbt2LFDV155pW17eHi4CgsLdebMGbujW9nZ2QoPD7f1SU1NtXu+i6sVXuzze35+fvLz86vkdwEAAACgOnHrI1uGYWjkyJF6//33tXXrVjVv3tyuvVOnTqpZs6aSk5Nt2w4fPqyjR48qJiZGkhQTE6MDBw7o5MmTtj5btmxRUFCQoqOjq+aNAAAAAKh23PrI1ogRI7R69Wp9+OGHqlu3ru0aq+DgYAUEBCg4OFjDhg3TuHHjVL9+fQUFBWnUqFGKiYlRt27dJEm9e/dWdHS07r33Xs2dO1dZWVl64oknNGLECI5eAQAAADCNW4etV155RZIUGxtrt/3111/X0KFDJUkLFiyQj4+P7rjjDl24cEHx8fF6+eWXbX19fX21YcMGDR8+XDExMapdu7aGDBmimTNnVtXbAAAAAFANuXXYMgzjT/v4+/tr0aJFWrRoUbl9mjZtqo8++qgySwMAAACAP+TW12wBAAAAgKcibAEAAACACQhbAAAAAGACwhYAAAAAmICwBQAAAAAmIGwBAAAAgAkIWwAAAABgAsIWAAAAAJiAsAUAAAAAJiBsAQAAAIAJCFsAAAAAYALCFgAAAACYgLAFAAAAACYgbAEAAACACQhbAAAAAGACwhYAAAAAmICwBQAAAAAmIGwBAAAAgAkIWwAAAABgAsIWAAAAAJiAsAUAAAAAJiBsAQAAAIAJCFsAAAAAYALCFgAAAACYgLAFAAAAACYgbAEAAACACQhbAAAAAGACwhYAAAAAmICwBQAAAAAmIGwBAAAAgAkIWwAAAABgAsIWAAAAAJiAsAUAAAAAJiBsAQAAAIAJCFsAAAAAYALCFgAAAACYgLAFAAAAACYgbAEAAACACQhbAAAAAGACwhYAAAAAmICwBQAAAAAmIGwBAAAAgAkIWwAAAABgAsIWAAAAAJiAsAUAAAAAJiBsAQAAAIAJCFsAAAAAYALCFgAAAACYgLAFAAAAACYgbAEAAACACQhbAAAAAGACwhYAAAAAmICwBQAAAAAmIGwBAAAAgAkIWwAAAABgAsIWAAAAAJiAsAUAAAAAJiBsAQAAAIAJCFsAAAAAYALCFgAAAACYgLAFAAAAACYgbAEAAACACQhbAAAAAGACwhYAAAAAmICwBQAAAAAmIGwBAAAAgAkIWwAAAABgAsIWAAAAAJiAsAUAAAAAJiBsAQAAAIAJCFsAAAAAYALCFgAAAACYgLAFAAAAACYgbAEAAACACQhbAAAAAGACwhYAAAAAmICwBQAAAAAmIGwBAAAAgAkIWwAAAABgAsIWAAAAAJiAsAUAAAAAJqhWY
WvRokVq1qyZ/P391bVrV6Wmprq6JAAAAABeqtqErTVr1mjcuHGaNm2avvjiC3Xo0EHx8fE6efKkq0sDAAAA4IWqTdh67rnn9OCDD+q+++5TdHS0Fi9erMDAQC1fvtzVpQEAAADwQjVcXUBVKCwsVHp6uiZPnmzb5uPjo7i4OKWkpJTqf+HCBV24cMH2OC8vT5KUn59faTVZLxRU2nNVtsp8n5WJMXMO4+Y4xsw5jJvjGDPnMG6OY8ycw7g5rjqM2cXnMQzjT/tajIr08nAnTpzQFVdcoV27dikmJsa2feLEifrkk0+0Z88eu/7Tp0/XjBkzqrpMAAAAAB7i2LFjuvLKK/+wT7U4suWoyZMna9y4cbbHVqtVubm5atCggSwWiwsrKy0/P1+NGzfWsWPHFBQU5OpyPAbj5jjGzDmMm+MYM+cwbo5jzJzDuDmOMXOOu46bYRg6e/asIiMj/7RvtQhbDRs2lK+vr7Kzs+22Z2dnKzw8vFR/Pz8/+fn52W0LCQkxs8TLFhQU5FaT0FMwbo5jzJzDuDmOMXMO4+Y4xsw5jJvjGDPnuOO4BQcHV6hftVggo1atWurUqZOSk5Nt26xWq5KTk+1OKwQAAACAylItjmxJ0rhx4zRkyBB17txZXbp00fPPP69z587pvvvuc3VpAAAAALxQtQlb//znP3Xq1ClNnTpVWVlZuvbaa5WYmKiwsDBXl3ZZ/Pz8NG3atFKnPeKPMW6OY8ycw7g5jjFzDuPmOMbMOYyb4xgz53jDuFWL1QgBAAAAoKpVi2u2AAAAAKCqEbYAAAAAwASELQAAAAAwAWELAAAAAExQbVYj9GT5+fkOf4+73fjNFRg3VBXmmuMYM8C9sY+iqnj7XGM1Qg/g4+Mji8VS4f4Wi0XffPONoqKiTKzK/TFujvv73//u8PcsXrxYoaGhJlTjOZhrjmPMnDNu3DiHv+eJJ55Q/fr1TajGMzBmzmEfdRxzzTnePtcIWx7Ax8dH//u//1uhndEwDPXt21dfffWVx0xCszBujvPx8dHAgQMVEBBQof6rV6/WwYMHq/WYScw1ZzBmzvHx8VFMTIxq1apVof6ffvqpDh8+XK3HjTFzDvuo45hrzvH2ucZphB6gadOm6tGjhxo0aFCh/lFRUapZs6bJVbk/xs05CxcurPCRqrVr15pcjWdgrjmOMXPe+++/X+F9tG7duiZX4xkYM8exjzqHueY4b59rHNkCYPPJJ5/ohhtuUI0aFfs7zKeffqrrr7/eo+/sDniSlStXatCgQRXe51avXq3+/furdu3aJlfmvhgzVBXmGspC2AIAAAAAE3AaoQfJycnR8uXLlZKSoqysLElSeHi4unfvrqFDh6pRo0YurtA9MW7OycrK0p49e+zGrGvXrgoPD3dxZe6LueY4xsx5xcXFysjIsBu36Ohojzq9pqoxZo5jH3UOc81x3jrXOLLlIdLS0hQfH6/AwEDFxcUpLCxMkpSdna3k5GQVFBQoKSlJnTt3dnGl7oVxc9y5c+f00EMP6e2335bFYrFdsJqbmyvDMHTXXXdpyZIlCgwMdHGl7oW55jjGzDlWq1VTp07VokWLlJeXZ9cWHByskSNHasaMGfLx4VaaFzFmzmEfdRxzzTlePdcMeISuXbsaCQkJhtVqLdVmtVqNhIQEo1u3bi6ozL0xbo4bNmyY0aJFCyMxMdEoLi62bS8uLjaSkpKMli1bGg888IALK3RPzDXHMWbOeeyxx4xGjRoZixcvNjIzM42CggKjoKDAyMzMNJYsWWKEhoYaEydOdHWZboUxcw77qOOYa87x5rlG2PIQ/v7+xsGDB8ttP3jwoOHv71+FFXkGxs1xISEhxmeffVZu+6effmqEhIRUYUWegbnmOMbMOWFhYUZiYmK57YmJiUZoaGgVVuT+GDPnsI86jrnmHG+eaxzD9BDh4eFKTU0ttz01NdV2yBX/h3FznNVq/cN7hNSqVUtWq7UKK/IMzDXHMWbOOXv2rCIjI
8ttj4iI0Llz56qwIvfHmDmHfdRxzDXnePNcY4EMDzFhwgQlJCQoPT1dvXr1KnUu67JlyzRv3jwXV+l+GDfH3XrrrUpISNBrr72mjh072rXt3btXw4cP12233eai6twXc81xjJlzYmNjNWHCBK1atUoNGza0a8vJydGkSZMUGxvrmuLcFGPmHPZRxzHXnOPVc83Vh9ZQcW+//bbRtWtXo0aNGobFYjEsFotRo0YNo2vXrsaaNWtcXZ7bYtwck5uba/Tp08ewWCxG/fr1jdatWxutW7c26tevb/j4+Bi33HKLcfr0aVeX6ZaYa45jzBx39OhRo23btkaNGjWMjh07Gn369DH69OljdOzY0ahRo4bRvn174+jRo64u060wZs5jH3UMc8153jrXWI3QAxUVFSknJ0eS1LBhQ5YRrSDGzTGHDh0qtfxqTEyMWrdu7eLK3B9zzXGMmWOsVquSkpK0e/fuUvto7969WemsDIzZ5WEfrTjm2uXxtrlG2AIAAAAAExCtAQAAAMAEhC0AAAAAMAFhCwAAAABMQNgCAAAAABMQtrzIjh07lJeX5+oyPA7jhqrCXHMcYwa4N/ZRVBVPnWuELS8SGxurqKgozZ8/39WleBTGzXHNmzfXsGHDdOLECVeX4lGYa45jzJzTs2dPzZo1SwUFBa4uxWMwZs5hH3Ucc805njrXCFteJDMzU2vXrlV2drarS/EojJvjhgwZopKSEt1www2uLsWjMNccx5g5p0mTJkpOTua+eA5gzJzDPuo45ppzPHWucZ8tAAC8VH5+voKCglxdhkdhzFBVmGvVA2HLwxUVFXn8nbWrQnFxsTIyMuzu5B4dHc3YodIx1y5fdna2Lly4oCZNmri6FADi5xqqjjfOtRquLgAV884772jAgAGqVauWJOmll17Ss88+q59++kn16tXTo48+qqlTp7q4SvdjtVo1depULVq0qNRFlcHBwRo5cqRmzJghHx/OqL3U119/rZdeekkpKSl2P/BiYmI0cuRIRUdHu7hC98Ncc9zZs2c1fPhw7dy5U7GxsVq2bJnGjh2rV155RRaLRTfeeKPWr1/PX37LkJOTo+XLl5faR7t3766hQ4eqUaNGLq7Q/TBmjuPnmnOYa47z5rnGkS0P4evrq59//lmhoaF6/fXX9cgjj2jixInq2rWr9u7dq9mzZ+v555/XAw884OpS3crEiRO1YsUKzZo1S/Hx8QoLC5P021/ON2/erClTpmjo0KGaM2eOiyt1H5s2bdKAAQN03XXXlRqzLVu2KD09XR9++KHi4+NdXKl7Ya45btSoUfr444/1yCOP6L333lNwcLC+//57LV68WCUlJRo+fLgGDBigp556ytWlupW0tDTFx8crMDBQcXFxdnMtOTlZBQUFSkpKUufOnV1cqftgzJzDzzXHMdec49VzzYBHsFgsRnZ2tmEYhtGlSxdj7ty5du0vv/yy0bFjR1eU5tbCwsKMxMTEctsTExON0NDQKqzI/bVv396YMmVKue3Tpk0z2rVrV4UVeQbmmuMaN25sbN261TAMwzh+/LhhsViM9evX29o3bNhgtGrVylXlua2uXbsaCQkJhtVqLdVmtVqNhIQEo1u3bi6ozH0xZs7h55rjmGvO8ea55nnH4qoxi8UiSTpy5Ih69+5t19a7d2999913rijLrZ09e1aRkZHltkdEROjcuXNVWJH7++abbzR48OBy2++66y59++23VViRZ2CuOe7kyZO6+uqrJUmRkZEKCAhQy5Ytbe1t27bVsWPHXFWe2/ryyy81duxY2++ES1ksFo0dO1b79u2r+sLcGGPmHH6uOY655hxvnmuELQ+SmJiodevWyd/fv9S9Gc6fP1/mjl3dxcbGasKECcrJySnVlpOTo0mTJik2NrbqC3NjzZo108aNG8tt37hxo5o2bVqFFXkG5prjGjRooFOnTtke9+/fXyEhIbbHv/zyi/z8/FxQmXsLDw9Xampque2pqam2U3DwG8bMO
fxccxxzzTnePNdYIMODDBkyxPbvrVu3KiYmxvZ49+7duuqqq1xRlltbvHix+vbtq4iICLVr187uHOADBw4oOjpaGzZscHGV7mXmzJm6++67tX379jLPN09MTNTq1atdXKX7Ya45rn379kpLS9N1110nSaXmVVpamtq0aeOK0tzahAkTlJCQoPT0dPXq1avUPrps2TLNmzfPxVW6F8bMOfxccxxzzTnePNdYIMNLbNiwQTVr1mTRgjJYrVYlJSVp9+7dpVbW6927t0eubGO2Xbt2aeHChWWuRjh69Gi7oI//w1xzTG5urnx8fOyOZl1q06ZNCggI8Ni/ZpppzZo1WrBggdLT01VSUiLpt4WUOnXqpHHjxmngwIEurtD9MGbO4eea45hrzvHWuUbYAgDAQxUVFdlOu2nYsKFH34umqjBmqCrMNUiELY+wf/9+tW3btsKJPiMjQ61atVKNGpwlClQF9lEAAFAWwpYH8PX1VVZWVoVvghcUFKR9+/YpKirK5MoASOyjAACgbPxZ1QMYhqEpU6YoMDCwQv0LCwtNrgjApdhHAQBAWQhbHqBHjx46fPhwhfvHxMQoICDAxIoAXIp9FAAAlIXTCAEAAADABJ65hiJQiXr27KlZs2aVulE0yvfGG2/o+++/d3UZqAbYP52zY8cO5eXluboMj8KYOYd91HHMNed46lwjbKHaa9KkiZKTk9W6dWtXl+Ixhg4dqujoaI0aNcrVpcDLsX86JzY2VlFRUZo/f76rS/EYjJlz2Ecdx1xzjqfONa7ZQrW3YsUKSVJ+fr5rC/EgVqtVmZmZ2rRpk6tLgZdj/3ROZmamjhw5wj7qAMbMOeyjjmOuOcdT5xrXbAEAAACACTiyBa+Xk5Oj5cuXKyUlRVlZWZKk8PBwde/eXUOHDq3wvZGqs6KiIv3www8KDQ1VcHCwq8uBF2H/dF5xcbEyMjLsxi06Olo1a9Z0cWXuizFzHPuoc5hrjvPWucaRLXi1tLQ0xcfHKzAwUHFxcQoLC5MkZWdnKzk5WQUFBUpKSlLnzp1dXKn7mDt3rkaNGqWAgACVlJRo0qRJevHFF1VcXCwfHx/de++9WrJkCb8wcNnYP51jtVo1depULVq0qNRF9sHBwRo5cqRmzJghHx8uy76IMXMO+6jjmGvO8eq5ZgBerGvXrkZCQoJhtVpLtVmtViMhIcHo1q2bCypzXz4+PkZ2drZhGIbx7LPPGvXq1TOWL19uZGRkGG+++aYRGhpqzJkzx8VVwhuwfzrnscceMxo1amQsXrzYyMzMNAoKCoyCggIjMzPTWLJkiREaGmpMnDjR1WW6FcbMOeyjjmOuOceb5xphC17N39/fOHjwYLntBw8eNPz9/auwIvdnsVhsYatjx47GkiVL7NrffPNN45prrnFFafAy7J/OCQsLMxITE8ttT0xMNEJDQ6uwIvfHmDmHfdRxzDXnePNc4xgmvFp4eLhSU1PLbU9NTbUdqsb/sVgskqSjR4+qe/fudm3du3dXZmamK8qCl2H/dM7Zs2cVGRlZbntERITOnTtXhRW5P8bMOeyjjmOuOceb5xoLZMCrTZgwQQkJCUpPT1evXr1KnQO8bNkyzZs3z8VVup9ly5apTp06qlWrlnJzc+3azp49Kz8/PxdVBm/C/umc2NhYTZgwQatWrVLDhg3t2nJycjRp0iTFxsa6pjg3xZg5h33Uccw153jzXGOBDHi9NWvWaMGCBUpPT1dJSYkkydfXV506ddK4ceM0cOBAF1foXpo1a2Y7siVJo0eP1pgxY2yPX3jhBb399ttKSUlxQXXwNuyfjjt27Jj69u2rQ4cOqV27dnYfSg4cOKDo6Ght2LBBjRs3dnGl7oMxcx77qGOYa87z1rlG2EK1UVRUpJycHElSw4YNWU3PSbt375afn586duzo6lLgRdg/HWO1WpWUlKTdu3fbLZEcExOj3r17s9JZGRizy8M+WnHMtcvjbXONsAUAAAAAJiBaAwAAr7V//
35ZrdYK98/IyFBxcbGJFQGoTjiyBQAAvJavr6+ysrLUqFGjCvUPCgrSvn37FBUVZXJlAKoDViMEAABeyzAMTZkyRYGBgRXqX1hYaHJFAKoTwhYAAPBaPXr00OHDhyvcPyYmRgEBASZWBKA64TRCVHs7duxQhw4dFBwc7OpSAPwO+yfg3thHUVU8da6xQAaqvdjYWEVFRWn+/PmuLsVjNG/eXMOGDdOJEydcXQq8HPunc3r27KlZs2apoKDA1aXAy7GPOo790zmeOtcIW6j2MjMztXbtWmVnZ7u6FI8xZMgQlZSU6IYbbnB1KfBy7J/OadKkiZKTk9W6dWtXlwIvxz7qOPZP53jqXOM0QgAAvFR+fr6CgoJcXQaAMrB/Vg+ELVQLxcXFysjIsLuTe3R0tMfflRzwRtnZ2bpw4YKaNGni6lIAiN+hqDreONdYjRBezWq1aurUqVq0aJHy8vLs2oKDgzVy5EjNmDFDPj6cUXupr7/+Wi+99JJSUlLsfuDFxMRo5MiRio6OdnGF8AZnz57V8OHDtXPnTsXGxmrZsmUaO3asXnnlFVksFt14441av349f/ktQ05OjpYvX15qH+3evbuGDh1a4XtKAX+E36HOYf90nDfPNY5swatNnDhRK1as0KxZsxQfH6+wsDBJv/3lfPPmzZoyZYqGDh2qOXPmuLhS97Fp0yYNGDBA1113Xakx27Jli9LT0/Xhhx8qPj7exZXC040aNUoff/yxHnnkEb333nsKDg7W999/r8WLF6ukpETDhw/XgAED9NRTT7m6VLeSlpam+Ph4BQYGKi4uzm4fTU5OVkFBgZKSktS5c2cXVwpPx+9Qx7F/Oser55oBeLGwsDAjMTGx3PbExEQjNDS0Cityf+3btzemTJlSbvu0adOMdu3aVWFF8FaNGzc2tm7dahiGYRw/ftywWCzG+vXrbe0bNmwwWrVq5ary3FbXrl2NhIQEw2q1lmqzWq1GQkKC0a1bNxdUBm/D71DHsX86x5vnmucdiwMccPbsWUVGRpbbHhERoXPnzlVhRe7vm2++0eDBg8ttv+uuu/Ttt99WYUXwVidPntTVV18tSYqMjFRAQIBatmxpa2/btq2OHTvmqvLc1pdffqmxY8fKYrGUarNYLBo7dqz27dtX9YXB6/A71HHsn87x5rlG2IJXi42N1YQJE5STk1OqLScnR5MmTVJsbGzVF+bGmjVrpo0bN5bbvnHjRjVt2rQKK4K3atCggU6dOmV73L9/f4WEhNge//LLL/Lz83NBZe4tPDxcqamp5banpqbaTsEBLge/Qx3H/ukcb55rLJABr7Z48WL17dtXERERateund05wAcOHFB0dLQ2bNjg4irdy8yZM3X33Xdr+/btZZ5vnpiYqNWrV7u4SniD9u3bKy0tTdddd50klZpXaWlpatOmjStKc2sTJkxQQkKC0tPT1atXr1L76LJlyzRv3jwXVwlvwO9Qx7F/Oseb5xoLZMDrWa1WJSUlaffu3aVW1uvdu7dHrmxjtl27dmnhwoVlrkY4evRoxcTEuLhCeIPc3Fz5+PjYHc261KZNmxQQEOCxf80005o1a7RgwQKlp6erpKREkuTr66tOnTpp3LhxGjhwoIsrhLfgd6jj2D+d461zjbAFAICHKioqsp1207BhQ4++Fw3gbdg/IXHNFgDABfbv3y+r1Vrh/hkZGSouLjaxIs9Us2ZNRUREKCIigg9yqFTso5eP/RMSR7YAAC7g6+urrKysCt/cMygoSPv27VNUVJTJlQGQ2EeBysICGQCAKmcYhqZMmaLAwMAK9S8sLDS5IgCXYh8FKgdhCwBQ5Xr06KHDhw9XuH9MTIwCAgJMrAjApdhHgcrBaYQAAAAAYAIWyEC117NnT82aNUsFBQWuLsVjvPHGG/r+++9dXQaAcuzYsUN5eXmuLgNAGdg/neOpn9cIW6j2mjRpouTkZLVu3drVpXiMoUOHKjo6WqNGjXJ1KQDKEBsbq6ioKM2fP
9/VpQD4HfZP53jq5zWu2UK1t2LFCklSfn6+awvxIFarVZmZmdq0aZOrSwFQhszMTB05coR9FHBD7J/O8dTPa1yzBQAAAAAm4MgWvF5OTo6WL1+ulJQUZWVlSZLCw8PVvXt3DR06tML3EAEAd1JcXKyMjAy7n2vR0dHcPBVwA+yfjvPWz2sc2YJXS0tLU3x8vAIDAxUXF6ewsDBJUnZ2tpKTk1VQUKCkpCR17tzZxZW6j6KiIv3nP//Re++9p/r16+vhhx/W/fffb2vPzs5WZGSkSkpKXFglUH1ZrVZNnTpVixYtKnWRfXBwsEaOHKkZM2bIx4fLsoGqxv7pHG/+vEbYglfr1q2bOnTooMWLF8tisdi1GYahhx9+WPv371dKSoqLKnQ/06dP1+LFizVhwgSdOXNGL730kv75z39qyZIlkn77wRcRESGr1eriSoHqaeLEiVqxYoVmzZql+Ph4uw8lmzdv1pQpUzR06FDNmTPHxZUC1Q/7p3O8+fMaYQteLSAgQHv37i135ZpDhw6pY8eO+vXXX6u4MvfVokULLViwQLfeeqsk6bvvvtMtt9yiG2+8UcuXL9fJkyc5sgW4UHh4uFauXKn4+Pgy25OSkvSvf/1L2dnZVVwZAPZP53jz5zWOYcKrhYeHKzU1tdz21NRU21+d8Jvjx4+rbdu2tsdXX321tm/frl27dunee+8lZAEudvbsWUVGRpbbHhERoXPnzlVhRQAuYv90jjd/XmOBDHi1CRMmKCEhQenp6erVq1epc4CXLVumefPmubhK9xIeHq7vv/9ezZo1s2274oortG3bNt18880aOnSoy2oD8Ns9eiZMmKBVq1apYcOGdm05OTmaNGmSYmNjXVMcUM2xfzrHmz+vcRohvN6aNWu0YMECpaen247K+Pr6qlOnTho3bpwGDhzo4grdywMPPCDDMPTaa6+Vajt+/LhiY2N15MgRjnABLnLs2DH17dtXhw4dUrt27ew+lBw4cEDR0dHasGGDGjdu7OJKgeqH/dN53vp5jbCFaqOoqEg5OTmSpIYNG7L8ajl+/PFHHTp0qNzzzU+cOKEtW7ZoyJAhVVwZgIusVquSkpK0e/duuyWSY2Ji1Lt3b1Y6A1yI/fPyeNvnNcIWAAAAAJiAaA0AAABcpv379zt0W5SMjAwVFxebWBHcAUe2AAAAgMvk6+urrKwsNWrUqEL9g4KCtG/fPkVFRZlcGVyJ1QgBAACAy2QYhqZMmaLAwMAK9S8sLDS5IrgDwhYAAABwmXr06KHDhw9XuH9MTIwCAgJMrAjugNMIUe3t2LFDHTp0UHBwsKtLAQAAQBk89fMaC2Sg2ouNjVVUVJTmz5/v6lI8RvPmzTVs2DCdOHHC1aUAKEPPnj01a9YsFRQUuLoUAKgUnvp5jbCFai8zM1Nr165Vdna2q0vxGEOGDFFJSYluuOEGV5cCoAxNmjRRcnKyWrdu7epSAKBSeOrnNU4jBADAS+Xn5ysoKMjVZQBAtUXYQrVQXFysjIwMuzu5R0dHe/xdyQEAALyFN35eYzVCeDWr1aqpU6dq0aJFysvLs2sLDg7WyJEjNWPGDPn4cEZtRR07dkzTpk3T8uXLXV0KUG3l5ORo+fLlSklJsftQ0r17dw0dOrTC9/kBAHfgzZ/XPK9iwAGPP/64li5dqmeeeUZHjhzRuXPndO7cOR05ckRz5szR0qVLNXnyZFeX6VFyc3O1cuVKV5cBVFtpaWlq2bKlFi5cqODgYPXo0UM9evRQcHCwFi5cqNatW+vzzz93dZkAUGHe/HmN0wjh1cLDw7Vy5UrFx8eX2Z6UlKR//etfHnexpZnWrVv3h+1HjhzR+PHjVVJSUkUVAbhUt27d1KFDBy1evFgWi8WuzTAMPfzww9q/f79SUlJcVCEAOMabP69xGiG82tmzZxUZGVlue0REhM6dO1eFFbm/AQMGyGKx6I/+DvP7D3gAqs6XX36pFStWlLkfWiwWjR07Vh07dnRBZQDgH
G/+vMZphPBqsbGxmjBhgnJyckq15eTkaNKkSYqNja36wtxYRESE3nvvPVmt1jK/vvjiC1eXCFRr4eHhSk1NLbc9NTVVYWFhVVgRAFweb/68xpEteLXFixerb9++ioiIULt27WwfQLKzs3XgwAFFR0drw4YNLq7SvXTq1Enp6enq379/me1/dtQLgLkmTJighIQEpaenq1evXnY/15KTk7Vs2TLNmzfPxVUCQMV58+c1rtmC17NarUpKStLu3bvtVu2KiYlR7969PXJlGzPt3LlT586dU58+fcpsP3funD7//HPddNNNVVwZgIvWrFmjBQsWKD093Xb9pK+vrzp16qRx48Zp4MCBLq4QABzjrZ/XCFsAAHiooqIi22k3DRs29Oh70QCAN/LMiAgAAFSzZk1FREQoIiKCoAXAI+3fv19Wq7XC/TMyMlRcXGxiRZWLI1sAAAAAXMLX11dZWVkVvhl7UFCQ9u3bp6ioKJMrqxwskAEAAADAJQzD0JQpUxQYGFih/oWFhSZXVLkIWwAAAABcokePHjp8+HCF+8fExCggIMDEiioXpxECAAAAgAlYIAPVXs+ePTVr1iwVFBS4uhSPsWPHDuXl5bm6DADlYB8FAPdA2EK116RJEyUnJ6t169auLsVjxMbGKioqSvPnz3d1KQDKwD4KAO6Ba7ZQ7a1YsUKSlJ+f79pCPEhmZqaOHDmiTZs2uboUAGVgHwUA98A1WwAAAABgAo5swevl5ORo+fLlSklJUVZWliQpPDxc3bt319ChQyt8X4fqpri4WBkZGXZjFh0dzY1TATfBPgoA7o8jW/BqaWlpio+PV2BgoOLi4hQWFiZJys7OVnJysgoKCpSUlKTOnTu7uFL3YbVaNXXqVC1atKjUBfbBwcEaOXKkZsyYIR8fLvkEXIF9FAA8B0e24NVGjRqlO++8U4sXL5bFYrFrMwxDDz/8sEaNGqWUlBQXVeh+Hn/8ca1YsULPPPOM4uPj7QLq5s2bNWXKFBUWFmrOnDkurhSonthHAcBzcGQLXi0gIEB79+4td6XBQ4cOqWPHjvr111+ruDL3FR4erpUrVyo+Pr7M9qSkJP3rX/9SdnZ2FVcGQGIfBQBPwjkG8Grh4eFKTU0ttz01NdX2V2H85uzZs4qMjCy3PSIiQufOnavCigBcin0UADwHR7bg1RYtWqTx48froYceUq9evUpds7Vs2TLNmzdPjzzyiIsrdR/9+vVTcXGxVq1apYYNG9q15eTk6N5775Wvr682bNjgogqB6o19FAA8B2ELXm/NmjVasGCB0tPTVVJSIkny9fVVp06dNG7cOA0cONDFFbqXY8eOqW/fvjp06JDatWtnF1APHDig6OhobdiwQY0bN3ZxpUD1xD4KAJ6DsIVqo6ioSDk5OZKkhg0bsjzyH7BarUpKStLu3bvtlpWOiYlR7969WeUMcDH2UQDwDIQtAAAAADABf/oCYLN//35ZrdYK98/IyFBxcbGJFQEAAHgujmwBsPH19VVWVpYaNWpUof5BQUHat2+foqKiTK4MAADA83BTYwA2hmFoypQpCgwMrFD/wsJCkysCAADwXIQtADY9evTQ4cOHK9w/JiZGAQEBJlYEAADguTiNENXejh071KFDBwUHB7u6FAAAAHgRFshAtRcbG6uoqCjNnz/f1aUAQKXo2bOnZs2apYKCAleXAgDVGmEL1V5mZqbWrl2r7OxsV5cCAJWiSZMmSk5OVuvWrV1dCgBUa5xGCACAl8rPz1dQUJCrywCAaouwhWopOztbFy5cUJMmTVxdCgBUCsMwZLFYXF0GAOASnEYIr3b27Fndc889atq0qYYMGaLCwkKNGDFCERERat68uW666Sbl5+e7ukwAuGx+fn46ePCgq8sAAFyCpd/h1f79738rPT1dEyZM0HvvvaeBAwfq+++/186dO1VSUqLhw4drzpw5euqpp1xdKgBUyLhx48rcXlJSomeeeUYNGjSQJD333HNVWRYAoAycR
giv1qRJE61cuVI333yzTpw4oSuvvFLr1q3TrbfeKknauHGjxo8fr0OHDrm4UgCoGB8fH3Xo0EEhISF22z/55BN17txZtWvXlsVi0datW11TIADAhrAFr+bv769vv/1WjRs3liTVrl1be/fuVcuWLSVJP/74o6Kjo3Xu3DlXlgkAFfbMM89o6dKlevXVV9WzZ0/b9po1a+rLL79UdHS0C6sDAFyKa7bg1Ro0aKBTp07ZHvfv39/ur8G//PKL/Pz8XFAZADjn8ccf15o1azR8+HBNmDBBRUVFri4JAFAOwha8Wvv27ZWWlmZ7vHr1aoWGhtoep6WlqU2bNq4oDQCcdv311ys9PV2nTp1S586d9dVXX7ESIQC4IU4jhFfLzc2Vj49PqWsbLtq0aZMCAgIUGxtbpXUBQGV5++23NWbMGJ06dUoHDhzgNEIAcCOELQAAPNxPP/2k9PR0xcXFqXbt2q4uBwDw/xG24LX279+vtm3bysenYmfLZmRkqFWrVqpRgzsiAAAA4PIRtuC1fH19lZWVpUaNGlWof1BQkPbt26eoqCiTKwMAAEB1wJ/w4bUMw9CUKVMUGBhYof6FhYUmVwQAAIDqhLAFr9WjRw8dPny4wv1jYmIUEBBgYkUAAACoTjiNEAAAAABMwH22AADwMjt27FBeXp6rywCAao+wBQCAl4mNjVVUVJTmz5/v6lIAoFojbAEA4GUyMzO1du1aZWdnu7oUAKjWuGYLAAAAAEzAkS0AALxAdna2jh496uoyAACXIGwBAOBBzp49q3vuuUdNmzbVkCFDVFhYqBEjRigiIkLNmzfXTTfdpPz8fFeXCQAQYQsAAI/y73//W+np6ZowYYKOHj2qgQMHaseOHdq5c6e2bdumnJwczZkzx9VlAgDENVsAAHiUJk2aaOXKlbr55pt14sQJXXnllVq3bp1uvfVWSdLGjRs1fvx4HTp0yMWVAgA4sgUAgAc5efKkrr76aklSZGSkAgIC1LJlS1t727ZtdezYMVeVBwC4BGELAAAP0qBBA506dcr2uH///goJCbE9/uWXX+Tn5+eCygAAv0fYAgDAg7Rv315paWm2x6tXr1ZoaKjtcVpamtq0aeOK0gAAv8M1WwAAeJDc3Fz5+PjYHc261KZNmxQQEKDY2NgqrQsAUBphCwAAAABMwGmEAAB4iP3798tqtVa4f0ZGhoqLi02sCADwRziyBQCAh/D19VVWVpYaNWpUof5BQUHat2+foqKiTK4MAFCWGq4uAAAAVIxhGJoyZYoCAwMr1L+wsNDkigAAf4SwBQCAh+jRo4cOHz5c4f4xMTEKCAgwsSIAwB/hNEIAAAAAMAELZAAAAACACQhbAAAAAGACwhYAAAAAmICwBQAAAAAmIGwBAAAAgAkIWwAAjzZ06FBZLBY988wzdts/+OADWSwW2+Pt27fLYrGU+ZWVlSVJmj59um2br6+vGjdurISEBOXm5pb7+s2aNSv3eS0Wi4YOHWrK+wYAuD/uswUA8Hj+/v6aM2eOHnroIdWrV+8P+x4+fFhBQUF220JDQ23/vuaaa/Txxx+rpKREBw8e1P3336+8vDytWbOmzOdLS0tTSUmJJGnXrl2644477F6D+1wBQPXFkS0AgMeLi4tTeHi4Zs+e/ad9Q0NDFR4ebvfl4/N/vw5r1Kih8PBwXXHFFYqLi9Odd96pLVu2lPt8jRo1sj1P/fr17V6jZs2aevjhh3XFFVcoMDBQ7dq101tvvWX3/WfPntXgwYNVu3ZtRUREaMGCBYqNjdWYMWOcGwwAgNsgbAEAPJ6vr6+efvppvfjii/rpp58q7Xl/+OEHJSUlqVatWk59//nz59WpUydt3LhRX331lRISEnTvvfcqNTXV1mfcuHH67LPPtG7dOm3ZskU7d+7UF198UVlvAQDgQpxGCADwCrfffruuvfZaTZs2Ta+99lq5/a688kq7x02bNlVGRobt8YEDB1SnTh2VlJTo/PnzkqTnnnvOqZquu
OIKTZgwwfZ41KhRSkpK0jvvvKMuXbro7NmzWrlypVavXq1evXpJkl5//XVFRkY69XoAAPdC2AIAeI05c+aoZ8+edgHn93bu3Km6devaHtesWdOuvVWrVlq3bp3Onz+vN998U/v27dOoUaOcqqekpERPP/203nnnHR0/flyFhYW6cOGCAgMDJUlHjhxRUVGRunTpYvue4OBgtWrVyqnXAwC4F04jBAB4jR49eig+Pl6TJ08ut0/z5s119dVX276aNm1q116rVi1dffXVatu2rZ555hn5+vpqxowZTtXz7LPP6oUXXtCkSZO0bds27du3T/Hx8SosLHTq+QAAnoWwBQDwKs8884zWr1+vlJSUSnm+J554QvPmzdOJEycc/t7PPvtM/fv31z333KMOHTooKipK33zzja09KipKNWvWVFpamm1bXl6eXR8AgOcibAEAvEq7du00ePBgLVy4sMz2kydPKisry+6rqKio3OeLiYlR+/bt9fTTTztcS4sWLbRlyxbt2rVLBw8e1EMPPaTs7Gxbe926dTVkyBA99thj2rZtmzIyMjRs2DD5+PjY3SMMAOCZCFsAAK8zc+ZMWa3WMttatWqliIgIu6/09PQ/fL6xY8fq1Vdf1bFjxxyq44knntB1112n+Ph4xcbGKjw8XAMGDLDr89xzzykmJka33nqr4uLidMMNN6hNmzby9/d36LUAAO7HYhiG4eoiAADAb86dO6crrrhC8+fP17Bhw1xdDgDgMrAaIQAALrR3714dOnRIXbp0UV5enmbOnClJ6t+/v4srAwBcLsIWAAAuNm/ePB0+fFi1atVSp06dtHPnTjVs2NDVZQEALhOnEQIAAACACVggAwAAAABMQNgCAAAAABMQtgAAAADABIQtAAAAADABYQsAAAAATEDYAgAAAAATELYAAAAAwASELQAAAAAwAWELAAAAAEzw/wCDuNlgk2CGfgAAAABJRU5ErkJggg==\n" + }, + "metadata": {} + } + ] + }, + { + "cell_type": "code", + "source": [ + "def export_to_file(export_file_path, data):\n", + " with open(export_file_path, \"w\") as f:\n", + " for record in data:\n", + " ner_tags = record[\"ner_tags\"]\n", + " tokens = record[\"tokens\"]\n", + " if len(tokens) > 0:\n", + " f.write(\n", + " str(len(tokens))\n", + " + \"\\t\"\n", + " + \"\\t\".join(tokens)\n", + " + \"\\t\"\n", + " + \"\\t\".join(map(str, ner_tags))\n", + " + \"\\n\"\n", + " )\n", + "\n", + "\n", + "os.makedirs(\"data\", exist_ok=True)\n", + "export_to_file(\"./data/conll_train.txt\", conll_data[\"train\"])\n", + "export_to_file(\"./data/conll_val.txt\", conll_data[\"validation\"])" + ], + "metadata": { + "id": "EQgmkV1fZRhI" + }, + "execution_count": 9, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "def make_tag_lookup_table():\n", + " iob_labels = [\"B\", \"I\"]\n", + " ner_labels = [\"PER\", \"ORG\", \"LOC\", \"MISC\"]\n", + " all_labels = [(label1, label2) for label2 in 
ner_labels for label1 in iob_labels]\n", + " all_labels = [\"-\".join([a, b]) for a, b in all_labels]\n", + " all_labels = [\"[PAD]\", \"O\"] + all_labels\n", + " return dict(zip(range(0, len(all_labels) + 1), all_labels))\n", + "\n", + "\n", + "mapping = make_tag_lookup_table()\n", + "print(mapping)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "OdufhIrEZRs2", + "outputId": "09e10fc1-6fdf-4281-ac81-973d32dad3a5" + }, + "execution_count": 10, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "{0: '[PAD]', 1: 'O', 2: 'B-PER', 3: 'I-PER', 4: 'B-ORG', 5: 'I-ORG', 6: 'B-LOC', 7: 'I-LOC', 8: 'B-MISC', 9: 'I-MISC'}\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "all_tokens = sum(conll_data[\"train\"][\"tokens\"], [])\n", + "all_tokens_array = np.array(list(map(str.lower, all_tokens)))\n", + "\n", + "counter = Counter(all_tokens_array)\n", + "print(len(counter))\n", + "\n", + "num_tags = len(mapping)\n", + "vocab_size = 20000\n", + "\n", + "# We only take (vocab_size - 2) most commons words from the training data since\n", + "# the `StringLookup` class uses 2 additional tokens - one denoting an unknown\n", + "# token and another one denoting a masking token\n", + "vocabulary = [token for token, count in counter.most_common(vocab_size - 2)]\n", + "\n", + "# The StringLook class will convert tokens to token IDs\n", + "lookup_layer = keras.layers.StringLookup(vocabulary=vocabulary)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "a7T9RCZ3ZSKB", + "outputId": "c2dae2fc-b812-4d64-b3eb-23e2d38710c3" + }, + "execution_count": 11, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "21009\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "train_data = tf.data.TextLineDataset(\"./data/conll_train.txt\")\n", + "val_data = tf.data.TextLineDataset(\"./data/conll_val.txt\")" + ], + "metadata": { + "id": "vdcDo5IJZfjl" 
# %% Peek at the first exported record
print(list(train_data.take(1).as_numpy_iterator()))


# %% Parse records, build padded batches and instantiate the model
def map_record_to_training_data(record):
    """Parse one exported text line into a ``(tokens, tags)`` pair of tensors.

    The line is split on tabs: field 0 is the token count ``n``, the next
    ``n`` fields are the tokens and the remaining fields are the tag ids.

    :param record: Scalar string tensor holding one line of the export file.
    :return: ``(tokens, tags)`` where ``tokens`` is a string tensor and
        ``tags`` an int64 tensor with every id shifted up by one.
    """
    record = tf.strings.split(record, sep="\t")
    length = tf.strings.to_number(record[0], out_type=tf.int32)
    tokens = record[1 : length + 1]
    tags = record[length + 1 :]
    tags = tf.strings.to_number(tags, out_type=tf.int64)
    # Shift by one so that id 0 stays free for padding ("[PAD]" in `mapping`,
    # and the value the loss below masks out).
    tags += 1
    return tokens, tags


def lowercase_and_convert_to_ids(tokens):
    """Lower-case string tokens and map them to ids via ``lookup_layer``."""
    tokens = tf.strings.lower(tokens)
    return lookup_layer(tokens)


# We use `padded_batch` here because each record in the dataset has a
# different length.
batch_size = 32
train_dataset = (
    train_data.map(map_record_to_training_data)
    .map(lambda x, y: (lowercase_and_convert_to_ids(x), y))
    .padded_batch(batch_size)
)
val_dataset = (
    val_data.map(map_record_to_training_data)
    .map(lambda x, y: (lowercase_and_convert_to_ids(x), y))
    .padded_batch(batch_size)
)

ner_model = NERModel(num_tags, vocab_size, embed_dim=32, num_heads=4, ff_dim=64)


# %% Loss that ignores padded positions
class CustomNonPaddingTokenLoss(keras.losses.Loss):
    """Sparse categorical cross-entropy averaged over non-padding tokens only.

    Positions whose true tag id is 0 contribute nothing to the loss.
    """

    def __init__(self, name="custom_ner_loss"):
        super().__init__(name=name)

    def call(self, y_true, y_pred):
        """Return the mean per-token loss over positions with ``y_true > 0``.

        :param y_true: Integer tag ids; 0 marks padding.
        :param y_pred: Per-token class scores, treated as probabilities
            (``from_logits=False``) - presumably the model ends in a softmax;
            confirm against ``NERModel``.
        :return: Scalar loss; 0 when the batch contains no non-padding token.
        """
        loss_fn = keras.losses.SparseCategoricalCrossentropy(
            from_logits=False, reduction="none"
        )
        loss = loss_fn(y_true, y_pred)
        mask = tf.cast((y_true > 0), dtype=tf.float32)
        loss = loss * mask
        # divide_no_nan returns 0 instead of NaN when every position is
        # padding, so one degenerate batch cannot poison the weights.
        return tf.math.divide_no_nan(tf.reduce_sum(loss), tf.reduce_sum(mask))


loss = CustomNonPaddingTokenLoss()


# %% Train and run a sample inference
ner_model.compile(optimizer="adam", loss=loss)
ner_model.fit(train_dataset, epochs=10)


def tokenize_and_convert_to_ids(text):
    """Split ``text`` on whitespace and convert the tokens to vocabulary ids."""
    tokens = text.split()
    return lowercase_and_convert_to_ids(tokens)


# Sample inference using the trained model
sample_input = tokenize_and_convert_to_ids(
    "eu rejects german call to boycott british lamb"
)
# Add the batch dimension expected by `predict`.
sample_input = tf.reshape(sample_input, shape=[1, -1])
print(sample_input)

output = ner_model.predict(sample_input)
prediction = np.argmax(output, axis=-1)[0]
prediction = [mapping[i] for i in prediction]

# eu -> B-ORG, german -> B-MISC, british -> B-MISC
print(prediction)
# %% Evaluate on a dataset
def calculate_metrics(dataset):
    """Predict tags for every batch in ``dataset`` and report metrics.

    Padding positions (true tag id 0) are removed before the true and
    predicted label sequences are handed to ``evaluate``.

    :param dataset: Iterable of ``(token_ids, tag_ids)`` batches, e.g.
        ``val_dataset``.
    :raises ValueError: If ``dataset`` yields no batches.
    """
    all_true_tag_ids, all_predicted_tag_ids = [], []

    for x, y in dataset:
        output = ner_model.predict(x, verbose=0)
        predictions = np.reshape(np.argmax(output, axis=-1), [-1])
        true_tag_ids = np.reshape(y, [-1])

        # Padding is defined by the gold labels only. Also filtering on
        # `predictions > 0` would silently drop real tokens the model tagged
        # as [PAD] and make the reported scores look better than they are.
        mask = true_tag_ids > 0

        all_true_tag_ids.append(true_tag_ids[mask])
        all_predicted_tag_ids.append(predictions[mask])

    if not all_true_tag_ids:
        raise ValueError("calculate_metrics: dataset yielded no batches")

    all_true_tag_ids = np.concatenate(all_true_tag_ids)
    all_predicted_tag_ids = np.concatenate(all_predicted_tag_ids)

    # "[PAD]" is not a valid IOB tag, so a [PAD] prediction on a real token is
    # scored as "O" (i.e. "no entity predicted here").
    predicted_tags = [
        "O" if tag == 0 else mapping[tag] for tag in all_predicted_tag_ids
    ]
    real_tags = [mapping[tag] for tag in all_true_tag_ids]

    evaluate(real_tags, predicted_tags)


calculate_metrics(val_dataset)
# %% Try the model on a user-supplied sentence
def test_model_with_input(ner_model, mapping, input_sentence=None):
    """Tag one sentence with ``ner_model`` and print the predicted labels.

    :param ner_model: Trained model exposing ``predict``.
    :param mapping: Dict from tag id to label string.
    :param input_sentence: Sentence to tag. When ``None`` (the default, and
        the original behaviour) the sentence is read interactively with
        ``input()``; pass a string to run without a prompt.
    :return: List of predicted label strings, one per whitespace-separated
        token (empty when the sentence has no tokens).
    """
    if input_sentence is None:
        input_sentence = input("Enter a sentence: ")

    # An empty/blank sentence has no tokens; skip the model call rather than
    # feeding it a zero-length sequence.
    if not input_sentence.split():
        print("Input sentence:", input_sentence)
        print("Predicted tags:", [])
        return []

    # Tokenize and convert input sentence to IDs, then add the batch dimension
    sample_input = tokenize_and_convert_to_ids(input_sentence)
    sample_input = tf.reshape(sample_input, shape=[1, -1])

    # Predict tags using the trained model
    output = ner_model.predict(sample_input)
    predictions = np.argmax(output, axis=-1)[0]
    predicted_tags = [mapping[i] for i in predictions]

    # Print the predicted tags for each token in the input sentence
    print("Input sentence:", input_sentence)
    print("Predicted tags:", predicted_tags)
    return predicted_tags


# Test the model with user input (trailing `;` keeps the returned list from
# being echoed a second time as the cell output)
test_model_with_input(ner_model, mapping);
# %% Presidio recognizer backed by a Flair sequence tagger
logger = logging.getLogger("presidio-analyzer")


class FlairRecognizer(EntityRecognizer):
    """
    Wrapper for a flair model, if needed to be used within Presidio Analyzer.
    :example:
    >from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
    >flair_recognizer = FlairRecognizer()
    >registry = RecognizerRegistry()
    >registry.add_recognizer(flair_recognizer)
    >analyzer = AnalyzerEngine(registry=registry)
    >results = analyzer.analyze(
    >    "My name is Christopher and I live in Irbid.",
    >    language="en",
    >    return_decision_process=True,
    >)
    >for result in results:
    >    print(result)
    >    print(result.analysis_explanation)
    """

    # Presidio entity types this recognizer can report.
    ENTITIES = [
        "LOCATION",
        "PERSON",
        "ORGANIZATION",
        # "MISCELLANEOUS"   # - There are no direct correlation with Presidio entities.
    ]

    DEFAULT_EXPLANATION = "Identified as {} by Flair's Named Entity Recognition"

    # Pairs of ({Presidio entity names}, {Flair labels}) considered equivalent.
    CHECK_LABEL_GROUPS = [
        ({"LOCATION"}, {"LOC", "LOCATION"}),
        ({"PERSON"}, {"PER", "PERSON"}),
        ({"ORGANIZATION"}, {"ORG"}),
        # ({"MISCELLANEOUS"}, {"MISC"}), # Probably not PII
    ]

    # Default Flair model per supported language code.
    MODEL_LANGUAGES = {"en": "flair/ner-english-large"}

    # Flair label -> Presidio entity type used in the reported results.
    PRESIDIO_EQUIVALENCES = {
        "PER": "PERSON",
        "LOC": "LOCATION",
        "ORG": "ORGANIZATION",
        # 'MISC': 'MISCELLANEOUS'   # - Probably not PII
    }

    def __init__(
        self,
        supported_language: str = "en",
        supported_entities: Optional[List[str]] = None,
        check_label_groups: Optional[List[Tuple[Set, Set]]] = None,
        model: Optional[SequenceTagger] = None,
        model_path: Optional[str] = None,
    ):
        """
        Create the recognizer and load (or adopt) the Flair model.
        :param supported_language: Language code; selects the default model
        from MODEL_LANGUAGES when neither model nor model_path is given.
        :param supported_entities: Entities to report; defaults to ENTITIES.
        :param check_label_groups: Entity/label equivalence groups; defaults
        to CHECK_LABEL_GROUPS.
        :param model: An already loaded SequenceTagger.
        :param model_path: Path or name passed to SequenceTagger.load.
        :raises ValueError: If both model and model_path are given, or if no
        default model exists for supported_language.
        """
        self.check_label_groups = (
            check_label_groups if check_label_groups else self.CHECK_LABEL_GROUPS
        )

        supported_entities = supported_entities if supported_entities else self.ENTITIES

        if model and model_path:
            raise ValueError("Only one of model or model_path should be provided.")
        elif model and not model_path:
            self.model = model
        elif not model and model_path:
            logger.info("Loading model from %s", model_path)
            self.model = SequenceTagger.load(model_path)
        else:
            default_model = self.MODEL_LANGUAGES.get(supported_language)
            if default_model is None:
                # Fail with a clear message instead of passing None to
                # SequenceTagger.load.
                raise ValueError(
                    f"No default Flair model for language '{supported_language}'; "
                    f"supported languages: {sorted(self.MODEL_LANGUAGES)}. "
                    "Provide model or model_path instead."
                )
            logger.info("Loading model for language %s", supported_language)
            self.model = SequenceTagger.load(default_model)

        super().__init__(
            supported_entities=supported_entities,
            supported_language=supported_language,
            name="Flair Analytics",
        )

    def load(self) -> None:
        """Load the model, not used. Model is loaded during initialization."""
        pass

    def get_supported_entities(self) -> List[str]:
        """
        Return supported entities by this model.
        :return: List of the supported entities.
        """
        return self.supported_entities

    # Class to use Flair with Presidio as an external recognizer.
    def analyze(
        self,
        text: str,
        entities: List[str],
        nlp_artifacts: Optional[NlpArtifacts] = None,
    ) -> List[RecognizerResult]:
        """
        Analyze text using the Flair NER model.
        :param text: The text for analysis.
        :param entities: Entity types to look for; when empty, all supported
        entities are used. Entities this recognizer does not support are
        ignored.
        :param nlp_artifacts: Not used by this recognizer.
        :return: The list of Presidio RecognizerResult constructed from the recognized
        Flair detections.
        """
        results: List[RecognizerResult] = []

        # Nothing to tag; avoid running the model on an empty sentence.
        if not text or not text.strip():
            return results

        sentence = Sentence(text)
        self.model.predict(sentence)
        # Fetch the predicted spans once instead of once per requested entity.
        spans = sentence.get_spans("ner")

        # If there are no specific list of entities, we will look for all of it.
        if not entities:
            entities = self.supported_entities

        for entity in entities:
            if entity not in self.supported_entities:
                continue

            for ent in spans:
                label = ent.labels[0].value
                if not self.__check_label(entity, label, self.check_label_groups):
                    continue
                textual_explanation = self.DEFAULT_EXPLANATION.format(label)
                explanation = self.build_flair_explanation(
                    round(ent.score, 2), textual_explanation
                )
                flair_result = self._convert_to_recognizer_result(ent, explanation)

                results.append(flair_result)

        return results

    def _convert_to_recognizer_result(self, entity, explanation) -> RecognizerResult:
        """
        Build a Presidio RecognizerResult from a Flair span.
        :param entity: Flair span; its tag, score and start/end positions are read.
        :param explanation: AnalysisExplanation attached to the result.
        :return: RecognizerResult whose type is the Presidio equivalent of the
        Flair tag (the raw tag when no equivalent is defined).
        """
        entity_type = self.PRESIDIO_EQUIVALENCES.get(entity.tag, entity.tag)
        flair_score = round(entity.score, 2)

        flair_results = RecognizerResult(
            entity_type=entity_type,
            start=entity.start_position,
            end=entity.end_position,
            score=flair_score,
            analysis_explanation=explanation,
        )

        return flair_results

    def build_flair_explanation(
        self, original_score: float, explanation: str
    ) -> AnalysisExplanation:
        """
        Create explanation for why this result was detected.
        :param original_score: Score given by this recognizer
        :param explanation: Explanation string
        :return: AnalysisExplanation naming this recognizer class.
        """
        explanation = AnalysisExplanation(
            recognizer=self.__class__.__name__,
            original_score=original_score,
            textual_explanation=explanation,
        )
        return explanation

    @staticmethod
    def __check_label(
        entity: str, label: str, check_label_groups: List[Tuple[Set, Set]]
    ) -> bool:
        """Return True if some group contains both the entity and the label."""
        return any(
            entity in egrp and label in lgrp for egrp, lgrp in check_label_groups
        )


# %% Flair NER demo on a sample sentence
from transformers import AutoModel, AutoTokenizer


if __name__ == "__main__":
    from flair.data import Sentence
    from flair.models import SequenceTagger

    # load tagger
    tagger = SequenceTagger.load("flair/ner-english-large")

    # make example sentence
    sentence = Sentence("My name is Karishma Shirsath. I live in Toronto Canada.")

    # predict NER tags
    tagger.predict(sentence)

    # print sentence
    print(sentence)

    # print predicted NER spans
    print("The following NER tags are found:")
    # iterate over entities and print
    for entity in sentence.get_spans("ner"):
        print(entity)
# %% Anonymize the named entities found by Flair
def mask_spans(text, spans, mask_char="*"):
    """Return ``text`` with every ``(start, end)`` character span masked.

    Masking works on character offsets, so only the detected occurrences are
    hidden (a plain ``str.replace`` would also hit the same substring
    elsewhere) and the length of the text is preserved.

    :param text: Original text.
    :param spans: Iterable of ``(start, end)`` offsets, ``end`` exclusive.
        Offsets outside the text are clamped.
    :param mask_char: Single character written over each masked position.
    :return: The masked text.
    """
    chars = list(text)
    for start, end in spans:
        start = max(0, start)
        end = min(len(chars), end)
        for i in range(start, end):
            chars[i] = mask_char
    return "".join(chars)


if __name__ == "__main__":
    from flair.data import Sentence
    from flair.models import SequenceTagger

    # load tagger
    tagger = SequenceTagger.load("flair/ner-english-large")

    # make example sentence; keep the raw text, because str(sentence) is
    # Flair's debug representation ("Sentence[12]: ... -> [...]"), not the text
    text = "My name is Karishma Shirsath. I live in Toronto Canada."
    sentence = Sentence(text)

    # predict NER tags
    tagger.predict(sentence)

    # print sentence
    print(sentence)

    # Anonymize identified named entities by their character offsets
    entity_spans = [
        (entity.start_position, entity.end_position)
        for entity in sentence.get_spans("ner")
    ]
    anonymized_sentence = mask_spans(text, entity_spans)

    # print anonymized sentence
    print("Anonymized sentence:")
    print(anonymized_sentence)
I live in ******* ******.\" → [\"*****************\"/PER, \"*******\"/LOC, \"******\"/LOC]\n" + ] + } + ] + } + ] +} \ No newline at end of file