Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
DeepMoji xVASynth Plugin
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- app.py +2 -1
- resources/app/plugins.txt +1 -0
- resources/app/plugins/deepmoji_plugin/DeepMoji/.gitignore +108 -0
- resources/app/plugins/deepmoji_plugin/DeepMoji/.travis.yml +27 -0
- resources/app/plugins/deepmoji_plugin/DeepMoji/LICENSE +21 -0
- resources/app/plugins/deepmoji_plugin/DeepMoji/README.md +90 -0
- resources/app/plugins/deepmoji_plugin/DeepMoji/data/.gitkeep +1 -0
- resources/app/plugins/deepmoji_plugin/DeepMoji/data/emoji_codes.json +67 -0
- resources/app/plugins/deepmoji_plugin/DeepMoji/emoji-2.8.0.dist-info/INSTALLER +1 -0
- resources/app/plugins/deepmoji_plugin/DeepMoji/emoji-2.8.0.dist-info/LICENSE.txt +28 -0
- resources/app/plugins/deepmoji_plugin/DeepMoji/emoji-2.8.0.dist-info/METADATA +182 -0
- resources/app/plugins/deepmoji_plugin/DeepMoji/emoji-2.8.0.dist-info/RECORD +25 -0
- resources/app/plugins/deepmoji_plugin/DeepMoji/emoji-2.8.0.dist-info/REQUESTED +0 -0
- resources/app/plugins/deepmoji_plugin/DeepMoji/emoji-2.8.0.dist-info/WHEEL +6 -0
- resources/app/plugins/deepmoji_plugin/DeepMoji/emoji-2.8.0.dist-info/top_level.txt +1 -0
- resources/app/plugins/deepmoji_plugin/DeepMoji/emoji-2.8.0.dist-info/zip-safe +1 -0
- resources/app/plugins/deepmoji_plugin/DeepMoji/emoji/__init__.py +62 -0
- resources/app/plugins/deepmoji_plugin/DeepMoji/emoji/__init__.pyi +37 -0
- resources/app/plugins/deepmoji_plugin/DeepMoji/emoji/core.py +372 -0
- resources/app/plugins/deepmoji_plugin/DeepMoji/emoji/core.pyi +47 -0
- resources/app/plugins/deepmoji_plugin/DeepMoji/emoji/py.typed +0 -0
- resources/app/plugins/deepmoji_plugin/DeepMoji/emoji/tokenizer.py +361 -0
- resources/app/plugins/deepmoji_plugin/DeepMoji/emoji/tokenizer.pyi +47 -0
- resources/app/plugins/deepmoji_plugin/DeepMoji/emoji/unicode_codes/__init__.py +36 -0
- resources/app/plugins/deepmoji_plugin/DeepMoji/emoji/unicode_codes/__init__.pyi +6 -0
- resources/app/plugins/deepmoji_plugin/DeepMoji/emoji/unicode_codes/data_dict.py +0 -0
- resources/app/plugins/deepmoji_plugin/DeepMoji/emoji/unicode_codes/data_dict.pyi +7 -0
- resources/app/plugins/deepmoji_plugin/DeepMoji/emoji/unicode_codes/py.typed +0 -0
- resources/app/plugins/deepmoji_plugin/DeepMoji/examples/.gitkeep +1 -0
- resources/app/plugins/deepmoji_plugin/DeepMoji/examples/README.md +39 -0
- resources/app/plugins/deepmoji_plugin/DeepMoji/examples/__init__.py +0 -0
- resources/app/plugins/deepmoji_plugin/DeepMoji/examples/create_twitter_vocab.py +13 -0
- resources/app/plugins/deepmoji_plugin/DeepMoji/examples/dataset_split.py +59 -0
- resources/app/plugins/deepmoji_plugin/DeepMoji/examples/encode_texts.py +41 -0
- resources/app/plugins/deepmoji_plugin/DeepMoji/examples/example_helper.py +6 -0
- resources/app/plugins/deepmoji_plugin/DeepMoji/examples/finetune_insults_chain-thaw.py +44 -0
- resources/app/plugins/deepmoji_plugin/DeepMoji/examples/finetune_semeval_class-avg_f1.py +50 -0
- resources/app/plugins/deepmoji_plugin/DeepMoji/examples/finetune_youtube_last.py +35 -0
- resources/app/plugins/deepmoji_plugin/DeepMoji/examples/score_texts_emojis.py +85 -0
- resources/app/plugins/deepmoji_plugin/DeepMoji/examples/text_emojize.py +63 -0
- resources/app/plugins/deepmoji_plugin/DeepMoji/examples/tokenize_dataset.py +26 -0
- resources/app/plugins/deepmoji_plugin/DeepMoji/examples/vocab_extension.py +30 -0
- resources/app/plugins/deepmoji_plugin/DeepMoji/scripts/analyze_all_results.py +40 -0
- resources/app/plugins/deepmoji_plugin/DeepMoji/scripts/analyze_results.py +39 -0
- resources/app/plugins/deepmoji_plugin/DeepMoji/scripts/calculate_coverages.py +90 -0
- resources/app/plugins/deepmoji_plugin/DeepMoji/scripts/convert_all_datasets.py +110 -0
- resources/app/plugins/deepmoji_plugin/DeepMoji/scripts/download_weights.py +65 -0
- resources/app/plugins/deepmoji_plugin/DeepMoji/scripts/finetune_dataset.py +109 -0
- resources/app/plugins/deepmoji_plugin/DeepMoji/scripts/results/.gitkeep +1 -0
- resources/app/plugins/deepmoji_plugin/DeepMoji/setup.py +16 -0
app.py
CHANGED
@@ -115,7 +115,6 @@ def run_xvaserver():
|
|
115 |
|
116 |
# load default model
|
117 |
load_model(voice_models[0])
|
118 |
-
current_voice_model = voice_models[0]
|
119 |
|
120 |
# Wait for the process to exit
|
121 |
xvaserver.wait()
|
@@ -145,6 +144,8 @@ def load_model(voice_model_name):
|
|
145 |
return
|
146 |
|
147 |
def predict(input_text, pacing, voice, lang):
|
|
|
|
|
148 |
|
149 |
# load voice model if not the current model
|
150 |
if (current_voice_model != voice):
|
|
|
115 |
|
116 |
# load default model
|
117 |
load_model(voice_models[0])
|
|
|
118 |
|
119 |
# Wait for the process to exit
|
120 |
xvaserver.wait()
|
|
|
144 |
return
|
145 |
|
146 |
def predict(input_text, pacing, voice, lang):
|
147 |
+
# grab only the first 1000 characters
|
148 |
+
input_text = input_text[:1000]
|
149 |
|
150 |
# load voice model if not the current model
|
151 |
if (current_voice_model != voice):
|
resources/app/plugins.txt
CHANGED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
*deepmoji_plugin
|
resources/app/plugins/deepmoji_plugin/DeepMoji/.gitignore
ADDED
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Byte-compiled / optimized / DLL files
|
2 |
+
__pycache__/
|
3 |
+
*.py[cod]
|
4 |
+
*$py.class
|
5 |
+
|
6 |
+
# C extensions
|
7 |
+
*.so
|
8 |
+
|
9 |
+
# Distribution / packaging
|
10 |
+
.Python
|
11 |
+
env/
|
12 |
+
build/
|
13 |
+
develop-eggs/
|
14 |
+
dist/
|
15 |
+
downloads/
|
16 |
+
eggs/
|
17 |
+
.eggs/
|
18 |
+
lib/
|
19 |
+
lib64/
|
20 |
+
parts/
|
21 |
+
sdist/
|
22 |
+
var/
|
23 |
+
*.egg-info/
|
24 |
+
.installed.cfg
|
25 |
+
*.egg
|
26 |
+
|
27 |
+
# PyInstaller
|
28 |
+
# Usually these files are written by a python script from a template
|
29 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
30 |
+
*.manifest
|
31 |
+
*.spec
|
32 |
+
|
33 |
+
# Installer logs
|
34 |
+
pip-log.txt
|
35 |
+
pip-delete-this-directory.txt
|
36 |
+
|
37 |
+
# Unit test / coverage reports
|
38 |
+
htmlcov/
|
39 |
+
.tox/
|
40 |
+
.coverage
|
41 |
+
.coverage.*
|
42 |
+
.cache
|
43 |
+
nosetests.xml
|
44 |
+
coverage.xml
|
45 |
+
*,cover
|
46 |
+
.hypothesis/
|
47 |
+
|
48 |
+
# Translations
|
49 |
+
*.mo
|
50 |
+
*.pot
|
51 |
+
|
52 |
+
# Django stuff:
|
53 |
+
*.log
|
54 |
+
local_settings.py
|
55 |
+
|
56 |
+
# Flask stuff:
|
57 |
+
instance/
|
58 |
+
.webassets-cache
|
59 |
+
|
60 |
+
# Scrapy stuff:
|
61 |
+
.scrapy
|
62 |
+
|
63 |
+
# Sphinx documentation
|
64 |
+
docs/_build/
|
65 |
+
|
66 |
+
# PyBuilder
|
67 |
+
target/
|
68 |
+
|
69 |
+
# IPython Notebook
|
70 |
+
.ipynb_checkpoints
|
71 |
+
|
72 |
+
# pyenv
|
73 |
+
.python-version
|
74 |
+
|
75 |
+
# celery beat schedule file
|
76 |
+
celerybeat-schedule
|
77 |
+
|
78 |
+
# dotenv
|
79 |
+
.env
|
80 |
+
|
81 |
+
# virtualenv
|
82 |
+
venv/
|
83 |
+
ENV/
|
84 |
+
|
85 |
+
# Spyder project settings
|
86 |
+
.spyderproject
|
87 |
+
|
88 |
+
# Rope project settings
|
89 |
+
.ropeproject
|
90 |
+
|
91 |
+
# Local data
|
92 |
+
/data/local
|
93 |
+
|
94 |
+
# Vim swapfiles
|
95 |
+
*.swp
|
96 |
+
*.swo
|
97 |
+
|
98 |
+
# nosetests
|
99 |
+
.noseids
|
100 |
+
|
101 |
+
# pyTorch model
|
102 |
+
pytorch_model.bin
|
103 |
+
|
104 |
+
# VSCODE
|
105 |
+
.vscode/*
|
106 |
+
|
107 |
+
# data
|
108 |
+
*.csv
|
resources/app/plugins/deepmoji_plugin/DeepMoji/.travis.yml
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
group: travis_latest
|
2 |
+
language: python
|
3 |
+
cache: pip
|
4 |
+
python:
|
5 |
+
- 2.7
|
6 |
+
- 3.6
|
7 |
+
#- nightly
|
8 |
+
#- pypy
|
9 |
+
#- pypy3
|
10 |
+
matrix:
|
11 |
+
allow_failures:
|
12 |
+
- python: nightly
|
13 |
+
- python: pypy
|
14 |
+
- python: pypy3
|
15 |
+
install:
|
16 |
+
#- pip install -r requirements.txt
|
17 |
+
- pip install flake8 # pytest # add another testing frameworks later
|
18 |
+
before_script:
|
19 |
+
# stop the build if there are Python syntax errors or undefined names
|
20 |
+
- flake8 . --count --select=E901,E999,F821,F822,F823 --show-source --statistics
|
21 |
+
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
|
22 |
+
- flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
|
23 |
+
script:
|
24 |
+
- true # pytest --capture=sys # add other tests here
|
25 |
+
notifications:
|
26 |
+
on_success: change
|
27 |
+
on_failure: change # `always` will be the setting once code changes slow down
|
resources/app/plugins/deepmoji_plugin/DeepMoji/LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MIT License
|
2 |
+
|
3 |
+
Copyright (c) 2017 Bjarke Felbo, Han Thi Nguyen, Thomas Wolf
|
4 |
+
|
5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
+
of this software and associated documentation files (the "Software"), to deal
|
7 |
+
in the Software without restriction, including without limitation the rights
|
8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
+
copies of the Software, and to permit persons to whom the Software is
|
10 |
+
furnished to do so, subject to the following conditions:
|
11 |
+
|
12 |
+
The above copyright notice and this permission notice shall be included in all
|
13 |
+
copies or substantial portions of the Software.
|
14 |
+
|
15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
SOFTWARE.
|
resources/app/plugins/deepmoji_plugin/DeepMoji/README.md
ADDED
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
### ------ Update September 2018 ------
|
2 |
+
It's been a year since TorchMoji and DeepMoji were released. We're trying to understand how it's being used such that we can make improvements and design better models in the future.
|
3 |
+
|
4 |
+
You can help us achieve this by answering this [4-question Google Form](https://docs.google.com/forms/d/e/1FAIpQLSe1h4NSQD30YM8dsbJQEnki-02_9KVQD34qgP9to0bwAHBvBA/viewform "DeepMoji Google Form"). Thanks for your support!
|
5 |
+
|
6 |
+
# 😇 TorchMoji
|
7 |
+
|
8 |
+
> **Read our blog post about the implementation process [here](https://medium.com/huggingface/understanding-emotions-from-keras-to-pytorch-3ccb61d5a983).**
|
9 |
+
|
10 |
+
TorchMoji is a [pyTorch](http://pytorch.org/) implementation of the [DeepMoji](https://github.com/bfelbo/DeepMoji) model developed by Bjarke Felbo, Alan Mislove, Anders Søgaard, Iyad Rahwan and Sune Lehmann.
|
11 |
+
|
12 |
+
This model was trained on 1.2 billion tweets with emojis to understand how language is used to express emotions. Through transfer learning the model can obtain state-of-the-art performance on many emotion-related text modeling tasks.
|
13 |
+
|
14 |
+
Try the online demo of DeepMoji [http://deepmoji.mit.edu](http://deepmoji.mit.edu/)! See the [paper](https://arxiv.org/abs/1708.00524), [blog post](https://medium.com/@bjarkefelbo/what-can-we-learn-from-emojis-6beb165a5ea0) or [FAQ](https://www.media.mit.edu/projects/deepmoji/overview/) for more details.
|
15 |
+
|
16 |
+
## Overview
|
17 |
+
* [torchmoji/](torchmoji) contains all the underlying code needed to convert a dataset to the vocabulary and use the model.
|
18 |
+
* [examples/](examples) contains short code snippets showing how to convert a dataset to the vocabulary, load up the model and run it on that dataset.
|
19 |
+
* [scripts/](scripts) contains code for processing and analysing datasets to reproduce results in the paper.
|
20 |
+
* [model/](model) contains the pretrained model and vocabulary.
|
21 |
+
* [data/](data) contains raw and processed datasets that we include in this repository for testing.
|
22 |
+
* [tests/](tests) contains unit tests for the codebase.
|
23 |
+
|
24 |
+
To start out with, have a look inside the [examples/](examples) directory. See [score_texts_emojis.py](examples/score_texts_emojis.py) for how to use DeepMoji to extract emoji predictions, [encode_texts.py](examples/encode_texts.py) for how to convert text into 2304-dimensional emotional feature vectors or [finetune_youtube_last.py](examples/finetune_youtube_last.py) for how to use the model for transfer learning on a new dataset.
|
25 |
+
|
26 |
+
Please consider citing the [paper](https://arxiv.org/abs/1708.00524) of DeepMoji if you use the model or code (see below for citation).
|
27 |
+
|
28 |
+
## Installation
|
29 |
+
|
30 |
+
Assuming you have [Conda](https://conda.io) installed, run:
|
31 |
+
|
32 |
+
```bash
|
33 |
+
conda create -n torchMoji -f environment.yml
|
34 |
+
conda activate torchMoji
|
35 |
+
pip install -e .
|
36 |
+
```
|
37 |
+
|
38 |
+
This will install the following dependencies:
|
39 |
+
|
40 |
+
* [PyTorch](https://pytorch.org)
|
41 |
+
* [scikit-learn](https://github.com/scikit-learn/scikit-learn)
|
42 |
+
* [text-unidecode](https://github.com/kmike/text-unidecode)
|
43 |
+
* [emoji](https://github.com/carpedm20/emoji)
|
44 |
+
|
45 |
+
If you do not want to use Conda, please install `torch==1.3.1` from PIP and then run `pip install -e .` from the root directory (don't forget to set up a virtual environment).
|
46 |
+
|
47 |
+
At the present stage the model can't make efficient use of CUDA. See details in the [Hugging Face blog post](https://medium.com/huggingface/understanding-emotions-from-keras-to-pytorch-3ccb61d5a983).
|
48 |
+
|
49 |
+
Then, run the download script to download the pretrained torchMoji weights (~85MB) from [here](https://www.dropbox.com/s/q8lax9ary32c7t9/pytorch_model.bin?dl=0) and put them in the model/ directory:
|
50 |
+
|
51 |
+
```bash
|
52 |
+
python scripts/download_weights.py
|
53 |
+
```
|
54 |
+
|
55 |
+
## Testing
|
56 |
+
To run the tests, install [nose](http://nose.readthedocs.io/en/latest/). After installing, navigate to the [tests/](tests) directory and run:
|
57 |
+
|
58 |
+
```bash
|
59 |
+
cd tests
|
60 |
+
nosetests -v
|
61 |
+
```
|
62 |
+
|
63 |
+
By default, this will also run finetuning tests. These tests train the model for one epoch and then check the resulting accuracy, which may take several minutes to finish. If you'd prefer to exclude those, run the following instead:
|
64 |
+
|
65 |
+
```bash
|
66 |
+
cd tests
|
67 |
+
nosetests -v -a '!slow'
|
68 |
+
```
|
69 |
+
|
70 |
+
## Disclaimer
|
71 |
+
This code has been tested to work with Python 2.7 and 3.5 on Ubuntu 16.04 and macOS Sierra machines. It has not been optimized for efficiency, but should be fast enough for most purposes. We do not give any guarantees that there are no bugs - use the code at your own risk!
|
72 |
+
|
73 |
+
## Contributions
|
74 |
+
We welcome pull requests if you feel like something could be improved. You can also greatly help us by telling us how you felt when writing your most recent tweets. Just click [here](http://deepmoji.mit.edu/contribute/) to contribute.
|
75 |
+
|
76 |
+
## License
|
77 |
+
This code and the pretrained model are licensed under the MIT license.
|
78 |
+
|
79 |
+
## Benchmark datasets
|
80 |
+
The benchmark datasets are uploaded to this repository for convenience purposes only. They were not released by us and we do not claim any rights on them. Use the datasets at your own risk and make sure you fulfill the licenses that they were released with. If you use any of the benchmark datasets please consider citing the original authors.
|
81 |
+
|
82 |
+
## Citation
|
83 |
+
```
|
84 |
+
@inproceedings{felbo2017,
|
85 |
+
title={Using millions of emoji occurrences to learn any-domain representations for detecting sentiment, emotion and sarcasm},
|
86 |
+
author={Felbo, Bjarke and Mislove, Alan and S{\o}gaard, Anders and Rahwan, Iyad and Lehmann, Sune},
|
87 |
+
booktitle={Conference on Empirical Methods in Natural Language Processing (EMNLP)},
|
88 |
+
year={2017}
|
89 |
+
}
|
90 |
+
```
|
resources/app/plugins/deepmoji_plugin/DeepMoji/data/.gitkeep
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
|
resources/app/plugins/deepmoji_plugin/DeepMoji/data/emoji_codes.json
ADDED
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"0": ":joy:",
|
3 |
+
"1": ":unamused:",
|
4 |
+
"2": ":weary:",
|
5 |
+
"3": ":sob:",
|
6 |
+
"4": ":heart_eyes:",
|
7 |
+
"5": ":pensive:",
|
8 |
+
"6": ":ok_hand:",
|
9 |
+
"7": ":blush:",
|
10 |
+
"8": ":heart:",
|
11 |
+
"9": ":smirk:",
|
12 |
+
"10":":grin:",
|
13 |
+
"11":":notes:",
|
14 |
+
"12":":flushed:",
|
15 |
+
"13":":100:",
|
16 |
+
"14":":sleeping:",
|
17 |
+
"15":":relieved:",
|
18 |
+
"16":":relaxed:",
|
19 |
+
"17":":raised_hands:",
|
20 |
+
"18":":two_hearts:",
|
21 |
+
"19":":expressionless:",
|
22 |
+
"20":":sweat_smile:",
|
23 |
+
"21":":pray:",
|
24 |
+
"22":":confused:",
|
25 |
+
"23":":kissing_heart:",
|
26 |
+
"24":":hearts:",
|
27 |
+
"25":":neutral_face:",
|
28 |
+
"26":":information_desk_person:",
|
29 |
+
"27":":disappointed:",
|
30 |
+
"28":":see_no_evil:",
|
31 |
+
"29":":tired_face:",
|
32 |
+
"30":":v:",
|
33 |
+
"31":":sunglasses:",
|
34 |
+
"32":":rage:",
|
35 |
+
"33":":thumbsup:",
|
36 |
+
"34":":cry:",
|
37 |
+
"35":":sleepy:",
|
38 |
+
"36":":stuck_out_tongue_winking_eye:",
|
39 |
+
"37":":triumph:",
|
40 |
+
"38":":raised_hand:",
|
41 |
+
"39":":mask:",
|
42 |
+
"40":":clap:",
|
43 |
+
"41":":eyes:",
|
44 |
+
"42":":gun:",
|
45 |
+
"43":":persevere:",
|
46 |
+
"44":":imp:",
|
47 |
+
"45":":sweat:",
|
48 |
+
"46":":broken_heart:",
|
49 |
+
"47":":blue_heart:",
|
50 |
+
"48":":headphones:",
|
51 |
+
"49":":speak_no_evil:",
|
52 |
+
"50":":wink:",
|
53 |
+
"51":":skull:",
|
54 |
+
"52":":confounded:",
|
55 |
+
"53":":smile:",
|
56 |
+
"54":":stuck_out_tongue_winking_eye:",
|
57 |
+
"55":":angry:",
|
58 |
+
"56":":no_good:",
|
59 |
+
"57":":muscle:",
|
60 |
+
"58":":punch:",
|
61 |
+
"59":":purple_heart:",
|
62 |
+
"60":":sparkling_heart:",
|
63 |
+
"61":":blue_heart:",
|
64 |
+
"62":":grimacing:",
|
65 |
+
"63":":sparkles:"
|
66 |
+
}
|
67 |
+
|
resources/app/plugins/deepmoji_plugin/DeepMoji/emoji-2.8.0.dist-info/INSTALLER
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
pip
|
resources/app/plugins/deepmoji_plugin/DeepMoji/emoji-2.8.0.dist-info/LICENSE.txt
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
New BSD License
|
2 |
+
|
3 |
+
Copyright (c) 2014-2023, Taehoon Kim, Kevin Wurster
|
4 |
+
All rights reserved.
|
5 |
+
|
6 |
+
Redistribution and use in source and binary forms, with or without
|
7 |
+
modification, are permitted provided that the following conditions are met:
|
8 |
+
|
9 |
+
* Redistributions of source code must retain the above copyright notice, this
|
10 |
+
list of conditions and the following disclaimer.
|
11 |
+
|
12 |
+
* Redistributions in binary form must reproduce the above copyright notice,
|
13 |
+
this list of conditions and the following disclaimer in the documentation
|
14 |
+
and/or other materials provided with the distribution.
|
15 |
+
|
16 |
+
* The names of its contributors may not be used to endorse or promote products
|
17 |
+
derived from this software without specific prior written permission.
|
18 |
+
|
19 |
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
20 |
+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
21 |
+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
22 |
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
23 |
+
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
24 |
+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
25 |
+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
26 |
+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
27 |
+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
28 |
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
resources/app/plugins/deepmoji_plugin/DeepMoji/emoji-2.8.0.dist-info/METADATA
ADDED
@@ -0,0 +1,182 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Metadata-Version: 2.1
|
2 |
+
Name: emoji
|
3 |
+
Version: 2.8.0
|
4 |
+
Summary: Emoji for Python
|
5 |
+
Home-page: https://github.com/carpedm20/emoji/
|
6 |
+
Author: Taehoon Kim, Kevin Wurster
|
7 |
+
Author-email: [email protected]
|
8 |
+
License: New BSD
|
9 |
+
Keywords: emoji
|
10 |
+
Classifier: Development Status :: 5 - Production/Stable
|
11 |
+
Classifier: Intended Audience :: Developers
|
12 |
+
Classifier: Intended Audience :: Information Technology
|
13 |
+
Classifier: License :: OSI Approved :: BSD License
|
14 |
+
Classifier: Operating System :: OS Independent
|
15 |
+
Classifier: Programming Language :: Python :: 3
|
16 |
+
Classifier: Programming Language :: Python :: 3.6
|
17 |
+
Classifier: Programming Language :: Python :: 3.7
|
18 |
+
Classifier: Programming Language :: Python :: 3.8
|
19 |
+
Classifier: Programming Language :: Python :: 3.9
|
20 |
+
Classifier: Programming Language :: Python :: 3.10
|
21 |
+
Classifier: Programming Language :: Python :: 3.11
|
22 |
+
Classifier: Programming Language :: Python :: Implementation :: CPython
|
23 |
+
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
24 |
+
Classifier: Programming Language :: Python
|
25 |
+
Classifier: Topic :: Internet :: WWW/HTTP :: Dynamic Content
|
26 |
+
Classifier: Topic :: Multimedia :: Graphics :: Presentation
|
27 |
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
28 |
+
Classifier: Typing :: Typed
|
29 |
+
Requires-Python: >=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*
|
30 |
+
License-File: LICENSE.txt
|
31 |
+
Provides-Extra: dev
|
32 |
+
Requires-Dist: pytest ; extra == 'dev'
|
33 |
+
Requires-Dist: coverage ; extra == 'dev'
|
34 |
+
Requires-Dist: coveralls ; extra == 'dev'
|
35 |
+
|
36 |
+
Emoji
|
37 |
+
=====
|
38 |
+
|
39 |
+
Emoji for Python. This project was inspired by `kyokomi <https://github.com/kyokomi/emoji>`__.
|
40 |
+
|
41 |
+
|
42 |
+
Example
|
43 |
+
-------
|
44 |
+
|
45 |
+
The entire set of Emoji codes as defined by the `Unicode consortium <https://unicode.org/emoji/charts/full-emoji-list.html>`__
|
46 |
+
is supported in addition to a bunch of `aliases <https://www.webfx.com/tools/emoji-cheat-sheet/>`__. By
|
47 |
+
default, only the official list is enabled but doing ``emoji.emojize(language='alias')`` enables
|
48 |
+
both the full list and aliases.
|
49 |
+
|
50 |
+
.. code-block:: python
|
51 |
+
|
52 |
+
>>> import emoji
|
53 |
+
>>> print(emoji.emojize('Python is :thumbs_up:'))
|
54 |
+
Python is 👍
|
55 |
+
>>> print(emoji.emojize('Python is :thumbsup:', language='alias'))
|
56 |
+
Python is 👍
|
57 |
+
>>> print(emoji.demojize('Python is 👍'))
|
58 |
+
Python is :thumbs_up:
|
59 |
+
>>> print(emoji.emojize("Python is fun :red_heart:"))
|
60 |
+
Python is fun ❤
|
61 |
+
>>> print(emoji.emojize("Python is fun :red_heart:", variant="emoji_type"))
|
62 |
+
Python is fun ❤️ #red heart, not black heart
|
63 |
+
>>> print(emoji.is_emoji("👍"))
|
64 |
+
True
|
65 |
+
|
66 |
+
..
|
67 |
+
|
68 |
+
By default, the language is English (``language='en'``) but also supported languages are:
|
69 |
+
|
70 |
+
* Spanish (``'es'``)
|
71 |
+
* Portuguese (``'pt'``)
|
72 |
+
* Italian (``'it'``)
|
73 |
+
* French (``'fr'``)
|
74 |
+
* German (``'de'``)
|
75 |
+
* Farsi/Persian (``'fa'``)
|
76 |
+
* Indonesian (``'id'``)
|
77 |
+
* Simplified Chinese (``'zh'``)
|
78 |
+
* Japanese (``'ja'``)
|
79 |
+
* Korean (``'ko'``)
|
80 |
+
|
81 |
+
|
82 |
+
.. code-block:: python
|
83 |
+
|
84 |
+
>>> print(emoji.emojize('Python es :pulgar_hacia_arriba:', language='es'))
|
85 |
+
Python es 👍
|
86 |
+
>>> print(emoji.demojize('Python es 👍', language='es'))
|
87 |
+
Python es :pulgar_hacia_arriba:
|
88 |
+
>>> print(emoji.emojize("Python é :polegar_para_cima:", language='pt'))
|
89 |
+
Python é 👍
|
90 |
+
>>> print(emoji.demojize("Python é 👍", language='pt'))
|
91 |
+
Python é :polegar_para_cima:
|
92 |
+
|
93 |
+
..
|
94 |
+
|
95 |
+
Installation
|
96 |
+
------------
|
97 |
+
|
98 |
+
Via pip:
|
99 |
+
|
100 |
+
.. code-block:: console
|
101 |
+
|
102 |
+
$ python -m pip install emoji --upgrade
|
103 |
+
|
104 |
+
From master branch:
|
105 |
+
|
106 |
+
.. code-block:: console
|
107 |
+
|
108 |
+
$ git clone https://github.com/carpedm20/emoji.git
|
109 |
+
$ cd emoji
|
110 |
+
$ python -m pip install .
|
111 |
+
|
112 |
+
|
113 |
+
Developing
|
114 |
+
----------
|
115 |
+
|
116 |
+
.. code-block:: console
|
117 |
+
|
118 |
+
$ git clone https://github.com/carpedm20/emoji.git
|
119 |
+
$ cd emoji
|
120 |
+
$ python -m pip install -e .\[dev\]
|
121 |
+
$ pytest
|
122 |
+
$ coverage run -m pytest
|
123 |
+
$ coverage report
|
124 |
+
|
125 |
+
The ``utils/get_codes_from_unicode_emoji_data_files.py`` is used to generate
|
126 |
+
``unicode_codes/data_dict.py``. Generally speaking it scrapes a table on the
|
127 |
+
`Unicode Consortium's website <https://www.unicode.org/reports/tr51/#emoji_data>`__
|
128 |
+
with `BeautifulSoup <http://www.crummy.com/software/BeautifulSoup/>`__
|
129 |
+
and prints the contents to ``stdout`` as a Python dictionary. For more
|
130 |
+
information take a look in the `utils/README.md <utils/README.md>`__ file.
|
131 |
+
|
132 |
+
|
133 |
+
Links
|
134 |
+
-----
|
135 |
+
|
136 |
+
**Documentation**
|
137 |
+
|
138 |
+
`https://carpedm20.github.io/emoji/docs/ <https://carpedm20.github.io/emoji/docs/>`__
|
139 |
+
|
140 |
+
**Overview of all emoji:**
|
141 |
+
|
142 |
+
`https://carpedm20.github.io/emoji/ <https://carpedm20.github.io/emoji/>`__
|
143 |
+
|
144 |
+
(auto-generated list of the emoji that are supported by the current version of this package)
|
145 |
+
|
146 |
+
**For English:**
|
147 |
+
|
148 |
+
`Emoji Cheat Sheet <https://www.webfx.com/tools/emoji-cheat-sheet/>`__
|
149 |
+
|
150 |
+
`Official Unicode list <http://www.unicode.org/emoji/charts/full-emoji-list.html>`__
|
151 |
+
|
152 |
+
**For Spanish:**
|
153 |
+
|
154 |
+
`Unicode list <https://emojiterra.com/es/lista-es/>`__
|
155 |
+
|
156 |
+
**For Portuguese:**
|
157 |
+
|
158 |
+
`Unicode list <https://emojiterra.com/pt/lista/>`__
|
159 |
+
|
160 |
+
**For Italian:**
|
161 |
+
|
162 |
+
`Unicode list <https://emojiterra.com/it/lista-it/>`__
|
163 |
+
|
164 |
+
**For French:**
|
165 |
+
|
166 |
+
`Unicode list <https://emojiterra.com/fr/liste-fr/>`__
|
167 |
+
|
168 |
+
**For German:**
|
169 |
+
|
170 |
+
`Unicode list <https://emojiterra.com/de/liste/>`__
|
171 |
+
|
172 |
+
|
173 |
+
Authors
|
174 |
+
-------
|
175 |
+
|
176 |
+
Taehoon Kim / `@carpedm20 <http://carpedm20.github.io/about/>`__
|
177 |
+
|
178 |
+
Kevin Wurster / `@geowurster <http://twitter.com/geowurster/>`__
|
179 |
+
|
180 |
+
Maintainer
|
181 |
+
----------
|
182 |
+
Tahir Jalilov / `@TahirJalilov <https://github.com/TahirJalilov>`__
|
resources/app/plugins/deepmoji_plugin/DeepMoji/emoji-2.8.0.dist-info/RECORD
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
emoji-2.8.0.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
|
2 |
+
emoji-2.8.0.dist-info/LICENSE.txt,sha256=4MY4T6OKBxBrTgAP-CcWXbBrXfB0AbsWoYWJIcg4B8w,1483
|
3 |
+
emoji-2.8.0.dist-info/METADATA,sha256=KN7KUqGmtWikpf_Y_xyy1_QKQjnm848vLMZvHAno5S4,5279
|
4 |
+
emoji-2.8.0.dist-info/RECORD,,
|
5 |
+
emoji-2.8.0.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
6 |
+
emoji-2.8.0.dist-info/WHEEL,sha256=m9WAupmBd2JGDsXWQGJgMGXIWbQY3F5c2xBJbBhq0nY,110
|
7 |
+
emoji-2.8.0.dist-info/top_level.txt,sha256=UxKwtYLYBTA8ldfisbxvrXDgSz3eVBOq51i2h2ewato,6
|
8 |
+
emoji-2.8.0.dist-info/zip-safe,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
9 |
+
emoji/__init__.py,sha256=oTfIYyWbRTIasfAY9smrDjojv4-B1IZ9Iyvb3byY2-Q,2358
|
10 |
+
emoji/__init__.pyi,sha256=1cDfESqCdRhx5WZoqibhao3maHpYXzE6hnfVV3PM4RY,929
|
11 |
+
emoji/__pycache__/__init__.cpython-311.pyc,,
|
12 |
+
emoji/__pycache__/core.cpython-311.pyc,,
|
13 |
+
emoji/__pycache__/tokenizer.cpython-311.pyc,,
|
14 |
+
emoji/core.py,sha256=W82lTuBGprXACnB4DEr3Lm8gbA5gV4JCgLnzPCxyBSk,13856
|
15 |
+
emoji/core.pyi,sha256=c_Zl07Rk9sVK_ZI3LMzPMgYfYHtGWn2JIftyCzzf9a4,1310
|
16 |
+
emoji/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
17 |
+
emoji/tokenizer.py,sha256=AXEgXpxdL_RYmRCl_Xl8m5jDQlN3hyXi6QN2fV6m104,11876
|
18 |
+
emoji/tokenizer.pyi,sha256=s76ejAXHxd9lNs4h4ey7MCBHLfeJyxoJRrDSKUwb6UQ,1201
|
19 |
+
emoji/unicode_codes/__init__.py,sha256=DOEFYlvT-18AjU4P8G9F8H1Bcjpj0mHiwjFh0JZoaiA,1291
|
20 |
+
emoji/unicode_codes/__init__.pyi,sha256=hH9q27ed2a45IHQ1tKPdMymb6l2IWL3BTJlL8bx8rY4,243
|
21 |
+
emoji/unicode_codes/__pycache__/__init__.cpython-311.pyc,,
|
22 |
+
emoji/unicode_codes/__pycache__/data_dict.cpython-311.pyc,,
|
23 |
+
emoji/unicode_codes/data_dict.py,sha256=IQAIkhb7TQqIdxulG4AJxII4BiGFDI3Ns67S49x0SP8,3135614
|
24 |
+
emoji/unicode_codes/data_dict.pyi,sha256=HiU_YbtX-SphSyD4ZZ2_veRbZ2OP9i08TAjiESdE8ng,155
|
25 |
+
emoji/unicode_codes/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
resources/app/plugins/deepmoji_plugin/DeepMoji/emoji-2.8.0.dist-info/REQUESTED
ADDED
File without changes
|
resources/app/plugins/deepmoji_plugin/DeepMoji/emoji-2.8.0.dist-info/WHEEL
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Wheel-Version: 1.0
|
2 |
+
Generator: bdist_wheel (0.41.1)
|
3 |
+
Root-Is-Purelib: true
|
4 |
+
Tag: py2-none-any
|
5 |
+
Tag: py3-none-any
|
6 |
+
|
resources/app/plugins/deepmoji_plugin/DeepMoji/emoji-2.8.0.dist-info/top_level.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
emoji
|
resources/app/plugins/deepmoji_plugin/DeepMoji/emoji-2.8.0.dist-info/zip-safe
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
|
resources/app/plugins/deepmoji_plugin/DeepMoji/emoji/__init__.py
ADDED
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
emoji for Python
|
3 |
+
~~~~~~~~~~~~~~~~
|
4 |
+
|
5 |
+
emoji terminal output for Python.
|
6 |
+
|
7 |
+
>>> import emoji
|
8 |
+
>>> print(emoji.emojize('Python is :thumbsup:', language='alias'))
|
9 |
+
Python is 👍
|
10 |
+
>>> print(emoji.emojize('Python is :thumbs_up:'))
|
11 |
+
Python is 👍
|
12 |
+
"""
|
13 |
+
|
14 |
+
|
15 |
+
from emoji.core import *
|
16 |
+
from emoji.unicode_codes import *
|
17 |
+
|
18 |
+
__all__ = [
|
19 |
+
# emoji.core
|
20 |
+
'emojize', 'demojize', 'analyze', 'config',
|
21 |
+
'emoji_list', 'distinct_emoji_list', 'emoji_count',
|
22 |
+
'replace_emoji', 'is_emoji', 'purely_emoji', 'version',
|
23 |
+
'Token', 'EmojiMatch', 'EmojiMatchZWJ', 'EmojiMatchZWJNonRGI',
|
24 |
+
# emoji.unicode_codes
|
25 |
+
'EMOJI_DATA', 'STATUS', 'LANGUAGES',
|
26 |
+
]
|
27 |
+
|
28 |
+
__version__ = '2.8.0'
|
29 |
+
__author__ = 'Taehoon Kim, Kevin Wurster'
|
30 |
+
__email__ = '[email protected]'
|
31 |
+
# and [email protected], [email protected]
|
32 |
+
__source__ = 'https://github.com/carpedm20/emoji/'
|
33 |
+
__license__ = '''
|
34 |
+
New BSD License
|
35 |
+
|
36 |
+
Copyright (c) 2014-2023, Taehoon Kim, Kevin Wurster
|
37 |
+
All rights reserved.
|
38 |
+
|
39 |
+
Redistribution and use in source and binary forms, with or without
|
40 |
+
modification, are permitted provided that the following conditions are met:
|
41 |
+
|
42 |
+
* Redistributions of source code must retain the above copyright notice, this
|
43 |
+
list of conditions and the following disclaimer.
|
44 |
+
|
45 |
+
* Redistributions in binary form must reproduce the above copyright notice,
|
46 |
+
this list of conditions and the following disclaimer in the documentation
|
47 |
+
and/or other materials provided with the distribution.
|
48 |
+
|
49 |
+
* The names of its contributors may not be used to endorse or promote products
|
50 |
+
derived from this software without specific prior written permission.
|
51 |
+
|
52 |
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
53 |
+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
54 |
+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
55 |
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
56 |
+
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
57 |
+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
58 |
+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
59 |
+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
60 |
+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
61 |
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
62 |
+
'''
|
resources/app/plugins/deepmoji_plugin/DeepMoji/emoji/__init__.pyi
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from .core import (
|
2 |
+
demojize as demojize,
|
3 |
+
distinct_emoji_list as distinct_emoji_list,
|
4 |
+
emoji_count as emoji_count,
|
5 |
+
emoji_list as emoji_list,
|
6 |
+
emojize as emojize,
|
7 |
+
is_emoji as is_emoji,
|
8 |
+
replace_emoji as replace_emoji,
|
9 |
+
version as version,
|
10 |
+
analyze as analyze,
|
11 |
+
config as config,
|
12 |
+
)
|
13 |
+
from .tokenizer import (
|
14 |
+
Token as Token,
|
15 |
+
EmojiMatch as EmojiMatch,
|
16 |
+
EmojiMatchZWJ as EmojiMatchZWJ,
|
17 |
+
EmojiMatchZWJNonRGI as EmojiMatchZWJNonRGI,
|
18 |
+
)
|
19 |
+
|
20 |
+
|
21 |
+
from .unicode_codes import EMOJI_DATA, LANGUAGES, STATUS
|
22 |
+
|
23 |
+
__all__ = [
|
24 |
+
# emoji.core
|
25 |
+
'emojize', 'demojize', 'analyze', 'config',
|
26 |
+
'emoji_list', 'distinct_emoji_list', 'emoji_count',
|
27 |
+
'replace_emoji', 'is_emoji', 'version',
|
28 |
+
'Token', 'EmojiMatch', 'EmojiMatchZWJ', 'EmojiMatchZWJNonRGI',
|
29 |
+
# emoji.unicode_codes
|
30 |
+
'EMOJI_DATA', 'STATUS', 'LANGUAGES',
|
31 |
+
]
|
32 |
+
|
33 |
+
__version__: str
|
34 |
+
__author__: str
|
35 |
+
__email__: str
|
36 |
+
__source__: str
|
37 |
+
__license__: str
|
resources/app/plugins/deepmoji_plugin/DeepMoji/emoji/core.py
ADDED
@@ -0,0 +1,372 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
emoji.core
|
3 |
+
~~~~~~~~~~
|
4 |
+
|
5 |
+
Core components for emoji.
|
6 |
+
|
7 |
+
"""
|
8 |
+
|
9 |
+
import re
|
10 |
+
import unicodedata
|
11 |
+
from typing import Iterator
|
12 |
+
|
13 |
+
from emoji import unicode_codes
|
14 |
+
from emoji.tokenizer import Token, EmojiMatch, EmojiMatchZWJ, EmojiMatchZWJNonRGI, tokenize, filter_tokens
|
15 |
+
|
16 |
+
__all__ = [
|
17 |
+
'emojize', 'demojize', 'analyze', 'config',
|
18 |
+
'emoji_list', 'distinct_emoji_list', 'emoji_count',
|
19 |
+
'replace_emoji', 'is_emoji', 'purely_emoji', 'version',
|
20 |
+
'Token', 'EmojiMatch', 'EmojiMatchZWJ', 'EmojiMatchZWJNonRGI',
|
21 |
+
]
|
22 |
+
|
23 |
+
_DEFAULT_DELIMITER = ':'
|
24 |
+
_EMOJI_NAME_PATTERN = '\\w\\-&.βββ()!#*+,/«»\u0300\u0301\u0302\u0303\u0308\u030a\u0327\u064b\u064e\u064f\u0650\u0653\u0654\u3099\u30fb\u309a'
|
25 |
+
|
26 |
+
|
27 |
+
class config:
    """Module-wide configuration.

    The attributes are looked up by :func:`emoji.demojize()` and
    :func:`emoji.replace_emoji()` each time they are called, so assigning a
    new value affects all subsequent calls.
    """

    demojize_keep_zwj = True
    """Change the behavior of :func:`emoji.demojize()` regarding
    zero-width-joiners (ZWJ/``\\u200D``) in emoji that are not
    "recommended for general interchange" (non-RGI).
    It has no effect on RGI emoji.

    For example this family emoji with different skin tones "👨‍👩🏿‍👧🏻‍👦🏾" contains four
    person emoji that are joined together by three ZWJ characters:
    ``👨\\u200D👩🏿\\u200D👧🏻\\u200D👦🏾``

    If ``True``, the zero-width-joiners will be kept and :func:`emoji.emojize()` can
    reverse the :func:`emoji.demojize()` operation:
    ``emoji.emojize(emoji.demojize(s)) == s``

    The example emoji would be converted to
    ``:man:\\u200d:woman_dark_skin_tone:\\u200d:girl_light_skin_tone:\\u200d:boy_medium-dark_skin_tone:``

    If ``False``, the zero-width-joiners will be removed and :func:`emoji.emojize()`
    can only reverse the individual emoji: ``emoji.emojize(emoji.demojize(s)) != s``

    The example emoji would be converted to
    ``:man::woman_dark_skin_tone::girl_light_skin_tone::boy_medium-dark_skin_tone:``
    """

    replace_emoji_keep_zwj = False
    """Change the behavior of :func:`emoji.replace_emoji()` regarding
    zero-width-joiners (ZWJ/``\\u200D``) in emoji that are not
    "recommended for general interchange" (non-RGI).
    It has no effect on RGI emoji.

    See :attr:`config.demojize_keep_zwj` for more information.
    """
|
62 |
+
|
63 |
+
|
64 |
+
def emojize(
        string,
        delimiters=(_DEFAULT_DELIMITER, _DEFAULT_DELIMITER),
        variant=None,
        language='en',
        version=None,
        handle_version=None
):
    """
    Replace emoji names in a string with Unicode codes.
        >>> import emoji
        >>> print(emoji.emojize("Python is fun :thumbsup:", language='alias'))
        Python is fun 👍
        >>> print(emoji.emojize("Python is fun :thumbs_up:"))
        Python is fun 👍
        >>> print(emoji.emojize("Python is fun {thumbs_up}", delimiters = ("{", "}")))
        Python is fun 👍
        >>> print(emoji.emojize("Python is fun :red_heart:", variant="text_type"))
        Python is fun ❤
        >>> print(emoji.emojize("Python is fun :red_heart:", variant="emoji_type"))
        Python is fun ❤️ # red heart, not black heart

    :param string: String contains emoji names.
    :param delimiters: (optional) Use delimiters other than _DEFAULT_DELIMITER. Each delimiter
        should contain at least one character that is not part of a-zA-Z0-9 and ``_-&.()!?#*+,``.
        See ``emoji.core._EMOJI_NAME_PATTERN`` for the regular expression of unsafe characters.
        An empty string is accepted for either delimiter.
    :param variant: (optional) Choose variation selector between "base"(None), VS-15 ("text_type") and VS-16 ("emoji_type")
    :param language: Choose language of emoji name: language code 'es', 'de', etc. or 'alias'
        to use English aliases
    :param version: (optional) Max version. If set to an Emoji Version,
        all emoji above this version will be ignored.
    :param handle_version: (optional) Replace the emoji above ``version``
        instead of ignoring it. handle_version can be either a string or a
        callable; If it is a callable, it's passed the Unicode emoji and a copy
        of the data dict from :data:`EMOJI_DATA` (extended with the keys
        ``match_start`` and ``match_end``) and must return a replacement string
        to be used::

            handle_version('\\U0001F6EB', {
                'en' : ':airplane_departure:',
                'status' : fully_qualified,
                'E' : 1,
                'alias' : [':flight_departure:'],
                'de': ':abflug:',
                'es': ':avión_despegando:',
                ...
            })

    :raises ValueError: if a matched emoji supports variants and ``variant``
        is neither None, 'text_type' or 'emoji_type'

    """

    if language == 'alias':
        language_pack = unicode_codes.get_aliases_unicode_dict()
    else:
        language_pack = unicode_codes.get_emoji_unicode_dict(language)

    pattern = re.compile('(%s[%s]+%s)' %
                         (re.escape(delimiters[0]), _EMOJI_NAME_PATTERN, re.escape(delimiters[1])))

    prefix_len = len(delimiters[0])
    suffix_len = len(delimiters[1])

    def replace(match):
        matched = match.group(1)
        # Use an explicit end index: a negative-offset slice ``[a:-0]`` would
        # be empty when the closing delimiter is the empty string.
        name = matched[prefix_len:len(matched) - suffix_len]
        # The lookup tables are always keyed with the default delimiter.
        emj = language_pack.get(
            _DEFAULT_DELIMITER +
            unicodedata.normalize('NFKC', name) +
            _DEFAULT_DELIMITER)
        if emj is None:
            # Unknown name: leave the text untouched
            return matched

        if version is not None and unicode_codes.EMOJI_DATA[emj]['E'] > version:
            if callable(handle_version):
                emj_data = unicode_codes.EMOJI_DATA[emj].copy()
                emj_data['match_start'] = match.start()
                emj_data['match_end'] = match.end()
                return handle_version(emj, emj_data)

            elif handle_version is not None:
                return str(handle_version)
            else:
                return ''

        if variant is None or 'variant' not in unicode_codes.EMOJI_DATA[emj]:
            return emj

        if emj[-1] == '\uFE0E' or emj[-1] == '\uFE0F':
            # Remove an existing variant
            emj = emj[0:-1]
        if variant == "text_type":
            return emj + '\uFE0E'
        elif variant == "emoji_type":
            return emj + '\uFE0F'
        else:
            raise ValueError(
                "Parameter 'variant' must be either None, 'text_type' or 'emoji_type'")

    return pattern.sub(replace, string)
|
159 |
+
|
160 |
+
|
161 |
+
def analyze(string: str, non_emoji: bool = False, join_emoji: bool = True) -> Iterator[Token]:
    """
    Find unicode emoji in a string. Yield each emoji as a named tuple
    :class:`Token` ``(chars, EmojiMatch)`` or :class:`Token` ``(chars, EmojiMatchZWJNonRGI)``.
    If ``non_emoji`` is True, also yield all other characters as
    :class:`Token` ``(char, char)`` .

    :param string: String to analyze
    :param non_emoji: If True also yield all non-emoji characters as Token(char, char)
    :param join_emoji: If True, multiple EmojiMatch are merged into a single
        EmojiMatchZWJNonRGI if they are separated only by a ZWJ.
    """

    # ZWJ characters are always kept while tokenizing; filter_tokens() decides
    # what to do with them afterwards.
    raw_tokens = tokenize(string, keep_zwj=True)
    emoji_only = not non_emoji
    return filter_tokens(raw_tokens, emoji_only=emoji_only, join_emoji=join_emoji)
|
176 |
+
|
177 |
+
|
178 |
+
def demojize(
        string,
        delimiters=(_DEFAULT_DELIMITER, _DEFAULT_DELIMITER),
        language='en',
        version=None,
        handle_version=None
):
    """
    Replace Unicode emoji in a string with emoji shortcodes. Useful for storage.
        >>> import emoji
        >>> print(emoji.emojize("Python is fun :thumbs_up:"))
        Python is fun 👍
        >>> print(emoji.demojize("Python is fun 👍"))
        Python is fun :thumbs_up:
        >>> print(emoji.demojize("Unicode is tricky 😯", delimiters=("__", "__")))
        Unicode is tricky __hushed_face__

    :param string: String contains Unicode characters. MUST BE UNICODE.
    :param delimiters: (optional) User delimiters other than ``_DEFAULT_DELIMITER``
    :param language: Choose language of emoji name: language code 'es', 'de', etc. or 'alias'
        to use English aliases
    :param version: (optional) Max version. If set to an Emoji Version,
        all emoji above this version will be removed.
    :param handle_version: (optional) Replace the emoji above ``version``
        instead of removing it. handle_version can be either a string or a
        callable ``handle_version(emj: str, data: dict) -> str``; If it is
        a callable, it's passed the Unicode emoji and the data dict from
        :data:`EMOJI_DATA` and must return a replacement string to be used.
        The passed data is in the form of::

            handle_version('\\U0001F6EB', {
                'en' : ':airplane_departure:',
                'status' : fully_qualified,
                'E' : 1,
                'alias' : [':flight_departure:'],
                'de': ':abflug:',
                'es': ':avión_despegando:',
                ...
            })

    """

    # 'alias' is not a language of its own: aliases belong to the English names.
    _use_aliases = language == 'alias'
    if _use_aliases:
        language = 'en'

    def handle(emoji_match):
        # Emoji newer than the requested version are dropped or substituted.
        if version is not None and emoji_match.data['E'] > version:
            if callable(handle_version):
                return handle_version(emoji_match.emoji, emoji_match.data_copy())
            if handle_version is not None:
                return handle_version
            return ''

        if language not in emoji_match.data:
            # The emoji exists, but it is not translated, so we keep the emoji
            return emoji_match.emoji

        # Stored names carry the default delimiters; strip them with [1:-1]
        # and wrap the bare name in the requested ones.
        if _use_aliases and 'alias' in emoji_match.data:
            return delimiters[0] + emoji_match.data['alias'][0][1:-1] + delimiters[1]
        return delimiters[0] + emoji_match.data[language][1:-1] + delimiters[1]

    def render(token):
        value = token.value
        if isinstance(value, EmojiMatch):
            return str(handle(value))
        return value

    return "".join(map(render, tokenize(string, keep_zwj=config.demojize_keep_zwj)))
|
246 |
+
|
247 |
+
|
248 |
+
def replace_emoji(string, replace='', version=-1):
    """
    Replace Unicode emoji in a customizable string.

    :param string: String contains Unicode characters. MUST BE UNICODE.
    :param replace: (optional) replace can be either a string or a callable;
        If it is a callable, it's passed the Unicode emoji and the data dict from
        :data:`EMOJI_DATA` and must return a replacement string to be used.
        replace(str, dict) -> str
    :param version: (optional) Max version. If set to an Emoji Version,
        only emoji above this version will be replaced.
    """

    def substitute(emoji_match):
        version_limited = version > -1
        if version_limited and not emoji_match.data['E'] > version:
            # Emoji is within the allowed version: keep it as it is
            return emoji_match.emoji
        if callable(replace):
            return replace(emoji_match.emoji, emoji_match.data_copy())
        if version_limited:
            return str(replace)
        if replace is not None:
            return replace
        return emoji_match.emoji

    tokens = tokenize(string, keep_zwj=config.replace_emoji_keep_zwj)
    if config.replace_emoji_keep_zwj:
        # Merge emoji that are separated only by a ZWJ into a single match
        tokens = filter_tokens(
            tokens, emoji_only=False, join_emoji=True)

    pieces = []
    for token in tokens:
        value = token.value
        if isinstance(value, EmojiMatch):
            pieces.append(str(substitute(value)))
        else:
            pieces.append(value)
    return "".join(pieces)
|
280 |
+
|
281 |
+
|
282 |
+
def emoji_list(string):
    """
    Returns the location and emoji in list of dict format.
        >>> emoji.emoji_list("Hi, I am fine. 😁")
        [{'match_start': 15, 'match_end': 16, 'emoji': '😁'}]
    """

    found = []
    for token in tokenize(string, keep_zwj=False):
        match = token.value
        if not isinstance(match, EmojiMatch):
            continue
        found.append({
            'match_start': match.start,
            'match_end': match.end,
            'emoji': match.emoji,
        })
    return found
|
294 |
+
|
295 |
+
|
296 |
+
def distinct_emoji_list(string):
    """
    Returns distinct list of emojis from the string.

    Each emoji appears once, in the order of its first occurrence in
    ``string``, so the result is the same on every run.

    :param string: String that may contain Unicode emoji
    """
    # dict keys deduplicate like a set but preserve insertion order
    return list(dict.fromkeys(e['emoji'] for e in emoji_list(string)))
|
302 |
+
|
303 |
+
|
304 |
+
def emoji_count(string, unique=False):
    """
    Returns the count of emojis in a string.

    :param string: String that may contain Unicode emoji
    :param unique: (optional) True if count only unique emojis
    """
    emojis = distinct_emoji_list(string) if unique else emoji_list(string)
    return len(emojis)
|
313 |
+
|
314 |
+
|
315 |
+
def is_emoji(string):
    """
    Returns True if the string is a single emoji, and it is "recommended for
    general interchange" by Unicode.org.

    The check is a plain membership test against :data:`EMOJI_DATA`.
    """
    known_emoji = unicode_codes.EMOJI_DATA
    return string in known_emoji
|
321 |
+
|
322 |
+
|
323 |
+
def purely_emoji(string: str) -> bool:
    """
    Returns True if the string contains only emojis.
    This might not imply that `is_emoji` for all the characters, for example,
    if the string contains variation selectors.
    """
    for token in analyze(string, non_emoji=True):
        if not isinstance(token.value, EmojiMatch):
            # Found a token that is not an emoji match
            return False
    return True
|
330 |
+
|
331 |
+
|
332 |
+
def version(string):
    """
    Returns the Emoji Version of the emoji.

    See https://www.unicode.org/reports/tr51/#Versioning for more information.
        >>> emoji.version("😁")
        0.6
        >>> emoji.version(":butterfly:")
        3

    :param string: An emoji or a text containing an emoji
    :raises ValueError: if ``string`` does not contain an emoji
    """
    emoji_data = unicode_codes.EMOJI_DATA

    # Fast path: the whole string is an emoji
    if string in emoji_data:
        return emoji_data[string]['E']

    # Fast path: the whole string is an English emoji name
    english_names = unicode_codes.get_emoji_unicode_dict('en')
    if string in english_names:
        emj_code = english_names[string]
        if emj_code in emoji_data:
            return emoji_data[emj_code]['E']

    # Slow path: scan the text and record the version of every emoji found;
    # the first recorded one is the answer.
    found_versions = []

    def record(_emoji, data):
        found_versions.append(data['E'])
        return ''

    # Unicode emoji anywhere in the text
    replace_emoji(string, replace=record, version=-1)
    if found_versions:
        return found_versions[0]

    # English alias names
    emojize(string, language='alias', version=-1, handle_version=record)
    if found_versions:
        return found_versions[0]

    # Emoji names in every other available language
    for lang_code in unicode_codes._EMOJI_UNICODE:
        emojize(string, language=lang_code, version=-1, handle_version=record)
        if found_versions:
            return found_versions[0]

    raise ValueError("No emoji found in string")
|
resources/app/plugins/deepmoji_plugin/DeepMoji/emoji/core.pyi
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from collections.abc import Callable
|
2 |
+
from typing_extensions import Literal, TypedDict
|
3 |
+
from typing import Iterator
|
4 |
+
from .tokenizer import Token
|
5 |
+
|
6 |
+
|
7 |
+
# Type stubs for emoji.core; kept in step with the public names in core.__all__.

class config:
    demojize_keep_zwj: bool
    replace_emoji_keep_zwj: bool


# Shape of each entry returned by emoji_list().
class _EmojiListReturn(TypedDict):
    emoji: str
    match_start: int
    match_end: int


def emojize(
    string: str,
    delimiters: tuple[str, str] = ...,
    variant: Literal["text_type", "emoji_type", None] = ...,
    language: str = ...,
    version: float | None = ...,
    handle_version: str | Callable[[str, dict[str, str]], str] | None = ...,
) -> str: ...


def demojize(
    string: str,
    delimiters: tuple[str, str] = ...,
    language: str = ...,
    version: float | None = ...,
    handle_version: str | Callable[[str, dict[str, str]], str] | None = ...,
) -> str: ...


# non_emoji and join_emoji have defaults at runtime, so they are optional here too.
def analyze(string: str, non_emoji: bool = ...,
            join_emoji: bool = ...) -> Iterator[Token]: ...
def replace_emoji(string: str, replace: str | Callable[[
                  str, dict[str, str]], str] = ..., version: float = ...) -> str: ...


def emoji_list(string: str) -> list[_EmojiListReturn]: ...
def distinct_emoji_list(string: str) -> list[str]: ...
def emoji_count(string: str, unique: bool = ...) -> int: ...
def version(string: str) -> float: ...
def is_emoji(string: str) -> bool: ...
def purely_emoji(string: str) -> bool: ...
|
resources/app/plugins/deepmoji_plugin/DeepMoji/emoji/py.typed
ADDED
File without changes
|
resources/app/plugins/deepmoji_plugin/DeepMoji/emoji/tokenizer.py
ADDED
@@ -0,0 +1,361 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
emoji.tokenizer
|
3 |
+
~~~~~~~~~~~~~~~
|
4 |
+
|
5 |
+
Components for detecting and tokenizing emoji in strings.
|
6 |
+
|
7 |
+
"""
|
8 |
+
from typing import NamedTuple, Dict, Union, Iterator, Any
|
9 |
+
from emoji import unicode_codes
|
10 |
+
|
11 |
+
|
12 |
+
__all__ = [
|
13 |
+
'EmojiMatch', 'EmojiMatchZWJ', 'EmojiMatchZWJNonRGI', 'Token',
|
14 |
+
'tokenize', 'filter_tokens',
|
15 |
+
]
|
16 |
+
|
17 |
+
_ZWJ = '\u200D'
|
18 |
+
_SEARCH_TREE = None
|
19 |
+
|
20 |
+
|
21 |
+
class EmojiMatch:
    """
    A single occurrence of a "recommended for general interchange" (RGI)
    emoji inside a string, together with its position and its data entry.
    """

    __slots__ = ('emoji', 'start', 'end', 'data')

    def __init__(self, emoji: str, start: int,
                 end: int, data: Union[dict, None]):

        #: The emoji substring
        self.emoji = emoji

        #: The start index of the match in the string
        self.start = start

        #: The end index of the match in the string
        self.end = end

        #: The entry from :data:`EMOJI_DATA` for this emoji or ``None`` if the emoji is non-RGI
        self.data = data

    def data_copy(self) -> Dict[str, Any]:
        """
        Returns a copy of the data from :data:`EMOJI_DATA` for this match
        with the additional keys ``match_start`` and ``match_end``.
        When the match has no data, only those two keys are returned.
        """
        position = {
            'match_start': self.start,
            'match_end': self.end
        }
        if not self.data:
            return position
        merged = self.data.copy()
        merged.update(position)
        return merged

    def is_zwj(self) -> bool:
        """
        Checks if this is a ZWJ-emoji.

        :returns: True if this is a ZWJ-emoji, False otherwise
        """

        return _ZWJ in self.emoji

    def split(self) -> Union['EmojiMatchZWJ', 'EmojiMatch']:
        """
        Splits a ZWJ-emoji into its constituents.

        :returns: An :class:`EmojiMatchZWJ` containing the "sub-emoji" if this is a ZWJ-emoji, otherwise self
        """

        return EmojiMatchZWJ(self) if self.is_zwj() else self

    def __repr__(self) -> str:
        return '{}({}, {}:{})'.format(
            self.__class__.__name__, self.emoji, self.start, self.end)
|
83 |
+
|
84 |
+
|
85 |
+
class EmojiMatchZWJ(EmojiMatch):
    """
    A match of several emoji in a string that are connected by
    zero-width-joiners (ZWJ/``\\u200D``). The individual emoji are available
    in ``emojis``."""

    __slots__ = ('emojis', )

    def __init__(self, match: EmojiMatch):
        super().__init__(match.emoji, match.start, match.end, match.data)

        #: List of sub emoji as EmojiMatch objects
        self.emojis = []

        offset = match.start
        for part in match.emoji.split(_ZWJ):
            stop = offset + len(part)
            sub_match = EmojiMatch(
                part, offset, stop, unicode_codes.EMOJI_DATA.get(part, None))
            self.emojis.append(sub_match)
            # Skip the joiner that followed this part
            offset = stop + 1

    def join(self) -> str:
        """
        Joins a ZWJ-emoji into a string
        """

        return _ZWJ.join(sub.emoji for sub in self.emojis)

    def is_zwj(self) -> bool:
        return True

    def split(self) -> 'EmojiMatchZWJ':
        return self

    def __repr__(self) -> str:
        return '{}({}, {}:{})'.format(
            self.__class__.__name__, self.join(), self.start, self.end)
|
120 |
+
|
121 |
+
|
122 |
+
class EmojiMatchZWJNonRGI(EmojiMatchZWJ):
    """
    A match of several emoji in a string that are connected by
    zero-width-joiners (ZWJ/``\\u200D``), where the combination is not
    "recommended for general interchange" (non-RGI) by Unicode.org.
    The data property of this class is always None.
    """

    def __init__(self, first_emoji_match: EmojiMatch,
                 second_emoji_match: EmojiMatch):

        #: List of sub emoji as EmojiMatch objects
        self.emojis = [first_emoji_match, second_emoji_match]

        self._update()

    def _update(self):
        # Recompute the combined text and span from the current sub emoji.
        parts = self.emojis
        self.emoji = _ZWJ.join(part.emoji for part in parts)
        self.start = parts[0].start
        self.end = parts[-1].end
        self.data = None

    def _add(self, next_emoji_match: EmojiMatch):
        # Extend the sequence by one more emoji and refresh the derived fields.
        self.emojis.append(next_emoji_match)
        self._update()
|
147 |
+
|
148 |
+
|
149 |
+
class Token(NamedTuple):
    """
    One unit of a tokenized string as a named tuple ``(chars, value)``.

    ``chars`` is the matched text. ``value`` is the :class:`EmojiMatch` object
    when the text is an emoji; for a single character that is not a unicode
    emoji it is that same character.
    """
    chars: str
    value: Union[str, EmojiMatch]
|
156 |
+
|
157 |
+
|
158 |
+
def tokenize(string, keep_zwj: bool) -> Iterator[Token]:
    """
    Finds unicode emoji in a string. Yields all normal characters as a named
    tuple :class:`Token` ``(char, char)`` and all emoji as :class:`Token` ``(chars, EmojiMatch)``.

    The string is scanned left to right against the search tree. Tokens are
    buffered and only yielded once it is certain that they are not part of an
    ongoing ZWJ-sequence. The variation selectors ``\\uFE0E`` and ``\\uFE0F``
    are dropped when they are not part of a matched emoji.

    :param string: String contains unicode characters. MUST BE UNICODE.
    :param keep_zwj: Should ZWJ-characters (``\\u200D``) that join non-RGI emoji be
        skipped or should be yielded as normal characters
    :return: An iterable of tuples :class:`Token` ``(char, char)`` or :class:`Token` ``(chars, EmojiMatch)``
    """

    tree = get_search_tree()
    EMOJI_DATA = unicode_codes.EMOJI_DATA
    # result: [ Token(oldsubstring0, EmojiMatch), Token(char1, char1), ... ]
    result = []
    i = 0
    length = len(string)
    # Index of chars in string that are skipped, i.e. the ZWJ-char in
    # non-RGI-ZWJ-sequences. A set, because it is only used for membership
    # tests inside the scan loop.
    ignore = set()
    while i < length:
        consumed = False
        char = string[i]
        if i in ignore:
            i += 1
            if char == _ZWJ and keep_zwj:
                result.append(Token(char, char))
            continue

        elif char in tree:
            # Walk down the tree as far as the following characters allow
            j = i + 1
            sub_tree = tree[char]
            while j < length and string[j] in sub_tree:
                if j in ignore:
                    break
                sub_tree = sub_tree[string[j]]
                j += 1
            if 'data' in sub_tree:
                emj_data = sub_tree['data']
                code_points = string[i:j]

                # We cannot yield the result here, we need to defer
                # the call until we are sure that the emoji is finished
                # i.e. we're not inside an ongoing ZWJ-sequence
                match_obj = EmojiMatch(code_points, i, j, emj_data)

                i = j - 1
                consumed = True
                result.append(Token(code_points, match_obj))

        elif char == _ZWJ and result and result[-1].chars in EMOJI_DATA and i > 0 and string[i - 1] in tree:
            # the current char is ZWJ and the last match was an emoji
            ignore.add(i)
            if EMOJI_DATA[result[-1].chars]["status"] == unicode_codes.STATUS["component"]:
                # last match was a component, it could be ZWJ+EMOJI+COMPONENT
                # or ZWJ+COMPONENT
                i = i - sum(len(t.chars) for t in result[-2:])
                if string[i] == _ZWJ:
                    # It's ZWJ+COMPONENT, move one back
                    i += 1
                    del result[-1]
                else:
                    # It's ZWJ+EMOJI+COMPONENT, move two back
                    del result[-2:]
            else:
                # last match result[-1] was a normal emoji, move cursor
                # before the emoji
                i = i - len(result[-1].chars)
                del result[-1]
            # Re-scan from the moved cursor; the ZWJ at the recorded index
            # now stops the tree walk.
            continue

        elif result:
            # A normal character ends any pending sequence: flush the buffer
            yield from result
            result = []

        if not consumed and char != '\uFE0E' and char != '\uFE0F':
            result.append(Token(char, char))
        i += 1

    yield from result
|
236 |
+
|
237 |
+
|
238 |
+
def filter_tokens(matches: Iterator[Token], emoji_only: bool, join_emoji: bool) -> Iterator[Token]:
    """
    Post-process the token stream produced by `tokenize()`.

    :param matches: An iterator of :class:`Token` ``(chars, value)`` where
        ``value`` is either an EmojiMatch or a plain string.
    :param emoji_only: With ``join_emoji=True``: non-emoji tokens are dropped
        from the output. With ``join_emoji=False``: only tokens whose
        characters are a single ZWJ are dropped.
        If both flags are False the input is passed through unchanged.
    :param join_emoji: If True, EmojiMatch tokens that are separated only by
        a ZWJ are merged into a single :class:`EmojiMatchZWJNonRGI`.

    :return: An iterator of :class:`Token` ``(char, char)``,
        :class:`Token` ``(chars, EmojiMatch)`` or
        :class:`Token` ``(chars, EmojiMatchZWJNonRGI)``
    """
    if not join_emoji:
        if emoji_only:
            # No joining requested: only the bare ZWJ tokens are removed.
            for tok in matches:
                if tok.chars != _ZWJ:
                    yield tok
        else:
            yield from matches
        return

    # Emoji tokens are buffered in `pending` until a non-emoji token (or the
    # end of the stream) proves that no further ZWJ-join can extend them.
    pending = []
    last_was_emoji = False
    zwj_pending = False
    for tok in matches:
        emoji_before = last_was_emoji

        if emoji_before and tok.value == _ZWJ:
            # A ZWJ directly after an emoji: remember it, emit nothing yet.
            zwj_pending = True
            continue

        if not isinstance(tok.value, EmojiMatch):
            # Ordinary character: flush the buffered emoji, then the
            # character itself unless only emoji were asked for.
            last_was_emoji = False
            zwj_pending = False
            yield from pending
            if not emoji_only:
                yield tok
            pending = []
            continue

        if emoji_before and zwj_pending:
            # EMOJI + ZWJ + EMOJI: fold this emoji into the previous entry.
            joined_chars = pending[-1].chars + _ZWJ + tok.chars
            if isinstance(pending[-1].value, EmojiMatchZWJNonRGI):
                pending[-1].value._add(tok.value)
                pending[-1] = Token(joined_chars, pending[-1].value)
            else:
                head = pending.pop()
                pending.append(
                    Token(joined_chars, EmojiMatchZWJNonRGI(head.value, tok.value)))
        else:
            pending.append(tok)
        last_was_emoji = True
        zwj_pending = False

    yield from pending
|
299 |
+
|
300 |
+
|
301 |
+
def get_search_tree() -> Dict[str, Any]:
    """
    Return the search tree (a trie keyed by single characters) for demojize().

    The tree is built from ``unicode_codes.EMOJI_DATA`` on the first call and
    stored in the module-level ``_SEARCH_TREE``; later calls return that
    same object.

    Every key of EMOJI_DATA is spelled out character by character as a path
    of nested dicts. The node reached by the last character of a key holds
    that key's EMOJI_DATA entry under the extra key ``'data'``.

    Example::

        EMOJI_DATA =
            {'a': {'en': ':Apple:'},
             'b': {'en': ':Bus:'},
             'ba': {'en': ':Bat:'},
             'band': {'en': ':Beatles:'},
             'bandit': {'en': ':Outlaw:'},
             'bank': {'en': ':BankOfEngland:'},
             'bb': {'en': ':BB-gun:'},
             'c': {'en': ':Car:'}}

        _SEARCH_TREE =
            {'a': {'data': {'en': ':Apple:'}},
             'b': {'a': {'data': {'en': ':Bat:'},
                         'n': {'d': {'data': {'en': ':Beatles:'},
                                     'i': {'t': {'data': {'en': ':Outlaw:'}}}},
                               'k': {'data': {'en': ':BankOfEngland:'}}}},
                   'b': {'data': {'en': ':BB-gun:'}},
                   'data': {'en': ':Bus:'}},
             'c': {'data': {'en': ':Car:'}}}

    A node may carry ``'data'`` and child characters at the same time
    (``'b'`` above is both ``:Bus:`` and the prefix of ``'ba'``/``'bb'``).
    """
    global _SEARCH_TREE
    if _SEARCH_TREE is not None:
        return _SEARCH_TREE

    _SEARCH_TREE = {}
    for sequence, data in unicode_codes.EMOJI_DATA.items():
        node = _SEARCH_TREE
        for code_point in sequence:
            # Descend one level, creating the child node on first use.
            node = node.setdefault(code_point, {})
        if sequence:
            node['data'] = data
    return _SEARCH_TREE
|
resources/app/plugins/deepmoji_plugin/DeepMoji/emoji/tokenizer.pyi
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import NamedTuple, Union, Dict, Iterator, Any
|
2 |
+
|
3 |
+
_SearchTree = Dict[str, Union['_SearchTree', dict[str, dict[str, Any]]]]
|
4 |
+
|
5 |
+
_SEARCH_TREE: _SearchTree
|
6 |
+
|
7 |
+
|
8 |
+
class EmojiMatch:
|
9 |
+
emoji: str
|
10 |
+
start: int
|
11 |
+
end: int
|
12 |
+
data: dict[str, Any] | None
|
13 |
+
def __init__(self, emoji: str, start: int,
|
14 |
+
end: int, data: dict | None): ...
|
15 |
+
|
16 |
+
def data_copy(self) -> Dict[str, Any]: ...
|
17 |
+
def is_zwj(self) -> bool: ...
|
18 |
+
def split(self) -> EmojiMatchZWJ | EmojiMatch: ...
|
19 |
+
def __repr__(self) -> str: ...
|
20 |
+
|
21 |
+
|
22 |
+
class EmojiMatchZWJ(EmojiMatch):
|
23 |
+
def __init__(self, match: EmojiMatch): ...
|
24 |
+
def join(self) -> str: ...
|
25 |
+
def is_zwj(self) -> bool: ...
|
26 |
+
def split(self) -> EmojiMatchZWJ: ...
|
27 |
+
def __repr__(self) -> str: ...
|
28 |
+
|
29 |
+
|
30 |
+
class EmojiMatchZWJNonRGI(EmojiMatchZWJ):
|
31 |
+
def __init__(self, first_emoji_match: EmojiMatch,
|
32 |
+
second_emoji_match: EmojiMatch): ...
|
33 |
+
|
34 |
+
|
35 |
+
class Token(NamedTuple):
|
36 |
+
chars: str
|
37 |
+
value: str | EmojiMatch
|
38 |
+
|
39 |
+
|
40 |
+
def tokenize(string, keep_zwj: bool) -> Iterator[Token]: ...
|
41 |
+
|
42 |
+
|
43 |
+
def filter_tokens(matches: Iterator[Token], emoji_only: bool,
|
44 |
+
join_emoji: bool) -> Iterator[Token]: ...
|
45 |
+
|
46 |
+
|
47 |
+
def get_search_tree() -> _SearchTree: ...
|
resources/app/plugins/deepmoji_plugin/DeepMoji/emoji/unicode_codes/__init__.py
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from emoji.unicode_codes.data_dict import *
|
2 |
+
|
3 |
+
__all__ = [
|
4 |
+
'get_emoji_unicode_dict', 'get_aliases_unicode_dict',
|
5 |
+
'EMOJI_DATA', 'STATUS', 'LANGUAGES'
|
6 |
+
]
|
7 |
+
|
8 |
+
|
9 |
+
_EMOJI_UNICODE = {lang: None for lang in LANGUAGES} # Cache for the language dicts
|
10 |
+
|
11 |
+
_ALIASES_UNICODE = {} # Cache for the aliases dict
|
12 |
+
|
13 |
+
|
14 |
+
def get_emoji_unicode_dict(lang):
    """Return a dict mapping emoji names in language *lang* to their emoji.

    Only EMOJI_DATA entries that have a name for *lang* and whose status is
    at most ``STATUS['fully_qualified']`` are included.
    The dict is built on the first request for a language and afterwards
    served from the cache ``_EMOJI_UNICODE[lang]``."""

    cached = _EMOJI_UNICODE[lang]
    if cached is not None:
        return cached

    names = {}
    for emj, data in EMOJI_DATA.items():
        if lang not in data:
            continue
        if data['status'] > STATUS['fully_qualified']:
            continue
        names[data[lang]] = emj

    _EMOJI_UNICODE[lang] = names
    return names
|
23 |
+
|
24 |
+
|
25 |
+
def get_aliases_unicode_dict():
    """Return a dict mapping English names and aliases to their emoji.

    The dict starts from ``get_emoji_unicode_dict('en')`` and then adds every
    alias of each EMOJI_DATA entry whose status is at most
    ``STATUS['fully_qualified']``.
    It is filled once, into the module-level ``_ALIASES_UNICODE``, and that
    same dict is returned on every later call."""

    if _ALIASES_UNICODE:
        return _ALIASES_UNICODE

    _ALIASES_UNICODE.update(get_emoji_unicode_dict('en'))
    for emj, data in EMOJI_DATA.items():
        if 'alias' not in data or data['status'] > STATUS['fully_qualified']:
            continue
        # Every alias of this entry points at the same emoji.
        _ALIASES_UNICODE.update(dict.fromkeys(data['alias'], emj))

    return _ALIASES_UNICODE
|
resources/app/plugins/deepmoji_plugin/DeepMoji/emoji/unicode_codes/__init__.pyi
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from .data_dict import *
|
2 |
+
|
3 |
+
__all__ = ["get_emoji_unicode_dict", "get_aliases_unicode_dict", "EMOJI_DATA", "STATUS", "LANGUAGES"]
|
4 |
+
|
5 |
+
def get_emoji_unicode_dict(lang: str) -> dict[str, str]: ...
|
6 |
+
def get_aliases_unicode_dict() -> dict[str, str]: ...
|
resources/app/plugins/deepmoji_plugin/DeepMoji/emoji/unicode_codes/data_dict.py
ADDED
The diff for this file is too large to render.
See raw diff
|
|
resources/app/plugins/deepmoji_plugin/DeepMoji/emoji/unicode_codes/data_dict.pyi
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Any
|
2 |
+
|
3 |
+
__all__ = ["EMOJI_DATA", "STATUS", "LANGUAGES"]
|
4 |
+
|
5 |
+
STATUS: dict[str, int]
|
6 |
+
LANGUAGES: list[str]
|
7 |
+
EMOJI_DATA: dict[str, dict[str, Any]]
|
resources/app/plugins/deepmoji_plugin/DeepMoji/emoji/unicode_codes/py.typed
ADDED
File without changes
|
resources/app/plugins/deepmoji_plugin/DeepMoji/examples/.gitkeep
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
|
resources/app/plugins/deepmoji_plugin/DeepMoji/examples/README.md
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# torchMoji examples
|
2 |
+
|
3 |
+
## Initialization
|
4 |
+
[create_twitter_vocab.py](create_twitter_vocab.py)
|
5 |
+
Create a new vocabulary from a tsv file.
|
6 |
+
|
7 |
+
[tokenize_dataset.py](tokenize_dataset.py)
|
8 |
+
Tokenize a given dataset using the prebuilt vocabulary.
|
9 |
+
|
10 |
+
[vocab_extension.py](vocab_extension.py)
|
11 |
+
Extend the given vocabulary using dataset-specific words.
|
12 |
+
|
13 |
+
[dataset_split.py](dataset_split.py)
|
14 |
+
Split a given dataset into training, validation and testing.
|
15 |
+
|
16 |
+
## Use pretrained model/architecture
|
17 |
+
[score_texts_emojis.py](score_texts_emojis.py)
|
18 |
+
Use torchMoji to score texts for emoji distribution.
|
19 |
+
|
20 |
+
[text_emojize.py](text_emojize.py)
|
21 |
+
Use torchMoji to output emoji visualization from a single text input (mapped from `emoji_overview.png`)
|
22 |
+
|
23 |
+
```sh
|
24 |
+
python examples/text_emojize.py --text "I love mom's cooking\!"
|
25 |
+
# => I love mom's cooking! 😋 😍 💓 💛 ❤
|
26 |
+
```
|
27 |
+
|
28 |
+
[encode_texts.py](encode_texts.py)
|
29 |
+
Use torchMoji to encode the text into 2304-dimensional feature vectors for further modeling/analysis.
|
30 |
+
|
31 |
+
## Transfer learning
|
32 |
+
[finetune_youtube_last.py](finetune_youtube_last.py)
|
33 |
+
Finetune the model on the SS-Youtube dataset using the 'last' method.
|
34 |
+
|
35 |
+
[finetune_insults_chain-thaw.py](finetune_insults_chain-thaw.py)
|
36 |
+
Finetune the model on the Kaggle insults dataset (from blog post) using the 'chain-thaw' method.
|
37 |
+
|
38 |
+
[finetune_semeval_class-avg_f1.py](finetune_semeval_class-avg_f1.py)
|
39 |
+
Finetune the model on the SemEval emotion dataset using the 'last' method and evaluate using the class average F1 metric.
|
resources/app/plugins/deepmoji_plugin/DeepMoji/examples/__init__.py
ADDED
File without changes
|
resources/app/plugins/deepmoji_plugin/DeepMoji/examples/create_twitter_vocab.py
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
""" Creates a vocabulary from a tsv file.
|
2 |
+
"""
|
3 |
+
|
4 |
+
import codecs
|
5 |
+
import example_helper
|
6 |
+
from torchmoji.create_vocab import VocabBuilder
|
7 |
+
from torchmoji.word_generator import TweetWordGenerator
|
8 |
+
|
9 |
+
# Build and save a vocabulary from the raw tweet dump.
# The file is opened with plain 'r': the 'U' (universal newlines) mode flag
# was removed in Python 3.11 and makes open() raise ValueError there.
with codecs.open('../../twitterdata/tweets.2016-09-01', 'r', 'utf-8') as stream:
    wg = TweetWordGenerator(stream)
    vb = VocabBuilder(wg)
    vb.count_all_words()
    vb.save_vocab()
|
resources/app/plugins/deepmoji_plugin/DeepMoji/examples/dataset_split.py
ADDED
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
'''
|
2 |
+
Split a given dataset into three different datasets: training, validation and
|
3 |
+
testing.
|
4 |
+
|
5 |
+
This is achieved by splitting the given list of sentences into three separate
|
6 |
+
lists according to either a given ratio (e.g. [0.7, 0.1, 0.2]) or by an
|
7 |
+
explicit enumeration. The sentences are also tokenised using the given
|
8 |
+
vocabulary.
|
9 |
+
|
10 |
+
Also splits a given list of dictionaries containing information about
|
11 |
+
each sentence.
|
12 |
+
|
13 |
+
An additional parameter can be set 'extend_with', which will extend the given
|
14 |
+
vocabulary with up to 'extend_with' tokens, taken from the training dataset.
|
15 |
+
'''
|
16 |
+
from __future__ import print_function, unicode_literals
|
17 |
+
import example_helper
|
18 |
+
import json
|
19 |
+
|
20 |
+
from torchmoji.sentence_tokenizer import SentenceTokenizer
|
21 |
+
|
22 |
+
DATASET = [
|
23 |
+
'I am sentence 0',
|
24 |
+
'I am sentence 1',
|
25 |
+
'I am sentence 2',
|
26 |
+
'I am sentence 3',
|
27 |
+
'I am sentence 4',
|
28 |
+
'I am sentence 5',
|
29 |
+
'I am sentence 6',
|
30 |
+
'I am sentence 7',
|
31 |
+
'I am sentence 8',
|
32 |
+
'I am sentence 9 newword',
|
33 |
+
]
|
34 |
+
|
35 |
+
INFO_DICTS = [
|
36 |
+
{'label': 'sentence 0'},
|
37 |
+
{'label': 'sentence 1'},
|
38 |
+
{'label': 'sentence 2'},
|
39 |
+
{'label': 'sentence 3'},
|
40 |
+
{'label': 'sentence 4'},
|
41 |
+
{'label': 'sentence 5'},
|
42 |
+
{'label': 'sentence 6'},
|
43 |
+
{'label': 'sentence 7'},
|
44 |
+
{'label': 'sentence 8'},
|
45 |
+
{'label': 'sentence 9'},
|
46 |
+
]
|
47 |
+
|
48 |
+
with open('../model/vocabulary.json', 'r') as f:
|
49 |
+
vocab = json.load(f)
|
50 |
+
st = SentenceTokenizer(vocab, 30)
|
51 |
+
|
52 |
+
# Split using the default split ratio
|
53 |
+
print(st.split_train_val_test(DATASET, INFO_DICTS))
|
54 |
+
|
55 |
+
# Split explicitly
|
56 |
+
print(st.split_train_val_test(DATASET,
|
57 |
+
INFO_DICTS,
|
58 |
+
[[0, 1, 2, 4, 9], [5, 6], [7, 8, 3]],
|
59 |
+
extend_with=1))
|
resources/app/plugins/deepmoji_plugin/DeepMoji/examples/encode_texts.py
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
|
3 |
+
""" Use torchMoji to encode texts into emotional feature vectors.
|
4 |
+
"""
|
5 |
+
from __future__ import print_function, division, unicode_literals
|
6 |
+
import json
|
7 |
+
|
8 |
+
from torchmoji.sentence_tokenizer import SentenceTokenizer
|
9 |
+
from torchmoji.model_def import torchmoji_feature_encoding
|
10 |
+
from torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH
|
11 |
+
|
12 |
+
TEST_SENTENCES = ['I love mom\'s cooking',
|
13 |
+
'I love how you never reply back..',
|
14 |
+
'I love cruising with my homies',
|
15 |
+
'I love messing with yo mind!!',
|
16 |
+
'I love you and now you\'re just gone..',
|
17 |
+
'This is shit',
|
18 |
+
'This is the shit']
|
19 |
+
|
20 |
+
maxlen = 30
|
21 |
+
batch_size = 32
|
22 |
+
|
23 |
+
print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
|
24 |
+
with open(VOCAB_PATH, 'r') as f:
|
25 |
+
vocabulary = json.load(f)
|
26 |
+
st = SentenceTokenizer(vocabulary, maxlen)
|
27 |
+
tokenized, _, _ = st.tokenize_sentences(TEST_SENTENCES)
|
28 |
+
|
29 |
+
print('Loading model from {}.'.format(PRETRAINED_PATH))
|
30 |
+
model = torchmoji_feature_encoding(PRETRAINED_PATH)
|
31 |
+
print(model)
|
32 |
+
|
33 |
+
print('Encoding texts..')
|
34 |
+
encoding = model(tokenized)
|
35 |
+
|
36 |
+
print('First 5 dimensions for sentence: {}'.format(TEST_SENTENCES[0]))
|
37 |
+
print(encoding[0,:5])
|
38 |
+
|
39 |
+
# Now you could visualize the encodings to see differences,
|
40 |
+
# run a logistic regression classifier on top,
|
41 |
+
# or basically anything you'd like to do.
|
resources/app/plugins/deepmoji_plugin/DeepMoji/examples/example_helper.py
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
""" Module import helper.
|
2 |
+
Modifies PATH in order to allow us to import the torchmoji directory.
|
3 |
+
"""
|
4 |
+
import sys
|
5 |
+
from os.path import abspath, dirname
|
6 |
+
sys.path.insert(0, dirname(dirname(abspath(__file__))))
|
resources/app/plugins/deepmoji_plugin/DeepMoji/examples/finetune_insults_chain-thaw.py
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Finetuning example.
|
2 |
+
|
3 |
+
Trains the torchMoji model on the kaggle insults dataset, using the 'chain-thaw'
|
4 |
+
finetuning method and the accuracy metric. See the blog post at
|
5 |
+
https://medium.com/@bjarkefelbo/what-can-we-learn-from-emojis-6beb165a5ea0
|
6 |
+
for more information. Note that results may differ a bit due to slight
|
7 |
+
changes in preprocessing and train/val/test split.
|
8 |
+
|
9 |
+
The 'chain-thaw' method does the following:
|
10 |
+
0) Load all weights except for the softmax layer. Extend the embedding layer if
|
11 |
+
necessary, initialising the new weights with random values.
|
12 |
+
1) Freeze every layer except the last (softmax) layer and train it.
|
13 |
+
2) Freeze every layer except the first layer and train it.
|
14 |
+
3) Freeze every layer except the second etc., until the second last layer.
|
15 |
+
4) Unfreeze all layers and train entire model.
|
16 |
+
"""
|
17 |
+
|
18 |
+
from __future__ import print_function
|
19 |
+
import example_helper
|
20 |
+
import json
|
21 |
+
from torchmoji.model_def import torchmoji_transfer
|
22 |
+
from torchmoji.global_variables import PRETRAINED_PATH
|
23 |
+
from torchmoji.finetuning import (
|
24 |
+
load_benchmark,
|
25 |
+
finetune)
|
26 |
+
|
27 |
+
|
28 |
+
DATASET_PATH = '../data/kaggle-insults/raw.pickle'
|
29 |
+
nb_classes = 2
|
30 |
+
|
31 |
+
with open('../model/vocabulary.json', 'r') as f:
|
32 |
+
vocab = json.load(f)
|
33 |
+
|
34 |
+
# Load dataset. Extend the existing vocabulary with up to 10000 tokens from
|
35 |
+
# the training dataset.
|
36 |
+
data = load_benchmark(DATASET_PATH, vocab, extend_with=10000)
|
37 |
+
|
38 |
+
# Set up model and finetune. Note that we have to extend the embedding layer
|
39 |
+
# with the number of tokens added to the vocabulary.
|
40 |
+
model = torchmoji_transfer(nb_classes, PRETRAINED_PATH, extend_embedding=data['added'])
|
41 |
+
print(model)
|
42 |
+
model, acc = finetune(model, data['texts'], data['labels'], nb_classes,
|
43 |
+
data['batch_size'], method='chain-thaw')
|
44 |
+
print('Acc: {}'.format(acc))
|
resources/app/plugins/deepmoji_plugin/DeepMoji/examples/finetune_semeval_class-avg_f1.py
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Finetuning example.
|
2 |
+
|
3 |
+
Trains the torchMoji model on the SemEval emotion dataset, using the 'last'
|
4 |
+
finetuning method and the class average F1 metric.
|
5 |
+
|
6 |
+
The 'last' method does the following:
|
7 |
+
0) Load all weights except for the softmax layer. Do not add tokens to the
|
8 |
+
vocabulary and do not extend the embedding layer.
|
9 |
+
1) Freeze all layers except for the softmax layer.
|
10 |
+
2) Train.
|
11 |
+
|
12 |
+
The class average F1 metric does the following:
|
13 |
+
1) For each class, relabel the dataset into binary classification
|
14 |
+
(belongs to/does not belong to this class).
|
15 |
+
2) Calculate F1 score for each class.
|
16 |
+
3) Compute the average of all F1 scores.
|
17 |
+
"""
|
18 |
+
|
19 |
+
from __future__ import print_function
|
20 |
+
import example_helper
|
21 |
+
import json
|
22 |
+
from torchmoji.finetuning import load_benchmark
|
23 |
+
from torchmoji.class_avg_finetuning import class_avg_finetune
|
24 |
+
from torchmoji.model_def import torchmoji_transfer
|
25 |
+
from torchmoji.global_variables import PRETRAINED_PATH
|
26 |
+
|
27 |
+
DATASET_PATH = '../data/SE0714/raw.pickle'
|
28 |
+
nb_classes = 3
|
29 |
+
|
30 |
+
with open('../model/vocabulary.json', 'r') as f:
|
31 |
+
vocab = json.load(f)
|
32 |
+
|
33 |
+
|
34 |
+
# Load dataset. Extend the existing vocabulary with up to 10000 tokens from
|
35 |
+
# the training dataset.
|
36 |
+
data = load_benchmark(DATASET_PATH, vocab, extend_with=10000)
|
37 |
+
|
38 |
+
# Set up model and finetune. Note that we have to extend the embedding layer
|
39 |
+
# with the number of tokens added to the vocabulary.
|
40 |
+
#
|
41 |
+
# Also note that when using class average F1 to evaluate, the model has to be
|
42 |
+
# defined with two classes, since the model will be trained for each class
|
43 |
+
# separately.
|
44 |
+
model = torchmoji_transfer(2, PRETRAINED_PATH, extend_embedding=data['added'])
|
45 |
+
print(model)
|
46 |
+
|
47 |
+
# For finetuning however, pass in the actual number of classes.
|
48 |
+
model, f1 = class_avg_finetune(model, data['texts'], data['labels'],
|
49 |
+
nb_classes, data['batch_size'], method='last')
|
50 |
+
print('F1: {}'.format(f1))
|
resources/app/plugins/deepmoji_plugin/DeepMoji/examples/finetune_youtube_last.py
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Finetuning example.
|
2 |
+
|
3 |
+
Trains the torchMoji model on the SS-Youtube dataset, using the 'last'
|
4 |
+
finetuning method and the accuracy metric.
|
5 |
+
|
6 |
+
The 'last' method does the following:
|
7 |
+
0) Load all weights except for the softmax layer. Do not add tokens to the
|
8 |
+
vocabulary and do not extend the embedding layer.
|
9 |
+
1) Freeze all layers except for the softmax layer.
|
10 |
+
2) Train.
|
11 |
+
"""
|
12 |
+
|
13 |
+
from __future__ import print_function
|
14 |
+
import example_helper
|
15 |
+
import json
|
16 |
+
from torchmoji.model_def import torchmoji_transfer
|
17 |
+
from torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH, ROOT_PATH
|
18 |
+
from torchmoji.finetuning import (
|
19 |
+
load_benchmark,
|
20 |
+
finetune)
|
21 |
+
|
22 |
+
DATASET_PATH = '{}/data/SS-Youtube/raw.pickle'.format(ROOT_PATH)
|
23 |
+
nb_classes = 2
|
24 |
+
|
25 |
+
with open(VOCAB_PATH, 'r') as f:
|
26 |
+
vocab = json.load(f)
|
27 |
+
|
28 |
+
# Load dataset.
|
29 |
+
data = load_benchmark(DATASET_PATH, vocab)
|
30 |
+
|
31 |
+
# Set up model and finetune
|
32 |
+
model = torchmoji_transfer(nb_classes, PRETRAINED_PATH)
|
33 |
+
print(model)
|
34 |
+
model, acc = finetune(model, data['texts'], data['labels'], nb_classes, data['batch_size'], method='last')
|
35 |
+
print('Acc: {}'.format(acc))
|
resources/app/plugins/deepmoji_plugin/DeepMoji/examples/score_texts_emojis.py
ADDED
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
|
3 |
+
""" Use torchMoji to score texts for emoji distribution.
|
4 |
+
|
5 |
+
The resulting emoji ids (0-63) correspond to the mapping
|
6 |
+
in emoji_overview.png file at the root of the torchMoji repo.
|
7 |
+
|
8 |
+
Writes the result to a csv file.
|
9 |
+
"""
|
10 |
+
|
11 |
+
from __future__ import print_function, division, unicode_literals
|
12 |
+
|
13 |
+
import sys
|
14 |
+
from os.path import abspath, dirname
|
15 |
+
|
16 |
+
import json
|
17 |
+
import csv
|
18 |
+
import numpy as np
|
19 |
+
|
20 |
+
from torchmoji.sentence_tokenizer import SentenceTokenizer
|
21 |
+
from torchmoji.model_def import torchmoji_emojis
|
22 |
+
from torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH
|
23 |
+
|
24 |
+
sys.path.insert(0, dirname(dirname(abspath(__file__))))
|
25 |
+
|
26 |
+
OUTPUT_PATH = 'test_sentences.csv'
|
27 |
+
|
28 |
+
TEST_SENTENCES = ['I love mom\'s cooking',
|
29 |
+
'I love how you never reply back..',
|
30 |
+
'I love cruising with my homies',
|
31 |
+
'I love messing with yo mind!!',
|
32 |
+
'I love you and now you\'re just gone..',
|
33 |
+
'This is shit',
|
34 |
+
'This is the shit']
|
35 |
+
|
36 |
+
|
37 |
+
def top_elements(array, k):
    """Return the indices of the *k* largest values of *array*, largest first."""
    # argpartition puts the k largest entries (in arbitrary order) at the end.
    candidates = np.argpartition(array, -k)[-k:]
    # Order those candidates by value, then flip to get descending order.
    ascending = np.argsort(array[candidates])
    return candidates[ascending][::-1]
|
40 |
+
|
41 |
+
maxlen = 30
|
42 |
+
|
43 |
+
print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
|
44 |
+
with open(VOCAB_PATH, 'r') as f:
|
45 |
+
vocabulary = json.load(f)
|
46 |
+
|
47 |
+
st = SentenceTokenizer(vocabulary, maxlen)
|
48 |
+
|
49 |
+
print('Loading model from {}.'.format(PRETRAINED_PATH))
|
50 |
+
model = torchmoji_emojis(PRETRAINED_PATH)
|
51 |
+
print(model)
|
52 |
+
|
53 |
+
def doImportableFunction():
    """Score TEST_SENTENCES with the loaded model and write the result to OUTPUT_PATH.

    For every sentence the five highest-scoring emoji ids are determined with
    ``top_elements``. Each CSV row holds the sentence, the summed score of
    those five, the five emoji ids and their five individual scores.
    Emoji ids (0-63) correspond to the mapping in emoji_overview.png at the
    root of the torchMoji repo.

    Rows that cannot be written are reported on stdout and skipped, so one
    bad row does not abort the export.
    """
    print('Running predictions.')
    tokenized, _, _ = st.tokenize_sentences(TEST_SENTENCES)
    prob = model(tokenized)

    print('Writing results to {}'.format(OUTPUT_PATH))
    scores = []
    for i, t in enumerate(TEST_SENTENCES):
        t_prob = prob[i]
        ind_top = top_elements(t_prob, 5)
        t_score = [t]
        t_score.append(sum(t_prob[ind_top]))
        t_score.extend(ind_top)
        t_score.extend([t_prob[ind] for ind in ind_top])
        scores.append(t_score)
        print(t_score)

    with open(OUTPUT_PATH, 'w') as csvfile:
        writer = csv.writer(csvfile, delimiter=str(','), lineterminator='\n')
        writer.writerow(['Text', 'Top5%',
                         'Emoji_1', 'Emoji_2', 'Emoji_3', 'Emoji_4', 'Emoji_5',
                         'Pct_1', 'Pct_2', 'Pct_3', 'Pct_4', 'Pct_5'])
        for i, row in enumerate(scores):
            try:
                writer.writerow(row)
            except Exception:
                # Best effort: skip the bad row, but let KeyboardInterrupt
                # and SystemExit propagate (a bare except would hide them).
                print("Exception at row {}!".format(i))
|
resources/app/plugins/deepmoji_plugin/DeepMoji/examples/text_emojize.py
ADDED
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
|
3 |
+
""" Use torchMoji to predict emojis from a single text input
|
4 |
+
"""
|
5 |
+
|
6 |
+
from __future__ import print_function, division, unicode_literals
|
7 |
+
import example_helper
|
8 |
+
import json
|
9 |
+
import csv
|
10 |
+
import argparse
|
11 |
+
|
12 |
+
import numpy as np
|
13 |
+
import emoji
|
14 |
+
|
15 |
+
from torchmoji.sentence_tokenizer import SentenceTokenizer
|
16 |
+
from torchmoji.model_def import torchmoji_emojis
|
17 |
+
from torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH
|
18 |
+
|
19 |
+
# Emoji map in emoji_overview.png
|
20 |
+
EMOJIS = ":joy: :unamused: :weary: :sob: :heart_eyes: \
|
21 |
+
:pensive: :ok_hand: :blush: :heart: :smirk: \
|
22 |
+
:grin: :notes: :flushed: :100: :sleeping: \
|
23 |
+
:relieved: :relaxed: :raised_hands: :two_hearts: :expressionless: \
|
24 |
+
:sweat_smile: :pray: :confused: :kissing_heart: :heartbeat: \
|
25 |
+
:neutral_face: :information_desk_person: :disappointed: :see_no_evil: :tired_face: \
|
26 |
+
:v: :sunglasses: :rage: :thumbsup: :cry: \
|
27 |
+
:sleepy: :yum: :triumph: :hand: :mask: \
|
28 |
+
:clap: :eyes: :gun: :persevere: :smiling_imp: \
|
29 |
+
:sweat: :broken_heart: :yellow_heart: :musical_note: :speak_no_evil: \
|
30 |
+
:wink: :skull: :confounded: :smile: :stuck_out_tongue_winking_eye: \
|
31 |
+
:angry: :no_good: :muscle: :facepunch: :purple_heart: \
|
32 |
+
:sparkling_heart: :blue_heart: :grimacing: :sparkles:".split(' ')
|
33 |
+
|
34 |
+
def top_elements(array, k):
    """Return the indices of the *k* largest values of *array*, largest first."""
    # argpartition puts the k largest entries (in arbitrary order) at the end.
    candidates = np.argpartition(array, -k)[-k:]
    # Order those candidates by value, then flip to get descending order.
    ascending = np.argsort(array[candidates])
    return candidates[ascending][::-1]
|
37 |
+
|
38 |
+
if __name__ == "__main__":
    # Command-line entry point: print the input text followed by the five
    # emojis the model scores highest for it.
    argparser = argparse.ArgumentParser()
    argparser.add_argument('--text', type=str, required=True, help="Input text to emojize")
    argparser.add_argument('--maxlen', type=int, default=30, help="Max length of input text")
    args = argparser.parse_args()

    # Tokenizing using dictionary
    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)

    st = SentenceTokenizer(vocabulary, args.maxlen)

    # Loading model
    model = torchmoji_emojis(PRETRAINED_PATH)
    # Running predictions
    tokenized, _, _ = st.tokenize_sentences([args.text])
    # Get sentence probability
    prob = model(tokenized)[0]

    # Top emoji id
    emoji_ids = top_elements(prob, 5)

    # map to emoji short codes
    emojis = [EMOJIS[emoji_id] for emoji_id in emoji_ids]

    # EMOJIS mixes English names with aliases (e.g. ':thumbsup:'), so the
    # alias table is required. emoji 2.x no longer accepts the old
    # `use_aliases=True` keyword; `language='alias'` is its replacement.
    print(emoji.emojize("{} {}".format(args.text, ' '.join(emojis)), language='alias'))
|
resources/app/plugins/deepmoji_plugin/DeepMoji/examples/tokenize_dataset.py
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Take a given list of sentences and turn it into a numpy array, where each
|
3 |
+
number corresponds to a word. Padding is used (number 0) to ensure fixed length
|
4 |
+
of sentences.
|
5 |
+
"""
|
6 |
+
|
7 |
+
from __future__ import print_function, unicode_literals
|
8 |
+
import example_helper
|
9 |
+
import json
|
10 |
+
from torchmoji.sentence_tokenizer import SentenceTokenizer
|
11 |
+
|
12 |
+
with open('../model/vocabulary.json', 'r') as f:
|
13 |
+
vocabulary = json.load(f)
|
14 |
+
|
15 |
+
st = SentenceTokenizer(vocabulary, 30)
|
16 |
+
test_sentences = [
|
17 |
+
'\u2014 -- \u203c !!\U0001F602',
|
18 |
+
'Hello world!',
|
19 |
+
'This is a sample tweet #example',
|
20 |
+
]
|
21 |
+
|
22 |
+
tokens, infos, stats = st.tokenize_sentences(test_sentences)
|
23 |
+
|
24 |
+
print(tokens)
|
25 |
+
print(infos)
|
26 |
+
print(stats)
|
resources/app/plugins/deepmoji_plugin/DeepMoji/examples/vocab_extension.py
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Extend the given vocabulary using dataset-specific words.
|
3 |
+
|
4 |
+
1. First create a vocabulary for the specific dataset.
|
5 |
+
2. Find all words not in our vocabulary, but in the dataset vocabulary.
|
6 |
+
3. Take top X (default=1000) of these words and add them to the vocabulary.
|
7 |
+
4. Save this combined vocabulary and embedding matrix, which can now be used.
|
8 |
+
"""
|
9 |
+
|
10 |
+
from __future__ import print_function, unicode_literals
|
11 |
+
import example_helper
|
12 |
+
import json
|
13 |
+
from torchmoji.create_vocab import extend_vocab, VocabBuilder
|
14 |
+
from torchmoji.word_generator import WordGenerator
|
15 |
+
|
16 |
+
new_words = ['#zzzzaaazzz', 'newword', 'newword']
|
17 |
+
word_gen = WordGenerator(new_words)
|
18 |
+
vb = VocabBuilder(word_gen)
|
19 |
+
vb.count_all_words()
|
20 |
+
|
21 |
+
with open('../model/vocabulary.json') as f:
|
22 |
+
vocab = json.load(f)
|
23 |
+
|
24 |
+
print(len(vocab))
|
25 |
+
print(vb.word_counts)
|
26 |
+
extend_vocab(vocab, vb, max_tokens=1)
|
27 |
+
|
28 |
+
# 'newword' should be added because it's more frequent in the given vocab
|
29 |
+
print(vocab['newword'])
|
30 |
+
print(len(vocab))
|
resources/app/plugins/deepmoji_plugin/DeepMoji/scripts/analyze_all_results.py
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from __future__ import print_function
|
2 |
+
|
3 |
+
# allow us to import the codebase directory
|
4 |
+
import sys
|
5 |
+
import glob
|
6 |
+
import numpy as np
|
7 |
+
from os.path import dirname, abspath
|
8 |
+
sys.path.insert(0, dirname(dirname(abspath(__file__))))
|
9 |
+
|
10 |
+
# Benchmark datasets whose result files are summarised by this script.
# The names must match the dataset names used in the result file names
# ('<dataset>_<method>_<run>_results.txt'), hence 'SCv2-GEN' (not 'SV2-GEN').
# 'SE1604' excluded due to Twitter's ToS
DATASETS = ['SE0714', 'Olympic', 'PsychExp', 'SS-Twitter', 'SS-Youtube',
            'SCv1', 'SCv2-GEN']
|
12 |
+
|
13 |
+
def get_results(dset, method='last', results_dir='results/'):
    """Print summary statistics over all result files of one dataset.

    Result files are looked up with the glob pattern
    ``<results_dir>/<dset>_<method>_*_results.txt``. The first line of each
    file is expected to look like ``<metric>: <value>``; the value after the
    first colon is parsed as a float.

    Args:
        dset: Dataset name used in the result file names.
        method: Finetuning method used in the result file names.
        results_dir: Directory that holds the result files.

    Raises:
        AssertionError: If no result file matches the pattern.
    """
    pattern = '{}/{}_{}_*_results.txt'.format(results_dir, dset, method)
    result_paths = glob.glob(pattern)
    if not result_paths:
        # Raised explicitly (instead of a bare ``assert``) so the check still
        # runs under ``python -O``; the exception type is kept unchanged.
        raise AssertionError(
            'No result files found matching {!r}'.format(pattern))

    scores = []
    for path in result_paths:
        with open(path) as f:
            score = f.readline().split(':')[1]
        scores.append(float(score))

    average = np.mean(scores)
    maximum = max(scores)
    minimum = min(scores)
    std = np.std(scores)

    print('Dataset: {}'.format(dset))
    print('Method: {}'.format(method))
    print('Number of results: {}'.format(len(scores)))
    print('--------------------------')
    print('Average: {}'.format(average))
    print('Maximum: {}'.format(maximum))
    print('Minimum: {}'.format(minimum))
    print('Standard deviaton: {}'.format(std))
|
38 |
+
|
39 |
+
# Print the summary of every configured dataset, one after the other.
for dataset_name in DATASETS:
    get_results(dataset_name)
|
resources/app/plugins/deepmoji_plugin/DeepMoji/scripts/analyze_results.py
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from __future__ import print_function
|
2 |
+
|
3 |
+
import sys
|
4 |
+
import glob
|
5 |
+
import numpy as np
|
6 |
+
|
7 |
+
# Defaults, used when no command-line arguments are supplied.
DATASET = 'SS-Twitter'  # 'SE1604' excluded due to Twitter's ToS
METHOD = 'new'

# Optional usage: analyze_results.py <dataset> <method>
if len(sys.argv) == 3:
    DATASET, METHOD = sys.argv[1], sys.argv[2]

RESULTS_DIR = 'results/'
RESULT_PATHS = glob.glob(
    '{}/{}_{}_*_results.txt'.format(RESULTS_DIR, DATASET, METHOD))

if RESULT_PATHS:
    # Each result file starts with a "<metric>: <value>" line.
    scores = []
    for path in RESULT_PATHS:
        with open(path) as f:
            first_line = f.readline()
        score = first_line.split(':')[1]
        scores.append(float(score))

    average = np.mean(scores)
    std = np.std(scores)
    maximum = max(scores)
    minimum = min(scores)

    print('Dataset: {}'.format(DATASET))
    print('Method: {}'.format(METHOD))
    print('Number of results: {}'.format(len(scores)))
    print('--------------------------')
    print('Average: {}'.format(average))
    print('Maximum: {}'.format(maximum))
    print('Minimum: {}'.format(minimum))
    print('Standard deviaton: {}'.format(std))
else:
    print("Could not find results for '{}' using '{}' in directory '{}'.".format(DATASET, METHOD, RESULTS_DIR))
|
resources/app/plugins/deepmoji_plugin/DeepMoji/scripts/calculate_coverages.py
ADDED
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from __future__ import print_function
|
2 |
+
import pickle
|
3 |
+
import json
|
4 |
+
import csv
|
5 |
+
import sys
|
6 |
+
from io import open
|
7 |
+
|
8 |
+
# Allow us to import the torchmoji directory
|
9 |
+
from os.path import dirname, abspath
|
10 |
+
sys.path.insert(0, dirname(dirname(abspath(__file__))))
|
11 |
+
|
12 |
+
from torchmoji.sentence_tokenizer import SentenceTokenizer, coverage
|
13 |
+
|
14 |
+
try:
    unicode                # Python 2
except NameError:
    unicode = str          # Python 3

IS_PYTHON2 = sys.version_info[0] == 2

OUTPUT_PATH = 'coverage.csv'
DATASET_PATHS = [
    '../data/Olympic/raw.pickle',
    '../data/PsychExp/raw.pickle',
    '../data/SCv1/raw.pickle',
    '../data/SCv2-GEN/raw.pickle',
    '../data/SE0714/raw.pickle',
    #'../data/SE1604/raw.pickle', # Excluded due to Twitter's ToS
    '../data/SS-Twitter/raw.pickle',
    '../data/SS-Youtube/raw.pickle',
]

with open('../model/vocabulary.json', 'r') as f:
    vocab = json.load(f)

results = []
for p in DATASET_PATHS:
    coverage_result = [p]
    print('Calculating coverage for {}'.format(p))
    with open(p, 'rb') as f:
        # NOTE: unpickling can execute arbitrary code; only run this on
        # dataset files from a trusted source.
        if IS_PYTHON2:
            s = pickle.load(f)
        else:
            s = pickle.load(f, fix_imports=True)

    # Decode data
    try:
        s['texts'] = [unicode(x) for x in s['texts']]
    except UnicodeDecodeError:
        s['texts'] = [x.decode('utf-8') for x in s['texts']]

    # One coverage column per configuration, in CSV column order:
    #   Own  - empty base vocabulary, extended with up to 10000 tokens
    #   Last - pretrained vocabulary, no extension
    #   Full - pretrained vocabulary, extended with up to 10000 tokens
    # NOTE(review): if the tokenizer extends `vocab` in place, the 'Full'
    # pass of one dataset carries over into later datasets - confirm.
    for base_vocab, extend_with in (({}, 10000), (vocab, 0), (vocab, 10000)):
        st = SentenceTokenizer(base_vocab, 30)
        tests, dicts, _ = st.split_train_val_test(s['texts'], s['info'],
                                                  [s['train_ind'],
                                                   s['val_ind'],
                                                   s['test_ind']],
                                                  extend_with=extend_with)
        # Coverage is measured on the test split only.
        coverage_result.append(coverage(tests[2]))

    results.append(coverage_result)

# The csv module wants a binary file on Python 2 but a text file opened with
# newline='' on Python 3; opening 'wb' on Python 3 makes writerow() fail.
if IS_PYTHON2:
    csvfile = open(OUTPUT_PATH, 'wb')
else:
    csvfile = open(OUTPUT_PATH, 'w', newline='', encoding='utf-8')

with csvfile:
    writer = csv.writer(csvfile, delimiter='\t', lineterminator='\n')
    writer.writerow(['Dataset', 'Own', 'Last', 'Full'])
    for i, row in enumerate(results):
        try:
            writer.writerow(row)
        except Exception:
            # Best effort: report the bad row and keep writing the others,
            # without swallowing KeyboardInterrupt/SystemExit.
            print("Exception at row {}!".format(i))

print('Saved to {}'.format(OUTPUT_PATH))
|
resources/app/plugins/deepmoji_plugin/DeepMoji/scripts/convert_all_datasets.py
ADDED
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from __future__ import print_function
|
2 |
+
|
3 |
+
import json
|
4 |
+
import math
|
5 |
+
import pickle
|
6 |
+
import sys
|
7 |
+
from io import open
|
8 |
+
import numpy as np
|
9 |
+
from os.path import abspath, dirname
|
10 |
+
sys.path.insert(0, dirname(dirname(abspath(__file__))))
|
11 |
+
|
12 |
+
from torchmoji.word_generator import WordGenerator
|
13 |
+
from torchmoji.create_vocab import VocabBuilder
|
14 |
+
from torchmoji.sentence_tokenizer import SentenceTokenizer, extend_vocab, coverage
|
15 |
+
from torchmoji.tokenizer import tokenize
|
16 |
+
|
17 |
+
# Make the name `unicode` available on both Python 2 and Python 3.
try:
    unicode
except NameError:
    # Python 3 has no separate unicode type; str plays that role.
    unicode = str

IS_PYTHON2 = sys.version_info[0] == 2

# Benchmark datasets to convert; each one lives in its own folder below DIR.
DATASETS = [
    'Olympic',
    'PsychExp',
    'SCv1',
    'SCv2-GEN',
    'SE0714',
    #'SE1604', # Excluded due to Twitter's ToS
    'SS-Twitter',
    'SS-Youtube',
]

# Root data folder, followed by the input file name and the three output
# file names (one per vocabulary variant) used inside each dataset folder.
DIR = '../data'
FILENAME_RAW = 'raw.pickle'
FILENAME_OWN = 'own_vocab.pickle'
FILENAME_OUR = 'twitter_vocab.pickle'
FILENAME_COMBINED = 'combined_vocab.pickle'
|
40 |
+
|
41 |
+
|
42 |
+
def roundup(x):
    """Round ``x`` up to the next multiple of ten and return it as an int."""
    tens = math.ceil(x / 10.0)
    return 10 * int(tens)
|
44 |
+
|
45 |
+
|
46 |
+
def format_pickle(dset, train_texts, val_texts, test_texts, train_labels, val_labels, test_labels):
    """Bundle a dataset name and its train/val/test texts and labels in a dict.

    Returns:
        A dict with the keys 'dataset', 'train_texts', 'val_texts',
        'test_texts', 'train_labels', 'val_labels' and 'test_labels', each
        mapped to the argument of the same meaning.
    """
    keys = ('dataset',
            'train_texts', 'val_texts', 'test_texts',
            'train_labels', 'val_labels', 'test_labels')
    values = (dset,
              train_texts, val_texts, test_texts,
              train_labels, val_labels, test_labels)
    return dict(zip(keys, values))
|
54 |
+
|
55 |
+
def convert_dataset(filepath, extend_with, vocab):
    """Tokenize the current dataset with ``vocab`` and pickle it to ``filepath``.

    Besides its parameters, this function reads the module-level names
    ``texts``, ``labels``, ``data``, ``maxlen`` and ``dset``, which the main
    loop of this script sets for the dataset being converted.

    Args:
        filepath: Destination path of the pickle file.
        extend_with: Passed on as ``extend_with`` to
            ``SentenceTokenizer.split_train_val_test``.
        vocab: Base vocabulary handed to ``SentenceTokenizer``.
    """
    print('-- Generating {} '.format(filepath))
    sys.stdout.flush()
    st = SentenceTokenizer(vocab, maxlen)
    tokenized, dicts, _ = st.split_train_val_test(texts,
                                                  labels,
                                                  [data['train_ind'],
                                                   data['val_ind'],
                                                   data['test_ind']],
                                                  extend_with=extend_with)
    pick = format_pickle(dset, tokenized[0], tokenized[1], tokenized[2],
                         dicts[0], dicts[1], dicts[2])
    # pickle writes bytes, so the file must be opened in binary mode; text
    # mode ('w') makes pickle.dump() raise a TypeError with io.open.
    with open(filepath, 'wb') as f:
        pickle.dump(pick, f)
    # Coverage is reported for the test split only.
    cover = coverage(tokenized[2])

    print(' done. Coverage: {}'.format(cover))
|
72 |
+
|
73 |
+
# Pretrained vocabulary shared by the 'twitter' and 'combined' variants.
with open('../model/vocabulary.json', 'r') as f:
    vocab = json.load(f)

# NOTE: convert_dataset() reads `dset`, `data`, `texts`, `labels` and
# `maxlen` from module scope, so these names must keep their spelling.
for dset in DATASETS:
    print('Converting {}'.format(dset))

    PATH_RAW, PATH_OWN, PATH_OUR, PATH_COMBINED = [
        '{}/{}/{}'.format(DIR, dset, filename)
        for filename in (FILENAME_RAW, FILENAME_OWN,
                         FILENAME_OUR, FILENAME_COMBINED)]

    with open(PATH_RAW, 'rb') as dataset:
        if IS_PYTHON2:
            data = pickle.load(dataset)
        else:
            data = pickle.load(dataset, fix_imports=True)

    # Decode data
    try:
        texts = [unicode(x) for x in data['texts']]
    except UnicodeDecodeError:
        texts = [x.decode('utf-8') for x in data['texts']]

    wg = WordGenerator(texts)
    vb = VocabBuilder(wg)
    vb.count_all_words()

    # Calculate max length of sequences considered
    # Adjust batch_size accordingly to prevent GPU overflow
    lengths = [len(tokens) for tokens in map(tokenize, texts)]
    maxlen = roundup(np.percentile(lengths, 80.0))

    # Extract labels
    labels = [info['label'] for info in data['info']]

    # (output path, extend_with, base vocabulary) for the three variants.
    variants = (
        (PATH_OWN, 50000, {}),
        (PATH_OUR, 0, vocab),
        (PATH_COMBINED, 10000, vocab),
    )
    for out_path, extend_with, base_vocab in variants:
        convert_dataset(out_path, extend_with, base_vocab)
|
resources/app/plugins/deepmoji_plugin/DeepMoji/scripts/download_weights.py
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from __future__ import print_function
|
2 |
+
import os
|
3 |
+
from subprocess import call
|
4 |
+
from builtins import input
|
5 |
+
|
6 |
+
# Name of the directory the script is started from (last path component).
curr_folder = os.path.basename(os.path.normpath(os.getcwd()))

weights_filename = 'pytorch_model.bin'
weights_folder = 'model'
weights_path = '{}/{}'.format(weights_folder, weights_filename)
# When started from inside 'scripts', the model folder is one level up.
if curr_folder == 'scripts':
    weights_path = '../{}'.format(weights_path)
weights_download_link = 'https://www.dropbox.com/s/q8lax9ary32c7t9/pytorch_model.bin?dl=0#'


# Number of bytes in one megabyte, as a float so divisions stay fractional.
MB_FACTOR = float(2 ** 20)
|
17 |
+
|
18 |
+
def prompt():
    """Read a yes/no answer from standard input, re-asking until it is valid.

    Accepted answers (case-insensitive) are 'y', 'ye' and 'yes' for True and
    'n' and 'no' for False.

    Returns:
        True for a positive answer, False for a negative one.
    """
    answers = dict.fromkeys(('y', 'ye', 'yes'), True)
    answers.update(dict.fromkeys(('n', 'no'), False))
    while True:
        reply = input().lower()
        if reply not in answers:
            print('Please respond with \'y\' or \'n\' (or \'yes\' or \'no\')')
            continue
        return answers[reply]
|
32 |
+
|
33 |
+
# Decide whether to download: always when the file is missing, otherwise
# only if the user confirms a redownload.
download = True
if os.path.exists(weights_path):
    print('Weight file already exists at {}. Would you like to redownload it anyway? [y/n]'.format(weights_path))
    download = prompt()
    already_exists = True
else:
    already_exists = False

if download:
    print('About to download the pretrained weights file from {}'.format(weights_download_link))
    if not already_exists:
        print('The size of the file is roughly 85MB. Continue? [y/n]')
    else:
        # The user already confirmed the redownload above.
        os.unlink(weights_path)

    if already_exists or prompt():
        print('Downloading...')

        # downloading using wget due to issues with urlretrieve and requests
        # The command is passed as an argument list (no shell), so a target
        # path containing spaces or shell metacharacters cannot break it.
        wget_args = ['wget', weights_download_link,
                     '-O', os.path.abspath(weights_path)]
        print("Running system call: {}".format(' '.join(wget_args)))
        call(wget_args)

        # Sanity check: the real weights file is roughly 85MB.
        downloaded_bytes = os.path.getsize(weights_path)
        if downloaded_bytes / MB_FACTOR < 80:
            raise ValueError("Download finished, but the resulting file is too small! " +
                             "It\'s only {} bytes.".format(downloaded_bytes))
        print('Downloaded weights to {}'.format(weights_path))
else:
    print('Exiting.')
|
resources/app/plugins/deepmoji_plugin/DeepMoji/scripts/finetune_dataset.py
ADDED
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
""" Finetuning example.
|
2 |
+
"""
|
3 |
+
from __future__ import print_function
|
4 |
+
import sys
|
5 |
+
import numpy as np
|
6 |
+
from os.path import abspath, dirname
|
7 |
+
sys.path.insert(0, dirname(dirname(abspath(__file__))))
|
8 |
+
|
9 |
+
import json
|
10 |
+
import math
|
11 |
+
from torchmoji.model_def import torchmoji_transfer
|
12 |
+
from torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH
|
13 |
+
from torchmoji.finetuning import (
|
14 |
+
load_benchmark,
|
15 |
+
finetune)
|
16 |
+
from torchmoji.class_avg_finetuning import class_avg_finetune
|
17 |
+
|
18 |
+
def roundup(x):
    """Return ``x`` rounded up to the nearest multiple of ten, as an int."""
    multiples_of_ten = int(math.ceil(x / 10.0))
    return multiples_of_ten * 10
|
20 |
+
|
21 |
+
|
22 |
+
# Benchmarks to finetune on. Each entry is a 4-tuple:
#   (dataset_name, path_to_dataset, nb_classes, use_f1_score)
# Only the uncommented entries are run.
DATASETS = [
    #('SE0714', '../data/SE0714/raw.pickle', 3, True),
    #('Olympic', '../data/Olympic/raw.pickle', 4, True),
    #('PsychExp', '../data/PsychExp/raw.pickle', 7, True),
    #('SS-Twitter', '../data/SS-Twitter/raw.pickle', 2, False),
    ('SS-Youtube', '../data/SS-Youtube/raw.pickle', 2, False),
    #('SE1604', '../data/SE1604/raw.pickle', 3, False), # Excluded due to Twitter's ToS
    #('SCv1', '../data/SCv1/raw.pickle', 2, True),
    #('SCv2-GEN', '../data/SCv2-GEN/raw.pickle', 2, True)
]

# Folder the per-run result files are written to.
RESULTS_DIR = 'results'

# 'new' | 'last' | 'full' | 'chain-thaw'
FINETUNE_METHOD = 'last'
VERBOSE = 1

# Expected size of the pretrained vocabulary (checked before every run).
nb_tokens = 50000
nb_epochs = 1000
epoch_size = 1000
|
46 |
+
|
47 |
+
with open(VOCAB_PATH, 'r') as f:
    vocab = json.load(f)

# Every dataset is finetuned five times; each run writes its own result file.
for rerun_iter in range(5):
    for p in DATASETS:

        # debugging
        assert len(vocab) == nb_tokens

        dset, path, nb_classes, use_f1_score = p[0], p[1], p[2], p[3]

        # 'last' keeps the vocabulary as is; the other methods extend it.
        if FINETUNE_METHOD not in ['last', 'new', 'full', 'chain-thaw']:
            raise ValueError('Finetuning method not recognised!')
        extend_with = 0 if FINETUNE_METHOD == 'last' else 10000

        # Load dataset.
        data = load_benchmark(path, vocab, extend_with=extend_with)

        X_train, X_val, X_test = [data['texts'][i] for i in range(3)]
        y_train, y_val, y_test = [data['labels'][i] for i in range(3)]

        # 'new' trains without pretrained weights.
        weight_path = None if FINETUNE_METHOD == 'new' else PRETRAINED_PATH
        nb_model_classes = 2 if use_f1_score else nb_classes
        model = torchmoji_transfer(
            nb_model_classes,
            weight_path,
            extend_embedding=data['added'])
        print(model)

        # Training
        print('Training: {}'.format(path))
        if use_f1_score:
            model, result = class_avg_finetune(model, data['texts'],
                                               data['labels'],
                                               nb_classes, data['batch_size'],
                                               FINETUNE_METHOD,
                                               verbose=VERBOSE)
            summary, record = 'Overall F1 score (dset = {}): {}', "F1: {}\n"
        else:
            model, result = finetune(model, data['texts'], data['labels'],
                                     nb_classes, data['batch_size'],
                                     FINETUNE_METHOD, metric='acc',
                                     verbose=VERBOSE)
            summary, record = 'Test accuracy (dset = {}): {}', "Acc: {}\n"

        # Write results
        print(summary.format(dset, result))
        results_path = '{}/{}_{}_{}_results.txt'.format(
            RESULTS_DIR, dset, FINETUNE_METHOD, rerun_iter)
        with open(results_path, "w") as f:
            f.write(record.format(result))
|
resources/app/plugins/deepmoji_plugin/DeepMoji/scripts/results/.gitkeep
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
|
resources/app/plugins/deepmoji_plugin/DeepMoji/setup.py
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from setuptools import setup
|
2 |
+
|
3 |
+
# Exact dependency versions pinned for the torchmoji package.
# NOTE(review): this plugin folder also ships an emoji-2.8.0 dist-info, which
# does not match the emoji pin below - confirm which version is intended.
INSTALL_REQUIRES = [
    'emoji==0.4.5',
    'numpy==1.13.1',
    'scipy==0.19.1',
    'scikit-learn==0.19.0',
    'text-unidecode==1.0',
]

setup(
    name='torchmoji',
    version='1.0',
    description='torchMoji',
    packages=['torchmoji'],
    include_package_data=True,
    install_requires=INSTALL_REQUIRES,
)
|