Thomas De Decker committed
Commit 0f23c4b • 1 Parent(s): 7406ee8

First version
.gitignore ADDED
@@ -0,0 +1,167 @@
+
+ # Created by https://www.toptal.com/developers/gitignore/api/python
+ # Edit at https://www.toptal.com/developers/gitignore?templates=python
+
+ ### Python ###
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/#use-with-ide
+ .pdm.toml
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
+
+ # End of https://www.toptal.com/developers/gitignore/api/python
app.py ADDED
@@ -0,0 +1,54 @@
+ import streamlit as st
+ import pandas as pd
+ from extraction.keyphrase_extraction_pipeline import KeyphraseExtractionPipeline
+ from extraction.keyphrase_generation_pipeline import KeyphraseGenerationPipeline
+ import orjson
+
+
+ if "config" not in st.session_state:
+     with open("config.json", "r") as f:
+         content = f.read()
+         st.session_state.config = orjson.loads(content)
+
+ st.set_page_config(
+     page_icon="🔑",
+     page_title="Keyphrase extraction/generation with Transformers",
+     layout="wide",
+     initial_sidebar_state="auto",
+ )
+
+
+ @st.cache(allow_output_mutation=True)
+ def load_pipeline(chosen_model):
+     if "keyphrase-extraction" in chosen_model:
+         return KeyphraseExtractionPipeline(chosen_model)
+     elif "keyphrase-generation" in chosen_model:
+         return KeyphraseGenerationPipeline(chosen_model)
+
+
+ def extract_keyphrases():
+     st.session_state.keyphrases = pipe(st.session_state.input_text)
+
+
+ st.header("🔑 Keyphrase extraction/generation with Transformers")
+ col1, col2 = st.columns([1, 3])
+
+ col1.subheader("Select model")
+ chosen_model = col1.selectbox(
+     "Choose your model:",
+     st.session_state.config.get("models"),
+ )
+ st.session_state.chosen_model = chosen_model
+
+ pipe = load_pipeline(st.session_state.chosen_model)
+
+ col2.subheader("Input your text")
+ st.session_state.input_text = col2.text_area(
+     "Input", st.session_state.config.get("example_text"), height=150
+ )
+ pressed = col2.button("Extract", on_click=extract_keyphrases)
+
+ if pressed:
+     col2.subheader("🐧 Output")
+     df = pd.DataFrame(data=st.session_state.keyphrases, columns=["Keyphrases"])
+     col2.table(df)
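
Note: assuming the dependencies this commit relies on (streamlit, pandas, orjson, transformers) are installed, the app above would be launched from the repository root with:

    streamlit run app.py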
config.json ADDED
@@ -0,0 +1,13 @@
+ {
+     "example_text": "Keyphrase extraction is a technique in text analysis where you extract the important keyphrases from a text. Since this is a time-consuming process, Artificial Intelligence is used to automate it. Currently, classical machine learning methods, that use statistics and linguistics, are widely used for the extraction process. The fact that these methods have been widely used in the community has the advantage that there are many easy-to-use libraries. Now with the recent innovations in deep learning methods (such as recurrent neural networks and transformers, GANS, ... ), keyphrase extraction can be improved. These new methods also focus on the semantics and context of a document, which is quite an improvement.",
+     "models": [
+         "DeDeckerThomas/keyphrase-extraction-kbir-inspec",
+         "DeDeckerThomas/keyphrase-extraction-distilbert-inspec",
+         "DeDeckerThomas/keyphrase-extraction-distilbert-openkp",
+         "DeDeckerThomas/keyphrase-extraction-distilbert-kptimes",
+         "DeDeckerThomas/keyphrase-extraction-kbir-kpcrowd",
+         "DeDeckerThomas/keyphrase-generation-keybart-inspec",
+         "DeDeckerThomas/keyphrase-generation-t5-small-inspec",
+         "DeDeckerThomas/keyphrase-generation-t5-small-openkp"
+     ]
+ }
extraction/keyphrase_extraction_pipeline.py ADDED
@@ -0,0 +1,24 @@
+ from transformers import (
+     TokenClassificationPipeline,
+     AutoModelForTokenClassification,
+     AutoTokenizer,
+ )
+ from transformers.pipelines import AggregationStrategy
+ import numpy as np
+
+
+ class KeyphraseExtractionPipeline(TokenClassificationPipeline):
+     def __init__(self, model, *args, **kwargs):
+         super().__init__(
+             model=AutoModelForTokenClassification.from_pretrained(model),
+             tokenizer=AutoTokenizer.from_pretrained(model),
+             *args,
+             **kwargs
+         )
+
+     def postprocess(self, model_outputs):
+         results = super().postprocess(
+             model_outputs=model_outputs,
+             aggregation_strategy=AggregationStrategy.SIMPLE,
+         )
+         return np.unique([result.get("word").strip() for result in results])
extraction/keyphrase_generation_pipeline.py ADDED
@@ -0,0 +1,28 @@
+ from transformers import (
+     Text2TextGenerationPipeline,
+     AutoModelForSeq2SeqLM,
+     AutoTokenizer,
+ )
+
+
+ class KeyphraseGenerationPipeline(Text2TextGenerationPipeline):
+     def __init__(self, model, keyphrase_sep_token=";", *args, **kwargs):
+         super().__init__(
+             model=AutoModelForSeq2SeqLM.from_pretrained(model),
+             tokenizer=AutoTokenizer.from_pretrained(model),
+             *args,
+             **kwargs
+         )
+         self.keyphrase_sep_token = keyphrase_sep_token
+
+     def postprocess(self, model_outputs):
+         results = super().postprocess(model_outputs=model_outputs)
+         return [
+             [
+                 keyphrase.strip()
+                 for keyphrase in result.get("generated_text").split(
+                     self.keyphrase_sep_token
+                 )
+             ]
+             for result in results
+         ][0]
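
Note: a minimal sketch (not part of the commit) of how the two pipeline classes added here could be exercised outside the Streamlit app. The checkpoint names come from config.json above, and the direct pipe(text) call mirrors extract_keyphrases() in app.py:

    from extraction.keyphrase_extraction_pipeline import KeyphraseExtractionPipeline
    from extraction.keyphrase_generation_pipeline import KeyphraseGenerationPipeline

    text = "Keyphrase extraction is a technique in text analysis where you extract the important keyphrases from a text."

    # Extractive: token classification over the input, returning the unique
    # keyphrases that appear verbatim in the text.
    extractor = KeyphraseExtractionPipeline("DeDeckerThomas/keyphrase-extraction-kbir-inspec")
    print(extractor(text))

    # Generative: seq2seq decoding, returning keyphrases split on the ";"
    # separator token, so keyphrases absent from the text are possible too.
    generator = KeyphraseGenerationPipeline("DeDeckerThomas/keyphrase-generation-t5-small-inspec")
    print(generator(text))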