DeDeckerThomas committed
Commit • 55dc8b1
1 Parent(s): 8cbff17
Small bug fixes + Code clean-up
README.md
CHANGED
@@ -8,15 +8,6 @@ sdk_version: 1.2.0
 app_file: app.py
 pinned: false
 license: mit
-models:
-  - DeDeckerThomas/keyphrase-extraction-kbir-inspec
-  - DeDeckerThomas/keyphrase-extraction-distilbert-openkp
-  - DeDeckerThomas/keyphrase-extraction-distilbert-kptimes
-  - DeDeckerThomas/keyphrase-extraction-distilbert-inspec
-  - DeDeckerThomas/keyphrase-extraction-kbir-kpcrowd
-  - DeDeckerThomas/keyphrase-generation-keybart-inspec
-  - DeDeckerThomas/keyphrase-generation-t5-small-inspec
-  - DeDeckerThomas/keyphrase-generation-t5-small-openkp
 ---

 Check out the configuration reference at https://huggingface.co/docs/hub/spaces#reference
app.py
CHANGED
@@ -1,12 +1,12 @@
+import re
+import string
+
+import orjson
 import streamlit as st
-
+from annotated_text.util import get_annotated_html
+
 from pipelines.keyphrase_extraction_pipeline import KeyphraseExtractionPipeline
 from pipelines.keyphrase_generation_pipeline import KeyphraseGenerationPipeline
-import orjson
-
-from annotated_text.util import get_annotated_html
-import re
-import string


 @st.cache(allow_output_mutation=True, show_spinner=False)
@@ -28,7 +28,7 @@ def extract_keyphrases():
     st.session_state.current_run_id += 1


-def get_annotated_text(text, keyphrases):
+def get_annotated_text(text, keyphrases, color="#d294ff"):
     for keyphrase in keyphrases:
         text = re.sub(
             rf"({keyphrase})([^A-Za-z])",
@@ -60,7 +60,7 @@ def get_annotated_text(text, keyphrases):
                         word,
                     ),
                     "KEY",
-                    "#d294ff",
+                    color,
                 )
             )
         else:
@@ -73,25 +73,36 @@ def get_annotated_text(text, keyphrases):
     return result


 def render_output(layout, runs, reverse=False):
     runs = list(runs.values())[::-1] if reverse else list(runs.values())
     for run in runs:
-        layout.markdown(f"**βοΈ Output run {run.get('run_id')}**")
-
-        layout.markdown(f"**Model**: {run.get('model')}")
-        result = get_annotated_text(run.get("text"), list(run.get("keyphrases")))
-
         layout.markdown(
-            get_annotated_html(*result),
+            f"""
+            <p style=\"margin-bottom: 0rem\"><strong>Run:</strong> {run.get('run_id')}</p>
+            <p style=\"margin-bottom: 0rem\"><strong>Model:</strong> {run.get('model')}</p>
+            """,
             unsafe_allow_html=True,
         )
-
+
+        if "generation" in run.get("model"):
             abstractive_keyphrases = [
                 keyphrase
                 for keyphrase in run.get("keyphrases")
                 if keyphrase.lower() not in run.get("text").lower()
             ]
-            layout.
+            layout.markdown(
+                f"<p style=\"margin-bottom: 0rem\"><strong>Absent keyphrases:</strong> {', '.join(abstractive_keyphrases) if abstractive_keyphrases else 'None' }</p>",
+                unsafe_allow_html=True,
+            )
+
+        result = get_annotated_text(run.get("text"), list(run.get("keyphrases")))
+        layout.markdown(
+            f"""
+            <p style="margin-bottom: 0.5rem"><strong>Text:</strong></p>
+            {get_annotated_html(*result)}
+            """,
+            unsafe_allow_html=True,
+        )
         layout.markdown("---")


@@ -102,10 +113,6 @@ if "config" not in st.session_state:
     st.session_state.history = {}
     st.session_state.keyphrases = []
     st.session_state.current_run_id = 1
-    st.session_state.chosen_model = st.session_state.config.get("models")[0]
-
-if "select_rows" not in st.session_state:
-    st.session_state.selected_rows = []

 st.set_page_config(
     page_icon="π",
@@ -130,11 +137,8 @@ context of a document, which is quite an improvement.

 This space gives you the ability to test around with some keyphrase extraction and generation models.
 Keyphrase extraction models are transformers models fine-tuned as a token classification problem where
-the tokens in a text are annotated as
-
-* B: Beginning of a keyphrase
-* I: Inside a keyphrases
-* O: Outside a keyhprase.
+the tokens in a text are annotated as B (Beginning of a keyphrase), I (Inside a keyphrases),
+and O (Outside a keyhprase).

 While keyphrase extraction can only extract keyphrases from a given text. Keyphrase generation models
 work a bit differently. Here you use an encoder-decoder model like BART to generate keyphrases from a given text.
@@ -156,23 +160,27 @@ with st.form("keyphrase-extraction-form"):
         f"For more information about the chosen model, please be sure to check out the [π€ Model Card](https://huggingface.co/DeDeckerThomas/{st.session_state.chosen_model})."
     )

-    st.session_state.input_text = st.text_area(
-        "β Input", st.session_state.config.get("example_text"), height=250
-    )
+    st.session_state.input_text = (
+        st.text_area("β Input", st.session_state.config.get("example_text"), height=250)
+        .replace("\n", " ")
+        .strip()
+    )

     with st.spinner("Extracting keyphrases..."):
         pressed = st.form_submit_button("Extract")

-    if pressed:
+    if pressed and st.session_state.input_text != "":
         with st.spinner("Loading pipeline..."):
             pipe = load_pipeline(
                 f"{st.session_state.config.get('model_author')}/{st.session_state.chosen_model}"
             )
         with st.spinner("Extracting keyphrases"):
             extract_keyphrases()
+    elif st.session_state.input_text == "":
+        st.error("The text input is empty π Please provide a text in the input field.")

     options = st.multiselect(
-        "Specify runs you want to see",
+        "Specify the runs you want to see",
         st.session_state.history.keys(),
         format_func=lambda run_id: f"Run {run_id.split('_')[1]}",
     )
pipelines/__pycache__/keyphrase_extraction_pipeline.cpython-39.pyc
CHANGED
Binary files a/pipelines/__pycache__/keyphrase_extraction_pipeline.cpython-39.pyc and b/pipelines/__pycache__/keyphrase_extraction_pipeline.cpython-39.pyc differ
pipelines/__pycache__/keyphrase_generation_pipeline.cpython-39.pyc
CHANGED
Binary files a/pipelines/__pycache__/keyphrase_generation_pipeline.cpython-39.pyc and b/pipelines/__pycache__/keyphrase_generation_pipeline.cpython-39.pyc differ
pipelines/keyphrase_extraction_pipeline.py
CHANGED
@@ -1,10 +1,10 @@
+import numpy as np
 from transformers import (
-    TokenClassificationPipeline,
     AutoModelForTokenClassification,
     AutoTokenizer,
+    TokenClassificationPipeline,
 )
 from transformers.pipelines import AggregationStrategy
-import numpy as np


 class KeyphraseExtractionPipeline(TokenClassificationPipeline):
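The diff above only touches this file's imports. For orientation, here is a minimal sketch of how a TokenClassificationPipeline subclass of this kind is commonly wired up; the class body (constructor and postprocess) is an assumption for illustration, not this file's actual contents.

import numpy as np
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    TokenClassificationPipeline,
)
from transformers.pipelines import AggregationStrategy


# Assumed implementation: load a model and tokenizer from a checkpoint name and
# aggregate the B/I/O token predictions into whole, deduplicated keyphrases.
class KeyphraseExtractionPipeline(TokenClassificationPipeline):
    def __init__(self, model, *args, **kwargs):
        super().__init__(
            model=AutoModelForTokenClassification.from_pretrained(model),
            tokenizer=AutoTokenizer.from_pretrained(model),
            *args,
            **kwargs,
        )

    def postprocess(self, model_outputs):
        results = super().postprocess(
            model_outputs=model_outputs,
            aggregation_strategy=AggregationStrategy.FIRST,
        )
        # Each aggregated entity is one keyphrase; strip and deduplicate them.
        return np.unique([result.get("word").strip() for result in results])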
pipelines/keyphrase_generation_pipeline.py
CHANGED
@@ -1,10 +1,8 @@
-from transformers import (
-    Text2TextGenerationPipeline,
-    AutoModelForSeq2SeqLM,
-    AutoTokenizer,
-)
 import string

+from transformers import (AutoModelForSeq2SeqLM, AutoTokenizer,
+                          Text2TextGenerationPipeline)
+

 class KeyphraseGenerationPipeline(Text2TextGenerationPipeline):
     def __init__(self, model, keyphrase_sep_token=";", *args, **kwargs):
@@ -20,11 +18,11 @@ class KeyphraseGenerationPipeline(Text2TextGenerationPipeline):
         results = super().postprocess(model_outputs=model_outputs)
         return [
             [
-                keyphrase.strip().translate(str.maketrans(
+                keyphrase.strip().translate(str.maketrans("", "", string.punctuation))
                 for keyphrase in result.get("generated_text").split(
                     self.keyphrase_sep_token
                 )
-                if keyphrase.translate(str.maketrans(
+                if keyphrase.translate(str.maketrans("", "", string.punctuation)) != ""
             ]
             for result in results
         ][0]
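The reworked postprocess splits the generated text on the separator token and uses str.maketrans("", "", string.punctuation) to build a translation table that deletes all punctuation, dropping any keyphrase that is empty after cleaning. A small standalone illustration of that cleanup; the sample generated_text string is invented:

import string

# A translation table that deletes every punctuation character.
strip_punct = str.maketrans("", "", string.punctuation)

generated_text = "keyphrase extraction; transformers;"  # hypothetical model output
keyphrase_sep_token = ";"

keyphrases = [
    kp.strip().translate(strip_punct)
    for kp in generated_text.split(keyphrase_sep_token)
    if kp.translate(strip_punct) != ""  # drop pieces that are empty once punctuation is removed
]
print(keyphrases)  # ['keyphrase extraction', 'transformers']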
requirements.txt
CHANGED
@@ -2,4 +2,4 @@ orjson==3.6.8
 transformers[torch]==4.17.0
 pandas==1.4.1
 numpy==1.22.3
-st-annotated-text==3.0.0
+st-annotated-text==3.0.0