DeDeckerThomas committed on
Commit
55dc8b1
•
1 Parent(s): 8cbff17

Small bug fixes + Code clean-up

Browse files
README.md CHANGED
@@ -8,15 +8,6 @@ sdk_version: 1.2.0
8
  app_file: app.py
9
  pinned: false
10
  license: mit
11
- models:
12
- - DeDeckerThomas/keyphrase-extraction-kbir-inspec
13
- - DeDeckerThomas/keyphrase-extraction-distilbert-openkp
14
- - DeDeckerThomas/keyphrase-extraction-distilbert-kptimes
15
- - DeDeckerThomas/keyphrase-extraction-distilbert-inspec
16
- - DeDeckerThomas/keyphrase-extraction-kbir-kpcrowd
17
- - DeDeckerThomas/keyphrase-generation-keybart-inspec
18
- - DeDeckerThomas/keyphrase-generation-t5-small-inspec
19
- - DeDeckerThomas/keyphrase-generation-t5-small-openkp
20
  ---
21
 
22
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces#reference
 
8
  app_file: app.py
9
  pinned: false
10
  license: mit
 
 
 
 
 
 
 
 
 
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces#reference
app.py CHANGED
@@ -1,12 +1,12 @@
 
 
 
 
1
  import streamlit as st
2
- import pandas as pd
 
3
  from pipelines.keyphrase_extraction_pipeline import KeyphraseExtractionPipeline
4
  from pipelines.keyphrase_generation_pipeline import KeyphraseGenerationPipeline
5
- import orjson
6
-
7
- from annotated_text.util import get_annotated_html
8
- import re
9
- import string
10
 
11
 
12
  @st.cache(allow_output_mutation=True, show_spinner=False)
@@ -28,7 +28,7 @@ def extract_keyphrases():
28
  st.session_state.current_run_id += 1
29
 
30
 
31
- def get_annotated_text(text, keyphrases):
32
  for keyphrase in keyphrases:
33
  text = re.sub(
34
  rf"({keyphrase})([^A-Za-z])",
@@ -60,7 +60,7 @@ def get_annotated_text(text, keyphrases):
60
  word,
61
  ),
62
  "KEY",
63
- "#21c354",
64
  )
65
  )
66
  else:
@@ -73,25 +73,36 @@ def get_annotated_text(text, keyphrases):
73
  return result
74
 
75
 
76
- def render_output(layout, runs, reverse=False, multi_select=False):
77
  runs = list(runs.values())[::-1] if reverse else list(runs.values())
78
  for run in runs:
79
- layout.markdown(f"**⚙️ Output run {run.get('run_id')}**")
80
-
81
- layout.markdown(f"**Model**: {run.get('model')}")
82
- result = get_annotated_text(run.get("text"), list(run.get("keyphrases")))
83
-
84
  layout.markdown(
85
- get_annotated_html(*result),
 
 
 
86
  unsafe_allow_html=True,
87
  )
88
- if "generation" in st.session_state.chosen_model:
 
89
  abstractive_keyphrases = [
90
  keyphrase
91
  for keyphrase in run.get("keyphrases")
92
  if keyphrase.lower() not in run.get("text").lower()
93
  ]
94
- layout.write(", ".join(abstractive_keyphrases))
 
 
 
 
 
 
 
 
 
 
 
 
95
  layout.markdown("---")
96
 
97
 
@@ -102,10 +113,6 @@ if "config" not in st.session_state:
102
  st.session_state.history = {}
103
  st.session_state.keyphrases = []
104
  st.session_state.current_run_id = 1
105
- st.session_state.chosen_model = st.session_state.config.get("models")[0]
106
-
107
- if "select_rows" not in st.session_state:
108
- st.session_state.selected_rows = []
109
 
110
  st.set_page_config(
111
  page_icon="🔑",
@@ -130,11 +137,8 @@ context of a document, which is quite an improvement.
130
 
131
  This space gives you the ability to test around with some keyphrase extraction and generation models.
132
  Keyphrase extraction models are transformers models fine-tuned as a token classification problem where
133
- the tokens in a text are annotated as:
134
-
135
- * B: Beginning of a keyphrase
136
- * I: Inside a keyphrases
137
- * O: Outside a keyhprase.
138
 
139
  While keyphrase extraction can only extract keyphrases from a given text. Keyphrase generation models
140
  work a bit differently. Here you use an encoder-decoder model like BART to generate keyphrases from a given text.
@@ -156,23 +160,27 @@ with st.form("keyphrase-extraction-form"):
156
  f"For more information about the chosen model, please be sure to check out the [🤗 Model Card](https://huggingface.co/DeDeckerThomas/{st.session_state.chosen_model})."
157
  )
158
 
159
- st.session_state.input_text = st.text_area(
160
- "✍ Input", st.session_state.config.get("example_text"), height=250
161
- ).replace("\n", " ")
 
 
162
 
163
  with st.spinner("Extracting keyphrases..."):
164
  pressed = st.form_submit_button("Extract")
165
 
166
- if pressed:
167
  with st.spinner("Loading pipeline..."):
168
  pipe = load_pipeline(
169
  f"{st.session_state.config.get('model_author')}/{st.session_state.chosen_model}"
170
  )
171
  with st.spinner("Extracting keyphrases"):
172
  extract_keyphrases()
 
 
173
 
174
  options = st.multiselect(
175
- "Specify runs you want to see",
176
  st.session_state.history.keys(),
177
  format_func=lambda run_id: f"Run {run_id.split('_')[1]}",
178
  )
 
1
+ import re
2
+ import string
3
+
4
+ import orjson
5
  import streamlit as st
6
+ from annotated_text.util import get_annotated_html
7
+
8
  from pipelines.keyphrase_extraction_pipeline import KeyphraseExtractionPipeline
9
  from pipelines.keyphrase_generation_pipeline import KeyphraseGenerationPipeline
 
 
 
 
 
10
 
11
 
12
  @st.cache(allow_output_mutation=True, show_spinner=False)
 
28
  st.session_state.current_run_id += 1
29
 
30
 
31
+ def get_annotated_text(text, keyphrases, color="#d294ff"):
32
  for keyphrase in keyphrases:
33
  text = re.sub(
34
  rf"({keyphrase})([^A-Za-z])",
 
60
  word,
61
  ),
62
  "KEY",
63
+ color,
64
  )
65
  )
66
  else:
 
73
  return result
74
 
75
 
76
+ def render_output(layout, runs, reverse=False):
77
  runs = list(runs.values())[::-1] if reverse else list(runs.values())
78
  for run in runs:
 
 
 
 
 
79
  layout.markdown(
80
+ f"""
81
+ <p style=\"margin-bottom: 0rem\"><strong>Run:</strong> {run.get('run_id')}</p>
82
+ <p style=\"margin-bottom: 0rem\"><strong>Model:</strong> {run.get('model')}</p>
83
+ """,
84
  unsafe_allow_html=True,
85
  )
86
+
87
+ if "generation" in run.get("model"):
88
  abstractive_keyphrases = [
89
  keyphrase
90
  for keyphrase in run.get("keyphrases")
91
  if keyphrase.lower() not in run.get("text").lower()
92
  ]
93
+ layout.markdown(
94
+ f"<p style=\"margin-bottom: 0rem\"><strong>Absent keyphrases:</strong> {', '.join(abstractive_keyphrases) if abstractive_keyphrases else 'None' }</p>",
95
+ unsafe_allow_html=True,
96
+ )
97
+
98
+ result = get_annotated_text(run.get("text"), list(run.get("keyphrases")))
99
+ layout.markdown(
100
+ f"""
101
+ <p style="margin-bottom: 0.5rem"><strong>Text:</strong></p>
102
+ {get_annotated_html(*result)}
103
+ """,
104
+ unsafe_allow_html=True,
105
+ )
106
  layout.markdown("---")
107
 
108
 
 
113
  st.session_state.history = {}
114
  st.session_state.keyphrases = []
115
  st.session_state.current_run_id = 1
 
 
 
 
116
 
117
  st.set_page_config(
118
  page_icon="🔑",
 
137
 
138
  This space gives you the ability to test around with some keyphrase extraction and generation models.
139
  Keyphrase extraction models are transformers models fine-tuned as a token classification problem where
140
+ the tokens in a text are annotated as B (Beginning of a keyphrase), I (Inside a keyphrases),
141
+ and O (Outside a keyhprase).
 
 
 
142
 
143
  While keyphrase extraction can only extract keyphrases from a given text. Keyphrase generation models
144
  work a bit differently. Here you use an encoder-decoder model like BART to generate keyphrases from a given text.
 
160
  f"For more information about the chosen model, please be sure to check out the [🤗 Model Card](https://huggingface.co/DeDeckerThomas/{st.session_state.chosen_model})."
161
  )
162
 
163
+ st.session_state.input_text = (
164
+ st.text_area("✍ Input", st.session_state.config.get("example_text"), height=250)
165
+ .replace("\n", " ")
166
+ .strip()
167
+ )
168
 
169
  with st.spinner("Extracting keyphrases..."):
170
  pressed = st.form_submit_button("Extract")
171
 
172
+ if pressed and st.session_state.input_text != "":
173
  with st.spinner("Loading pipeline..."):
174
  pipe = load_pipeline(
175
  f"{st.session_state.config.get('model_author')}/{st.session_state.chosen_model}"
176
  )
177
  with st.spinner("Extracting keyphrases"):
178
  extract_keyphrases()
179
+ elif st.session_state.input_text == "":
180
+ st.error("The text input is empty 🙃 Please provide a text in the input field.")
181
 
182
  options = st.multiselect(
183
+ "Specify the runs you want to see",
184
  st.session_state.history.keys(),
185
  format_func=lambda run_id: f"Run {run_id.split('_')[1]}",
186
  )
pipelines/__pycache__/keyphrase_extraction_pipeline.cpython-39.pyc CHANGED
Binary files a/pipelines/__pycache__/keyphrase_extraction_pipeline.cpython-39.pyc and b/pipelines/__pycache__/keyphrase_extraction_pipeline.cpython-39.pyc differ
 
pipelines/__pycache__/keyphrase_generation_pipeline.cpython-39.pyc CHANGED
Binary files a/pipelines/__pycache__/keyphrase_generation_pipeline.cpython-39.pyc and b/pipelines/__pycache__/keyphrase_generation_pipeline.cpython-39.pyc differ
 
pipelines/keyphrase_extraction_pipeline.py CHANGED
@@ -1,10 +1,10 @@
 
1
  from transformers import (
2
- TokenClassificationPipeline,
3
  AutoModelForTokenClassification,
4
  AutoTokenizer,
 
5
  )
6
  from transformers.pipelines import AggregationStrategy
7
- import numpy as np
8
 
9
 
10
  class KeyphraseExtractionPipeline(TokenClassificationPipeline):
 
1
+ import numpy as np
2
  from transformers import (
 
3
  AutoModelForTokenClassification,
4
  AutoTokenizer,
5
+ TokenClassificationPipeline,
6
  )
7
  from transformers.pipelines import AggregationStrategy
 
8
 
9
 
10
  class KeyphraseExtractionPipeline(TokenClassificationPipeline):
pipelines/keyphrase_generation_pipeline.py CHANGED
@@ -1,10 +1,8 @@
1
- from transformers import (
2
- Text2TextGenerationPipeline,
3
- AutoModelForSeq2SeqLM,
4
- AutoTokenizer,
5
- )
6
  import string
7
 
 
 
 
8
 
9
  class KeyphraseGenerationPipeline(Text2TextGenerationPipeline):
10
  def __init__(self, model, keyphrase_sep_token=";", *args, **kwargs):
@@ -20,11 +18,11 @@ class KeyphraseGenerationPipeline(Text2TextGenerationPipeline):
20
  results = super().postprocess(model_outputs=model_outputs)
21
  return [
22
  [
23
- keyphrase.strip().translate(str.maketrans('', '', string.punctuation))
24
  for keyphrase in result.get("generated_text").split(
25
  self.keyphrase_sep_token
26
  )
27
- if keyphrase.translate(str.maketrans('', '', string.punctuation)) != ""
28
  ]
29
  for result in results
30
  ][0]
 
 
 
 
 
 
1
  import string
2
 
3
+ from transformers import (AutoModelForSeq2SeqLM, AutoTokenizer,
4
+ Text2TextGenerationPipeline)
5
+
6
 
7
  class KeyphraseGenerationPipeline(Text2TextGenerationPipeline):
8
  def __init__(self, model, keyphrase_sep_token=";", *args, **kwargs):
 
18
  results = super().postprocess(model_outputs=model_outputs)
19
  return [
20
  [
21
+ keyphrase.strip().translate(str.maketrans("", "", string.punctuation))
22
  for keyphrase in result.get("generated_text").split(
23
  self.keyphrase_sep_token
24
  )
25
+ if keyphrase.translate(str.maketrans("", "", string.punctuation)) != ""
26
  ]
27
  for result in results
28
  ][0]
requirements.txt CHANGED
@@ -2,4 +2,4 @@ orjson==3.6.8
2
  transformers[torch]==4.17.0
3
  pandas==1.4.1
4
  numpy==1.22.3
5
- st-annotated-text==3.0.0
 
2
  transformers[torch]==4.17.0
3
  pandas==1.4.1
4
  numpy==1.22.3
5
+ st-annotated-text==3.0.0