added parameter sliders, added ONNX optimization for T5
app.py CHANGED
@@ -7,6 +7,7 @@ from keybert import KeyBERT
 from sentence_transformers import SentenceTransformer
 from keyphrase_vectorizers import KeyphraseCountVectorizer
 from transformers import T5ForConditionalGeneration,T5Tokenizer
+from fastT5 import export_and_get_onnx_model, set_auth_token
 
 import nltk
 from nltk.tokenize import sent_tokenize
@@ -17,26 +18,26 @@ import streamlit as st
 import traceback
 import logging
 
-
-nltk.download('stopwords')
-nltk.download('punkt')
-
 logger = logging.getLogger(__name__)
 
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
 HfFolder.save_token(st.secrets["hf-auth-token"])
-
+set_auth_token(st.secrets["hf-auth-token"])
 
 @st.cache(allow_output_mutation=True)
 def load_model():
     try:
+        nltk.download('stopwords')
+        nltk.download('punkt')
         # Load KeyBert Model
         tmp_model = SentenceTransformer('valurank/MiniLM-L6-Keyword-Extraction', use_auth_token=True)
         kw_extractor = KeyBERT(tmp_model)
 
         # Load T5 for Paraphrasing
-        t5_model = T5ForConditionalGeneration.from_pretrained('valurank/t5-paraphraser', use_auth_token=True)
+
+        # t5_model = T5ForConditionalGeneration.from_pretrained('valurank/t5-paraphraser', use_auth_token=True)
+        t5_model = export_and_get_onnx_model('valurank/t5-paraphraser')
         t5_tokenizer = T5Tokenizer.from_pretrained('t5-base')
         t5_model = t5_model.to(device)
         return kw_extractor, t5_model, t5_tokenizer
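For context on the ONNX switch: fastT5 exports the T5 encoder and decoder to ONNX graphs (quantized by default) and wraps them in an object that keeps the usual generate() API, which is why the surrounding tokenizer and generation code can stay unchanged. Note that inference then runs through onnxruntime, so the retained t5_model.to(device) call presumably affects only the PyTorch wrapper, not the ONNX session. A minimal sketch of the export flow against the public t5-small checkpoint (the private valurank/t5-paraphraser would additionally need the auth token set, as the diff does):

from fastT5 import export_and_get_onnx_model
from transformers import T5Tokenizer

# The first call exports and quantizes the model into an ONNX cache
# directory; subsequent runs can load the cached graphs instead.
model = export_and_get_onnx_model('t5-small')
tokenizer = T5Tokenizer.from_pretrained('t5-small')

tokens = tokenizer('translate English to French: The weather is nice', return_tensors='pt')
output = model.generate(input_ids=tokens['input_ids'],
                        attention_mask=tokens['attention_mask'],
                        num_beams=2)
print(tokenizer.decode(output[0], skip_special_tokens=True))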
@@ -90,7 +91,7 @@ def t5_paraphraser(text, number_of_results=5):
 
 
 #### Extract Sentences with Keywords -> Paraphrase multiple versions -> Extract Keywords again
-def extract_paraphrased_sentences(article):
+def extract_paraphrased_sentences(article, number_of_keywords, number_of_paraphrases):
     try:
         start1 = time.time()
         with st.spinner('Extraction Keywords from Original Document...'):
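Only the signature of t5_paraphraser is visible here, in the hunk header above; its body sits outside the diff. A plausible sketch of such a helper, assuming the common prompt-prefix-plus-sampling setup (the 'paraphrase:' prefix and all generation parameters below are illustrative assumptions, not taken from this commit):

def t5_paraphraser(text, number_of_results=5):
    # Assumed shape of the helper: encode the sentence with a task prefix,
    # then sample several candidate paraphrases in one generate() call.
    input_ids = t5_tokenizer.encode(f"paraphrase: {text}", return_tensors='pt').to(device)
    outputs = t5_model.generate(input_ids,
                                do_sample=True,
                                top_k=120,
                                top_p=0.98,
                                max_length=256,
                                num_return_sequences=number_of_results)
    return [t5_tokenizer.decode(o, skip_special_tokens=True) for o in outputs]

Under this reading, the new number_of_paraphrases slider value flows through number_of_results into num_return_sequences.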
@@ -106,8 +107,8 @@ def extract_paraphrased_sentences(article):
 
         for sent in target_sentences:
             ### T5
-            t5_paraphrased = t5_paraphraser(sent)
-            t5_keywords = [get_keybert_results_with_vectorizer(i) for i in t5_paraphrased]
+            t5_paraphrased = t5_paraphraser(sent, number_of_results = number_of_paraphrases)
+            t5_keywords = [get_keybert_results_with_vectorizer(i, number_of_results = number_of_keywords) for i in t5_paraphrased]
             t5_keywords = [(word[0], word[1]) for s in t5_keywords for word in s]
 
             t5_paraphrasing_keywords.extend(t5_keywords)
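get_keybert_results_with_vectorizer is likewise defined outside the hunks shown. Given the KeyphraseCountVectorizer import, a plausible reading is KeyBERT scoring vectorizer-proposed candidate phrases; a sketch under that assumption (the helper body and its default are hypothetical):

def get_keybert_results_with_vectorizer(text, number_of_results=20):
    # Assumed helper: the vectorizer proposes candidate noun phrases,
    # KeyBERT scores them against the document embedding.
    return kw_extractor.extract_keywords(text,
                                         vectorizer=KeyphraseCountVectorizer(),
                                         top_n=number_of_results)

KeyBERT returns a list of (phrase, score) tuples, which matches the (word[0], word[1]) unpacking in the hunk above.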
@@ -126,13 +127,25 @@ def extract_paraphrased_sentences(article):
         st.error('Error running Extraction Pipeline. Please contact admin')
         logger.error(traceback.format_exc())
 
-doc = st.text_area("Enter a custom document")
 
+st.title('Exhaustive Keyword Extraction with Paraphrasing')
+with st.sidebar:
+    st.header('Overview')
+    st.markdown('This demo allows users to input text article and generate synonym-aware keywords. The pipeline includes the use of T5 Model for paraphrasing target sentences, and Sentence-transformers based Keyword Extraction')
+
+    st.header('Parameters')
+    number_of_keywords = st.slider('Number of Keywords to extract for each target sentence', min_value=5, max_value=50, step=5, value=20)
+    number_of_paraphrases = st.slider('Number of Paraphrased versions to generate for each target sentence', min_value=1, max_value=20, step=1, value=5)
+
+    st.header('Specifications')
+    st.markdown('To generate context aware and OOV keywords, we first run KeyBert for keyword extraction on the original article. The sentences which had Keywords are then passed through T5 for generating multiple paraphrased versions. These paraphrased sentences are then run through Keyword Extraction again to generate the final results')
+
+doc = st.text_area("Enter a custom document")
 if doc:
-    t5_keywords_df, original_keywords_df, unique_keywords_df, total_end = extract_paraphrased_sentences(doc)
+    t5_keywords_df, original_keywords_df, unique_keywords_df, total_end = extract_paraphrased_sentences(doc, number_of_keywords, number_of_paraphrases)
 
     # extract_paraphrased_article(input_list[0])
-    st.text(f'
+    st.text(f'PIPELINE RUNTIME: {total_end}\n')
 
     st.subheader('\nOriginal Keywords Extracted:\n\n')
     st.dataframe(original_keywords_df)
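The sidebar changes rely on Streamlit's execution model: every widget interaction reruns the script top to bottom, so by the time extract_paraphrased_sentences(doc, number_of_keywords, number_of_paraphrases) runs, both slider variables already hold the user's current choices. A self-contained sketch of the same wiring (widget labels and names here are illustrative):

import streamlit as st

with st.sidebar:
    top_n = st.slider('Keywords per sentence', min_value=5, max_value=50, step=5, value=20)

doc = st.text_area('Enter a custom document')
if doc:
    # The script reruns top-to-bottom on each interaction, so top_n
    # always reflects the slider's current position here.
    st.write(f'Would extract {top_n} keywords from a {len(doc)}-character document')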
|