numBery committed on
Commit
6ed92b7
1 Parent(s): 34f4c1d

added parameter sliders, added ONNX optimization for T5

Browse files
Files changed (1) hide show
  1. app.py +25 -12
app.py CHANGED
@@ -7,6 +7,7 @@ from keybert import KeyBERT
7
  from sentence_transformers import SentenceTransformer
8
  from keyphrase_vectorizers import KeyphraseCountVectorizer
9
  from transformers import T5ForConditionalGeneration,T5Tokenizer
 
10
 
11
  import nltk
12
  from nltk.tokenize import sent_tokenize
@@ -17,26 +18,26 @@ import streamlit as st
17
  import traceback
18
  import logging
19
 
20
-
21
- nltk.download('stopwords')
22
- nltk.download('punkt')
23
-
24
  logger = logging.getLogger(__name__)
25
 
26
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
27
 
28
  HfFolder.save_token(st.secrets["hf-auth-token"])
29
-
30
 
31
  @st.cache(allow_output_mutation=True)
32
  def load_model():
33
  try:
 
 
34
  # Load KeyBert Model
35
  tmp_model = SentenceTransformer('valurank/MiniLM-L6-Keyword-Extraction', use_auth_token=True)
36
  kw_extractor = KeyBERT(tmp_model)
37
 
38
  # Load T5 for Paraphrasing
39
- t5_model = T5ForConditionalGeneration.from_pretrained('valurank/t5-paraphraser', use_auth_token=True)
 
 
40
  t5_tokenizer = T5Tokenizer.from_pretrained('t5-base')
41
  t5_model = t5_model.to(device)
42
  return kw_extractor, t5_model, t5_tokenizer
@@ -90,7 +91,7 @@ def t5_paraphraser(text, number_of_results=5):
90
 
91
 
92
  #### Extract Sentences with Keywords -> Paraphrase multiple versions -> Extract Keywords again
93
- def extract_paraphrased_sentences(article):
94
  try:
95
  start1 = time.time()
96
  with st.spinner('Extraction Keywords from Original Document...'):
@@ -106,8 +107,8 @@ def extract_paraphrased_sentences(article):
106
 
107
  for sent in target_sentences:
108
  ### T5
109
- t5_paraphrased = t5_paraphraser(sent)
110
- t5_keywords = [get_keybert_results_with_vectorizer(i) for i in t5_paraphrased]
111
  t5_keywords = [(word[0], word[1]) for s in t5_keywords for word in s]
112
 
113
  t5_paraphrasing_keywords.extend(t5_keywords)
@@ -126,13 +127,25 @@ def extract_paraphrased_sentences(article):
126
  st.error('Error running Extraction Pipeline. Please contact admin')
127
  logger.error(traceback.format_exc())
128
 
129
- doc = st.text_area("Enter a custom document")
130
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  if doc:
132
- t5_keywords_df, original_keywords_df, unique_keywords_df, total_end = extract_paraphrased_sentences(doc)
133
 
134
  # extract_paraphrased_article(input_list[0])
135
- st.text(f'T5 PARAPHRASING RUNTIME: {total_end}\n')
136
 
137
  st.subheader('\nOriginal Keywords Extracted:\n\n')
138
  st.dataframe(original_keywords_df)
 
7
  from sentence_transformers import SentenceTransformer
8
  from keyphrase_vectorizers import KeyphraseCountVectorizer
9
  from transformers import T5ForConditionalGeneration,T5Tokenizer
10
+ from fastT5 import export_and_get_onnx_model, set_auth_token
11
 
12
  import nltk
13
  from nltk.tokenize import sent_tokenize
 
18
  import traceback
19
  import logging
20
 
 
 
 
 
21
  logger = logging.getLogger(__name__)
22
 
23
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
24
 
25
  HfFolder.save_token(st.secrets["hf-auth-token"])
26
+ set_auth_token(st.secrets["hf-auth-token"])
27
 
28
  @st.cache(allow_output_mutation=True)
29
  def load_model():
30
  try:
31
+ nltk.download('stopwords')
32
+ nltk.download('punkt')
33
  # Load KeyBert Model
34
  tmp_model = SentenceTransformer('valurank/MiniLM-L6-Keyword-Extraction', use_auth_token=True)
35
  kw_extractor = KeyBERT(tmp_model)
36
 
37
  # Load T5 for Paraphrasing
38
+
39
+ # t5_model = T5ForConditionalGeneration.from_pretrained('valurank/t5-paraphraser', use_auth_token=True)
40
+ t5_model = export_and_get_onnx_model('valurank/t5-paraphraser')
41
  t5_tokenizer = T5Tokenizer.from_pretrained('t5-base')
42
  t5_model = t5_model.to(device)
43
  return kw_extractor, t5_model, t5_tokenizer
 
91
 
92
 
93
  #### Extract Sentences with Keywords -> Paraphrase multiple versions -> Extract Keywords again
94
+ def extract_paraphrased_sentences(article, number_of_keywords, number_of_paraphrases):
95
  try:
96
  start1 = time.time()
97
  with st.spinner('Extraction Keywords from Original Document...'):
 
107
 
108
  for sent in target_sentences:
109
  ### T5
110
+ t5_paraphrased = t5_paraphraser(sent, number_of_results = number_of_paraphrases)
111
+ t5_keywords = [get_keybert_results_with_vectorizer(i, number_of_results = number_of_keywords) for i in t5_paraphrased]
112
  t5_keywords = [(word[0], word[1]) for s in t5_keywords for word in s]
113
 
114
  t5_paraphrasing_keywords.extend(t5_keywords)
 
127
  st.error('Error running Extraction Pipeline. Please contact admin')
128
  logger.error(traceback.format_exc())
129
 
 
130
 
131
+ st.title('Exhaustive Keyword Extraction with Paraphrasing')
132
+ with st.sidebar:
133
+ st.header('Overview')
134
+ st.markdown('This demo allows users to input text article and generate synonym-aware keywords. The pipeline includes the use of T5 Model for paraphrasing target sentences, and Sentence-transformers based Keyword Extraction')
135
+
136
+ st.header('Parameters')
137
+ number_of_keywords = st.slider('Number of Keywords to extract for each target sentence', min_value=5, max_value=50, step=5, value=20)
138
+ number_of_paraphrases = st.slider('Number of Paraphrased versions to generate for each target sentence', min_value=1, max_value=20, step=1, value=5)
139
+
140
+ st.header('Specifications')
141
+ st.markdown('To generate context aware and OOV keywords, we first run KeyBert for keyword extraction on the original article. The sentences which had Keywords are then passed through T5 for generating multiple paraphrased versions. These paraphrased sentences are then run through Keyword Extraction again to generate the final results')
142
+
143
+ doc = st.text_area("Enter a custom document")
144
  if doc:
145
+ t5_keywords_df, original_keywords_df, unique_keywords_df, total_end = extract_paraphrased_sentences(doc, number_of_keywords, number_of_paraphrases)
146
 
147
  # extract_paraphrased_article(input_list[0])
148
+ st.text(f'PIPELINE RUNTIME: {total_end}\n')
149
 
150
  st.subheader('\nOriginal Keywords Extracted:\n\n')
151
  st.dataframe(original_keywords_df)