ck46 committed on
Commit
dc66d85
1 Parent(s): 62a08f4

initial application

Files changed (3)
  1. app.py +39 -0
  2. paraphraser.py +189 -0
  3. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,39 @@
+ import streamlit as st
+
+ from paraphraser import get_key_sentences, ParaphraseModel
+
+ paraphraser = ParaphraseModel()
+
+
+ # Add a model selector to the sidebar:
+ model = st.sidebar.selectbox(
+     'Select Model',
+     ('T5-base', 'DistilT5-base', 'T5-small')
+ )
+
+
+ top_k = st.sidebar.slider('Top_K', 100, 300, 168)
+
+ top_p = st.sidebar.slider('Top_P', 0.0, 1.0, 0.95)
+
+ st.header("Bullet-point Summarization")
+ st.write(f'Model in use: {model}')
+
+ txt = st.text_area('Text to analyze')
+
+ if len(txt) >= 1:
+     key_sentences = get_key_sentences(txt) or {}  # get_key_sentences may return None if keyword extraction fails
+     sentences = []
+     for i in sorted(key_sentences):
+         sentences.append(key_sentences[i])
+
+     paraphrased_sentences = paraphraser(sentences, top_k=top_k, top_p=top_p, num_sequences=1)
+ else:
+     sentences = []
+     paraphrased_sentences = []
+
+ st.header('Extracted Key Sentences')
+ st.write(sentences)
+
+ st.header('Paraphrase results')
+ st.write(paraphrased_sentences)
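
Note (not part of the commit): app.py constructs ParaphraseModel at module level, so every widget interaction reruns the script and reloads the checkpoint. Below is a minimal sketch of how the load could be cached, assuming a Streamlit release that provides st.cache_resource; the helper name load_paraphraser is illustrative.

import streamlit as st
from paraphraser import ParaphraseModel

@st.cache_resource
def load_paraphraser():
    # Loaded once per process; Streamlit reuses the object across reruns.
    return ParaphraseModel()

paraphraser = load_paraphraser()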
paraphraser.py ADDED
@@ -0,0 +1,189 @@
+ import re
+ import numpy as np
+ import itertools
+ import torch
+
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+ from sentence_transformers import SentenceTransformer
+ from sklearn.feature_extraction.text import CountVectorizer
+ from sklearn.metrics.pairwise import cosine_similarity
+
+
+ class KeywordExtraction:
+     def __init__(self, n_gram_range=(1, 1), stop_words='english', model_name='distilbert-base-nli-mean-tokens'):
+         self.n_gram_range = n_gram_range
+         self.stop_words = stop_words
+         self.model_name = model_name
+         self.model = SentenceTransformer(self.model_name)
+
+     def __call__(self, doc, top_n=5, diversity=('mmr', 0.7)):
+         doc_embedding = self.get_document_embeddings(doc)
+         candidates = self.get_candidates(doc)
+         candidate_embeddings = self.get_candidate_embeddings(candidates)
+         try:
+             if diversity[0] == 'mmr':
+                 # print('using maximal marginal relevance method...')
+                 return self.maximal_marginal_relevance(doc_embedding,
+                                                        candidate_embeddings,
+                                                        candidates,
+                                                        top_n=top_n,
+                                                        diversity=diversity[1])
+             elif diversity[0] == 'mss':
+                 # print('using max sum similarity method...')
+                 return self.max_sum_similarity(doc_embedding,
+                                                candidate_embeddings,
+                                                candidates,
+                                                top_n=top_n,
+                                                nr_candidates=diversity[1])
+             else:
+                 # print('using default method...')
+                 return self.get_keywords(doc_embedding, candidate_embeddings, candidates, top_n)
+         except Exception as e:
+             print(e)  # fall through and return None so callers can handle the failure
+
+     def get_candidates(self, doc):
+         # Extract candidate words/phrases
+         count = CountVectorizer(ngram_range=self.n_gram_range, stop_words=self.stop_words).fit([doc])
+         return count.get_feature_names_out()
+
+     def get_candidate_embeddings(self, candidates):
+         return self.model.encode(candidates)
+
+     def get_document_embeddings(self, doc):
+         return self.model.encode([doc])
+
+     def get_keywords(self, doc_embedding, candidate_embeddings, candidates, top_n=5):
+         distances = cosine_similarity(doc_embedding, candidate_embeddings)
+         keywords = [candidates[index] for index in distances.argsort()[0][-top_n:]]
+         return keywords
+
+     def max_sum_similarity(self, doc_embedding, candidate_embeddings, candidates, top_n, nr_candidates):
+         # Calculate distances and extract keywords
+         distances = cosine_similarity(doc_embedding, candidate_embeddings)
+         distances_candidates = cosine_similarity(candidate_embeddings,
+                                                  candidate_embeddings)
+
+         # Get top_n words as candidates based on cosine similarity
+         words_idx = list(distances.argsort()[0][-nr_candidates:])
+         words_vals = [candidates[index] for index in words_idx]
+         distances_candidates = distances_candidates[np.ix_(words_idx, words_idx)]
+
+         # Calculate the combination of words that are the least similar to each other
+         min_sim = np.inf
+         candidate = None
+         for combination in itertools.combinations(range(len(words_idx)), top_n):
+             sim = sum([distances_candidates[i][j] for i in combination for j in combination if i != j])
+             if sim < min_sim:
+                 candidate = combination
+                 min_sim = sim
+
+         return [words_vals[idx] for idx in candidate]
+
+     def maximal_marginal_relevance(self, doc_embedding, word_embeddings, words, top_n, diversity):
+         # Extract similarity within words, and between words and the document
+         word_doc_similarity = cosine_similarity(word_embeddings, doc_embedding)
+         word_similarity = cosine_similarity(word_embeddings)
+
+         # Initialize candidates and choose the best keyword/keyphrase first
+         keywords_idx = [np.argmax(word_doc_similarity)]
+         candidates_idx = [i for i in range(len(words)) if i != keywords_idx[0]]
+
+         for _ in range(top_n - 1):
+             # Extract similarities within candidates and
+             # between candidates and selected keywords/phrases
+             candidate_similarities = word_doc_similarity[candidates_idx, :]
+             target_similarities = np.max(word_similarity[candidates_idx][:, keywords_idx], axis=1)
+
+             # Calculate MMR
+             mmr = (1 - diversity) * candidate_similarities - diversity * target_similarities.reshape(-1, 1)
+             mmr_idx = candidates_idx[np.argmax(mmr)]
+
+             # Update keywords & candidates
+             keywords_idx.append(mmr_idx)
+             candidates_idx.remove(mmr_idx)
+
+         return [words[idx] for idx in keywords_idx]
+
+
+ def regex(phrase, m=0, n=3):
+     strng = r"([\s]*[a-zA-Z0-9]*[\s]*){%d,%d}" % (m, n)  # allow up to n filler words between the words of the phrase
+     return strng.join(phrase.split())
+
+ def remove_square_brackets(text):
+     return re.sub(r'\[[0-9]+\]', '', text)
+
+ def remove_extra_spaces(text):
+     return re.sub(r'[\s]{2,}', ' ', text)
+
+
+ def preprocess_text(text):
+     text = remove_square_brackets(text)
+     text = remove_extra_spaces(text)
+     text = text.strip()
+     return text
+
+ def sent_tokenize(text):
+     sents = text.split('.')
+     sents = [s.strip() for s in sents if s.strip()]
+     return sents
+
+ def get_key_sentences(text, top_n=5, diversity=('mmr', 0.6)):
+     kw_extractor = KeywordExtraction(n_gram_range=(1, 3))
+     text = preprocess_text(text)
+     sentences = sent_tokenize(text)
+     key_phrases = kw_extractor(text, top_n=top_n, diversity=diversity)
+
+     if key_phrases is None:
+         return None
+
+     key_sents = dict()
+     for phrase in key_phrases:
+         found = False
+         for i, sent in enumerate(sentences):
+             if re.search(regex(phrase), sent, flags=re.IGNORECASE):  # candidates are lowercased by CountVectorizer
+                 found = True
+                 if i not in key_sents:
+                     key_sents[i] = sent
+         if not found:
+             print(f'The phrase "{phrase}" was not matched!')
+     return key_sents
+
+
+ class ParaphraseModel:
+     def __init__(self, model_name="Vamsi/T5_Paraphrase_Paws"):
+         self.model_name = model_name
+         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+         self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name).to(self.device)  # keep the model on the same device as its inputs
+
+     def __call__(self, inputs, top_k=200, top_p=0.95, num_sequences=5):
+         text = self.prepare_list_input(inputs) if isinstance(inputs, list) else [f"paraphrase: {inputs} </s>"]  # always pass a list to batch_encode_plus
+
+         encoding = self.tokenizer.batch_encode_plus(text, padding=True, return_tensors="pt")
+
+         input_ids = encoding["input_ids"].to(self.device)
+         attention_masks = encoding["attention_mask"].to(self.device)
+
+         outputs = self.model.generate(
+             input_ids=input_ids, attention_mask=attention_masks,
+             max_length=256,
+             do_sample=True,
+             top_k=top_k,
+             top_p=top_p,
+             early_stopping=True,
+             num_return_sequences=num_sequences
+         )
+
+         lines = []
+         for output in outputs:
+             line = self.tokenizer.decode(output,
+                                          skip_special_tokens=True,
+                                          clean_up_tokenization_spaces=True)
+             lines.append(line)
+         return lines
+
+     def prepare_list_input(self, lst):
+         sentences = []
+         for sent in lst:
+             sentences.append(f"paraphrase: {sent} </s>")
+         return sentences
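
A minimal usage sketch (not part of the commit) of the two entry points the app consumes, get_key_sentences and ParaphraseModel; the sample text and parameter values are illustrative only.

from paraphraser import get_key_sentences, ParaphraseModel

sample = ("Transformer models process text with self-attention. "
          "They are widely used for summarization and paraphrasing. "
          "Distilled variants trade some accuracy for speed.")

# Maps sentence index -> sentence for sentences that contain an extracted key phrase.
key_sentences = get_key_sentences(sample, top_n=3, diversity=('mmr', 0.6)) or {}
ordered = [key_sentences[i] for i in sorted(key_sentences)]

if ordered:
    paraphraser = ParaphraseModel()
    print(paraphraser(ordered, top_k=168, top_p=0.95, num_sequences=1))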
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ numpy
+ torch
+ transformers
+ sentence-transformers
+ scikit-learn
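
requirements.txt lists only the libraries that paraphraser.py imports (numpy, torch, transformers, sentence-transformers, scikit-learn); streamlit is not pinned here, presumably because the hosting runtime provides it. For a local run, one would additionally install streamlit and launch the app with streamlit run app.py.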