m. polinsky committed · Commit d828c62 · Parent: ff1ba85

Added refactored app code

Files changed: streamlit_app.py (+185 −516)

streamlit_app.py CHANGED
@@ -1,563 +1,232 @@
-import streamlit as st
-from transformers import pipeline, AutoModel, AutoTokenizer
-import time
-from time import time as t
-from gazpacho import Soup, get
-import tokenizers
-import json
 import requests
-
-# FUNCTIONS #
-#############
-ex = []
-
-# Query the HuggingFace Inference engine.
-def query(payload):
-    data = json.dumps(payload)
-    response = requests.request("POST", API_URL, headers=headers, data=data)
-    return json.loads(response.content.decode("utf-8"))
-
 def ner_query(payload):
     data = json.dumps(payload)
     response = requests.request("POST", NER_API_URL, headers=headers, data=data)
     return json.loads(response.content.decode("utf-8"))
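Both helpers follow the same Hugging Face Inference API pattern: serialize the payload to JSON, POST it to the model endpoint with the bearer-token headers, and decode the JSON body of the response. A minimal usage sketch of the summarization helper (the input text is hypothetical; the [{"summary_text": ...}] response shape is the one create_summaries indexes into below):

# Sketch only: assumes API_URL and headers are configured as in the SETUP section.
payload = {"inputs": "Officials announced the new policy on Tuesday...", "parameters": {"do_sample": False}}
result = query(payload)
# A successful summarization call returns a list of dicts, e.g.:
# [{"summary_text": "Officials announced a new policy..."}]
print(result[0]["summary_text"])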
-
-# gets links and identifies if they're cnn or npr
-def get_articles(user_choices, cnn_dict, npr_dict):
-    clustLinks = []
-    heds = {}
-
-    for each in user_choices:
-        for beach in clusters[each.lower()]:
-            if beach not in heds:
-                heds[beach] = 1
-            else:
-                heds[beach] += 1
-
-    # Convert keys (headlines) to list then sort in descending order of prevalence
-    sorted_heds = list(heds.keys())
-    sorted_heds.sort(key=lambda b: heds[b], reverse=True)
-
-    for each in sorted_heds:
-        try:
-            # look up the headline in cnn
-            clustLinks.append(('cnn',cnn_dict[each]))
-        # if exception KeyError then lookup in npr
-        except KeyError:
-            clustLinks.append(('npr',npr_dict[each]))
-    return clustLinks
-
-# gets articles from source via scraping
-def retrieve(input_reslist):
-    cnn = 'https://lite.cnn.com'
-    npr = 'https://text.npr.org'
-    articles = []
-
-    # Scrapes from npr or cnn. Should modularize this and use a dict as a switch-case
-    for each in input_reslist:
-        if each[0] == 'npr':
-            container = Soup(get(npr+each[1])).find('div', {'class': "paragraphs-container"}).find('p')
-            articles.append(container)
-        if each[0] == 'cnn':
-            container = Soup(get(cnn+each[1])).find('div', {'class': 'afe4286c'})
-            # Extract all text from paragraph tags, each extracted from container
-            #story = '\n'.join([x.text for x in container.find('p') if x.text != ''])
-            story = container.find('p')
-            articles.append(story[4:])
-        time.sleep(1)
-    return articles
-
-# Returns a list of articles
-# Takes list of articles and assigns each article's text to an int for some reason....
-#
-## *** Dictionary might shuffle articles?
-#
-def art_prep(retrieved):
-    a = []
-    for each in retrieved:
-        if type(each) is not list:
-            a.append(each.strip())
-        else:
-            a.append(''.join([art.strip() for art in each]))
-    return a
-
-# User choices is the list of user-chosen entities.
-def seek_and_sum(user_choices, cnn_dict, npr_dict):
-    # If no topics are selected return nothing
-    if len(user_choices) == 0:
-        return []
-    digs = []
-    prepped = art_prep(retrieve(get_articles(user_choices, cnn_dict, npr_dict)))
-    # Final is the output...the digest.
-    for piece in prepped:
-        digs.append(create_summaries(piece, 'sshleifer/distilbart-cnn-12-6'))
-    # Opportunity for processing here
-    return digs
-
-# Chunks
-def chunk_piece(piece, limit):
-    words = len(piece.split(' ')) # rough estimate of words. # words <= number tokens generally.
-    perchunk = words//limit
-    base_range = [i*limit for i in range(perchunk+1)]
-    range_list = [i for i in zip(base_range,base_range[1:])]
-    #range_list.append((range_list[-1][1],words)) try leaving off the end (or pad it?)
-    chunked_pieces = [' '.join(piece.split(' ')[i:j]).replace('\n','').replace('.','. ') for i,j in range_list]
-    return chunked_pieces
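chunk_piece splits a long article into word-count windows of size limit; because of the integer division, any remainder past the last full window is dropped, which is what the commented-out range_list.append line was wrestling with. A small worked sketch under that assumption:

# Sketch only: a 900-word input with limit=400.
# words = 900, perchunk = 900 // 400 = 2
# base_range = [0, 400, 800]
# range_list = [(0, 400), (400, 800)]
# -> two chunks of 400 words each; the final 100 words are left off.
chunks = chunk_piece(' '.join(['word'] * 900), 400)
print(len(chunks))  # 2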
-
-# Summarizes
-def create_summaries(piece, chkpnt, lim=400):
-    tokenizer = AutoTokenizer.from_pretrained(chkpnt)
-    limit = lim
-    count = -1
-    summary = []
-    words = len(piece.split(' '))
-    if words >= limit:
-        # chunk the piece
-        #print(f'Chunking....')
-        proceed = False
-        while not proceed:
-            try:
-                chunked_pieces = chunk_piece(piece, limit)
-                for chunk in chunked_pieces:
-                    token_length = len(tokenizer(chunk))
-
-                    # Perform summarization
-                    if token_length <= 512:
-                        data = query({ "inputs": str(chunk), "parameters": {"do_sample": False} }) # The way I'm passing the chunk could be the problem? In a loop by ref?
-                        summary.append(data[0]['summary_text'])
-                        proceed = True
-                    else:
-                        proceed = False
-                        limit -= 2 # Try to back off as little as possible.
-                        summary = [] # empty summary; we're starting again.
-            except IndexError: # Caused when 400 words get tokenized to > 512 tokens. Rare.
-                proceed = False
-                # lower the limit
-                limit -= 2 # Try to back off as little as possible.
-                summary = [] # empty summary; we're starting again.
-        days_summary = ' '.join(summary) # Concatenate partial summaries
-    else:
-        #print(f'Summarizing whole piece')
-        proceed = False
-        while not proceed:
-            try:
-                # Perform summarization
-                data = query({ "inputs": str(piece), "parameters": {"do_sample": False} })
-                days_summary = data[0]['summary_text']
-                proceed = True
-            except IndexError:
-                proceed = False
-                piece = piece[:-4]
-                days_summary = ''
-    return days_summary
-
-# This function creates a nice output from the dictionary the NER pipeline returns.
-# It works for grouped_entities = True or False.
-def ner_results(ner_object, indent=False, groups=True, NER_THRESHOLD=0.5):
     # empty lists to collect our entities
     people, places, orgs, misc = [], [], [], []
     # 'ent' and 'designation' handle the difference between dictionary keys
     # for aggregation strategy grouped vs ungrouped
     ent = 'entity' if not groups else 'entity_group'
     designation = 'I-' if not groups else ''
     # Define actions -- this is a switch-case dictionary.
     actions = {designation+'PER':people.append,
-               designation+'LOC':places.append,
-               designation+'ORG':orgs.append,
-               designation+'MISC':misc.append
-              }
-    #
     readable = [ actions[d[ent]](d['word']) for d in ner_object if '#' not in d['word'] and d['score'] > NER_THRESHOLD ]
     # create list of all entities to return
     ner_list = [i for i in set(people) if len(i) > 2] + [i for i in set(places) if len(i) > 2] + [i for i in set(orgs) if len(i) > 2] + [i for i in set(misc) if len(i) > 2]
     return ner_list
-
-def create_ner_dicts(state=True):
-    # Changing this will run the method again, refreshing the topics
-    status = state
-
-    url1 = 'https://lite.cnn.com/en'
-    soup_cnn = Soup(get(url1))
-    # extract each headline from the div containing the links.
-    cnn_text = [i.text for i in soup_cnn.find('div', {'class': 'afe4286c'}).find('a')]
-    cnn_links = [i.attrs['href'] for i in soup_cnn.find('div', {'class': 'afe4286c'}).find('a')]
-    cnn = [i for i in cnn_text if 'Analysis:' not in i and 'Opinion:' not in i]
-
-    # Get current links...in the future you'll have to check for overlaps.
-    url2 = 'https://text.npr.org/1001'
-    soup = Soup(get(url2))
-    # extract each headline
-    npr_text = [i.text for i in soup.find('div', {'class': 'topic-container'}).find('ul').find('a')]
-    npr_links = [i.attrs['href'] for i in soup.find('div', {'class': 'topic-container'}).find('ul').find('a')]
-    npr = [i for i in npr_text if 'Opinion:' not in i]
-
-    cnn_dict = {k[0]:k[1] for k in zip(cnn_text,cnn_links)}
-    npr_dict = {k[0]:k[1] for k in zip(npr_text,npr_links)}
-
-    # START Perform NER
-    cnn_ner = {x:ner_results(ner_query(x)) for x in cnn}
-    npr_ner = {x:ner_results(ner_query(x)) for x in npr}
-
-    return cnn_dict, npr_dict, cnn_ner, npr_ner
-
-## A function to change a state variable in create_dicts() above
-## that then runs it and creates updated clusters.
-def get_news_topics(cnn_ner, npr_ner):
-    ## END Perform NER
-
-    # Select from articles.
-    ## Select from articles that are clusterable only. (Entities were recognized.)
-    cnn_final = {x:npr_ner[x] for x in npr_ner.keys() if len(npr_ner[x]) != 0}
-    npr_final = {y:cnn_ner[y] for y in cnn_ner.keys() if len(cnn_ner[y]) != 0}
-
-    # What's in the news?
-    # Get entities named in the pool of articles we're drawing from
-    e_list = []
-    for i in [i for i in cnn_final.values()]:
-        for j in i:
-            e_list.append(j)
-    for k in [k for k in npr_final.values()]:
-        for j in k:
-            e_list.append(j)
-
-    # This is a dictionary with keys: the list items....
-    clusters = {k.lower():[] for k in e_list}
-
-    ## Perform Clustering
-    for hed in cnn_final.keys():
-        for item in cnn_final[hed]:
-            clusters[item.lower()].append(hed) # placing the headline in the list corresponding to the dictionary key for each entity.
-    for hed in npr_final.keys():
-        for item in npr_final[hed]:
-            clusters[item.lower()].append(hed)
-
-    return clusters
-
-def update_topics():
-    st.legacy_caching.clear_cache()
-    dicts = [i for i in create_ner_dicts()]
-    clusters = get_news_topics(cnn_ner, npr_ner)
-    return clusters, dicts
-
-#############
-# SETUP #
-#############
-# Auth for HF Inference API and URL to the model we're using -- distilbart-cnn-12-6
-headers = {"Authorization": f"""Bearer {st.secrets["ato"]}"""}
-API_URL = "https://api-inference.huggingface.co/models/sshleifer/distilbart-cnn-12-6"
 NER_API_URL = "https://api-inference.huggingface.co/models/dbmdz/bert-large-cased-finetuned-conll03-english"
-
 selections = []
-choices
-for i in list(clusters.keys()):
-    choices.append(i)
-# button to refresh topics
-if st.button("Refresh topics!"):
-    new_data = update_topics()
-    clusters = new_data[0]
-    cnn_dict, npr_dict, cnn_ner, npr_ner = new_data[1][0], new_data[1][1], new_data[1][2], new_data[1][3]
-
 # Form used to take 3 menu inputs
 with st.form(key='columns_in_form'):
     cols = st.columns(3)
     for i, col in enumerate(cols):
         selections.append(col.selectbox(f'Make a Selection', choices, key=i))
     submitted = st.form_submit_button('Submit')
 if submitted:
     selections = [i for i in selections if i is not None]
-
 if len(digest) == 0:
     st.write("You didn't select a topic!")
 else:
     st.write("Your digest is ready:\n")
-    count = 0
-    for each in digest:
-        count += 1
-        st.write(each)
 import requests
+import json
+from typing import List, Set
+from collections import namedtuple
+from functools import lru_cache
+from datetime import datetime as dt
+import os, os.path
+
+from codetiming import Timer
+import streamlit as st
+
+# local code
+from digestor import Digestor
+from source import Source
+from scrape_sources import NPRLite, CNNText, stub
+import random
+
+# EDIT: before doing NER check time of last scrape and just read in from JSON store instead of rescraping
+# can force rescrape
+# This may take a config to get sources as input
+
+def initialize(limit, rando, use_cache=True):
+    clusters: dict[str:List[namedtuple]] = dict()
+    # This is a container for the source classes.
+    # Make sure you handle this. What's the deal?
+    sources: List[Source] = []  # Write them and import? Read a config?
+    # FOR NOW ONLY add this explicitly here.
+    # MUST read in final version though.
+    sources.append(NPRLite(
+        'npr',
+        'https://text.npr.org/1001',
+        'sshleifer/distilbart-cnn-12-6',
+        'dbmdz/bert-large-cased-finetuned-conll03-english'
+    ))
+    sources.append(CNNText(
+        'cnn',
+        'https://lite.cnn.com',
+        'sshleifer/distilbart-cnn-12-6',
+        'dbmdz/bert-large-cased-finetuned-conll03-english'
+    ))
+
+    # initialize list to hold cluster data namedtuples
+    cluster_data: List[namedtuple('article', ['link','hed','entities', 'source'])]
+    article_dict: dict[str:namedtuple]
+
+    # For all sources retrieve_cluster_data
+    # returns List[namedtuples] with empty entity lists
+    # TEST THIS ALL V V V
+    cluster_data = []
+    article_meta = namedtuple('article_meta', ['source', 'count'])
+    cluster_meta: List[article_meta] = []
+    print("Calling data source retrieve cluster data....")
+    for data_source in sources:
+        if limit is not None:
+            c_data, c_meta = data_source.retrieve_cluster_data(limit//len(sources))
+        else:
+            c_data, c_meta = data_source.retrieve_cluster_data()
+        cluster_data.append(c_data)
+        cluster_meta.append(article_meta(data_source.source_name, c_meta))
+    print("Finished...moving on to clustering...")
+    cluster_data = cluster_data[0] + cluster_data[1]
+    # NER
+    # iterate the list of namedtuples,
+    for tup in cluster_data:
+        # pass each hed to the api query method, return the dict
+        # through the ner_results function to the 'entities' list.
+        # Populate stub entities list
+        perform_ner(tup, cache=use_cache)
+        generate_clusters(clusters, tup)
+    st.write(f"""Total number of clusters: {len(clusters)}""")
+
+    # Article stubs tracks all stubs.
+    # If a cluster is unsummarized, its hed's value is the namedtuple stub.
+    # Else reference the digestor instance so the summary can be found.
+    article_dict = {stub.hed: stub for stub in cluster_data}
+
+    return article_dict, clusters
+
+
+# Am I going to use this for those two lines?
+def perform_ner(tup: namedtuple('article', ['link','hed','entities', 'source']), cache=True):
+    with Timer(name="ner_query_time", logger=None):
+        result = ner_results(ner_query(
+            {
+                "inputs": tup.hed,
+                "parameters":
+                {
+                    "use_cache": cache,
+                },
+            }
+        ))
+    for i in result:
+        tup.entities.append(i)
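perform_ner wraps each headline query in a named codetiming Timer with logger=None, so nothing is printed but the elapsed time is still recorded. A small sketch of reading the accumulated total back (assuming codetiming's class-level Timer.timers registry of named timers):

# Sketch only: assumes codetiming accumulates named timers in Timer.timers.
from codetiming import Timer

with Timer(name="ner_query_time", logger=None):  # logger=None suppresses the printout
    sum(range(1_000_000))  # stand-in for the NER API call

print(Timer.timers["ner_query_time"])  # total seconds recorded under this name so far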
+
+
 def ner_query(payload):
+    print("making a query....")
     data = json.dumps(payload)
     response = requests.request("POST", NER_API_URL, headers=headers, data=data)
     return json.loads(response.content.decode("utf-8"))
+
+
+def generate_clusters(
+    the_dict: dict,
+    tup: namedtuple('article_stub', ['link','hed','entities', 'source'])
+) -> dict:
+    for entity in tup.entities:
+        # Add cluster if entity not already in dict
+        if entity not in the_dict:
+            the_dict[entity] = []
+        # Add this article's link to the cluster dict
+        the_dict[entity].append(tup)
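generate_clusters builds an inverted index from entity to the article stubs that mention it. A minimal sketch with a hand-made stub (field names taken from the namedtuple signatures above; the values are invented):

# Sketch only: illustrative stub, not real scraped data.
from collections import namedtuple

article_stub = namedtuple('article_stub', ['link', 'hed', 'entities', 'source'])
clusters = {}

s = article_stub('/2021/10/01/example', 'Example headline about NASA', ['NASA'], 'npr')
generate_clusters(clusters, s)

print(clusters)  # {'NASA': [article_stub(link='/2021/10/01/example', ...)]}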
+
+
+def ner_results(ner_object, groups=True, NER_THRESHOLD=0.5) -> List[str]:
     # empty lists to collect our entities
     people, places, orgs, misc = [], [], [], []
     # 'ent' and 'designation' handle the difference between dictionary keys
     # for aggregation strategy grouped vs ungrouped
     ent = 'entity' if not groups else 'entity_group'
     designation = 'I-' if not groups else ''
     # Define actions -- this is a switch-case dictionary.
+    # keys are the identifiers used in the return dict from
+    # the ner_query.
+    # values are list.append() for each of the lists
+    # created at the top of the function. They hold sorted entities.
+    # actions is used to pass entities into the lists.
+    # Why I called it actions I have no idea; rename it.
     actions = {designation+'PER':people.append,
+               designation+'LOC':places.append,
+               designation+'ORG':orgs.append,
+               designation+'MISC':misc.append
+              } # Is this an antipattern?
+
+    # For each dictionary in the ner result list, if the entity str doesn't contain a '#'
+    # and the confidence is above NER_THRESHOLD, add the entity to the list for its type.
+
+    # actions[d[ent]](d['word']) accesses the key of actions that is returned
+    # from d[ent] and then passes the entity name, returned by d['word'], to
+    # the 'list.append' waiting to be called in the dict actions.
+    # Note the (). We access actions to call its append...
     readable = [ actions[d[ent]](d['word']) for d in ner_object if '#' not in d['word'] and d['score'] > NER_THRESHOLD ]
     # create list of all entities to return
     ner_list = [i for i in set(people) if len(i) > 2] + [i for i in set(places) if len(i) > 2] + [i for i in set(orgs) if len(i) > 2] + [i for i in set(misc) if len(i) > 2]
     return ner_list
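The actions dictionary is the "switch-case" the comments describe: the entity-type string coming back from the NER endpoint selects which list's append gets called, and the entity's word is the argument. A sketch with one hand-written result dict (the keys mirror the 'entity_group', 'word', and 'score' fields the code reads; the values are invented):

# Sketch only: one grouped NER result shaped like the endpoint's output.
d = {"entity_group": "PER", "word": "Angela Merkel", "score": 0.998}

people = []
actions = {"PER": people.append}          # groups=True, so designation == ''
actions[d["entity_group"]](d["word"])     # dispatch: appends the word to people

print(people)  # ['Angela Merkel']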
+
+# These could be passed through the command line
+# or read from a config file.
+# One of these is needed here for NER and one in Digestor for summarization.
 NER_API_URL = "https://api-inference.huggingface.co/models/dbmdz/bert-large-cased-finetuned-conll03-english"
+headers = {"Authorization": f"""Bearer {st.secrets['ato']}"""}
+
+LIMIT = None  # Controls time and number of clusters.
+USE_CACHE = True
+
+if not USE_CACHE:
+    print("NOT USING CACHE--ARE YOU GATHERING DATA?")
+if LIMIT is not None:
+    print(f"LIMIT: {LIMIT}")
+
+# digest store
+digests = dict()  # key is cluster, value is digestor object
+out_dicts = []
+# list to accept user choices
+# retrieve cluster data and create dict to track each article (articleStubs)
+# and create topic clusters by performing ner.
+print("Initializing....")
+article_dict, clusters = initialize(LIMIT, USE_CACHE)
+# We now have clusters and cluster data. Redundancy.
+# We call a display function and get the user input.
+# For this it's still streamlit.
+
 selections = []
+choices = list(clusters.keys())
+choices.insert(0, 'None')
+
 # Form used to take 3 menu inputs
 with st.form(key='columns_in_form'):
     cols = st.columns(3)
     for i, col in enumerate(cols):
         selections.append(col.selectbox(f'Make a Selection', choices, key=i))
     submitted = st.form_submit_button('Submit')
 if submitted:
     selections = [i for i in selections if i is not None]
+    with st.spinner(text="Digesting...please wait, this will take a few moments...Maybe check some messages or start reading the latest papers on summarization with transformers...."):
+        found = False
+        # Check if we already have this digest.
+        for i in digests:
+            if set(list(answers.values())) == set(list(i)):
+                digestor = digests[i]
+                found = True
+                break
+
+        # If we need a new digest
+        if not found:
+            chosen = []
+            # Why not just use answers.values()?
+            for i in selections:  # i is supposed to be a list of stubs, mostly one
+                if i != 'None':
+                    for j in clusters[i]:
+                        if j not in chosen:
+                            chosen.append(j)  # j is supposed to be a stub.
+
+            # Article dict contains stubs for unprocessed articles and lists of summarized chunks for processed ones.
+            # Here we put together a list of article stubs and/or summary chunks and let the digestor sort out what it does with them.
+            chosen = [i if isinstance(article_dict[i.hed], stub) else article_dict[i.hed] for i in chosen]
+            # Digestor uses 'chosen', passed through 'stubs', to create the digest.
+            # 'user_choices' is passed for reference.
+            # Passing list(answers.values()) includes 'None' choices.
+            digestor = Digestor(timer=Timer(), cache=USE_CACHE, stubs=chosen, user_choices=list(selections))
+            # happens internally but may be used differently, so it isn't automatic upon digestor creation.
+            # Easily turn caching off for testing.
+            digestor.digest()  # creates summaries and stores them associated with the digest
+
+        # Get displayable digest and digest data
+        digestor.build_digest()  # only returns for data collection
+
+        digest = digestor.text
 if len(digest) == 0:
     st.write("You didn't select a topic!")
 else:
     st.write("Your digest is ready:\n")
+
+    st.write(digest)