m. polinsky committed on
Commit
d828c62
1 Parent(s): ff1ba85

Added refactored app code

Files changed (1)
  1. streamlit_app.py +185 -516
streamlit_app.py CHANGED
@@ -1,563 +1,232 @@
- import streamlit as st
- from transformers import pipeline, AutoModel, AutoTokenizer
- import time
- from time import time as t
- from gazpacho import Soup, get
- import tokenizers
- import json
  import requests

- #############
- # FUNCTIONS #
- #############
- ex = []

- # Query the HuggingFace Inference engine.
- def query(payload):
-     data = json.dumps(payload)
-     response = requests.request("POST", API_URL, headers=headers, data=data)
-     return json.loads(response.content.decode("utf-8"))

  def ner_query(payload):
      data = json.dumps(payload)
      response = requests.request("POST", NER_API_URL, headers=headers, data=data)
      return json.loads(response.content.decode("utf-8"))

- # gets links and identifies if they're cnn or npr
- def get_articles(user_choices, cnn_dict, npr_dict):
-     clustLinks = []
-     heds = {}
-     # Get all headlines from each cluster -- add to dict and record number of clusters of interest the headline appeared in.
-     for each in user_choices:
-         for beach in clusters[each.lower()]:
-             if beach not in heds:
-                 heds[beach] = 1
-             else:
-                 heds[beach] += 1
-     # Convert keys (headlines) to list then sort in descending order of prevalence
-     sorted_heds = list(heds.keys())
-     sorted_heds.sort(key=lambda b: heds[b], reverse=True)
-     for each in sorted_heds:
-         try:
-             # look up the headline in cnn
-             clustLinks.append(('cnn',cnn_dict[each]))
-         # if exception KeyError then lookup in npr
-         except KeyError:
-             clustLinks.append(('npr',npr_dict[each]))
-     return clustLinks

- # gets articles from source via scraping
- def retrieve(input_reslist):
-     cnn = 'https://lite.cnn.com'
-     npr = 'https://text.npr.org'
-     articles = []
-     # Scrapes from npr or cnn. Should modularize this and use a dict as a switch-case
-     for each in input_reslist:
-         if each[0] == 'npr':
-             container = Soup(get(npr+each[1])).find('div', {'class': "paragraphs-container"}).find('p')
-             articles.append(container)
-         if each[0] == 'cnn':
-             container = Soup(get(cnn+each[1])).find('div', {'class': 'afe4286c'})
-             # Extract all text from paragraph tags, each extracted from container
-             #story = '\n'.join([x.text for x in container.find('p') if x.text != ''])
-             story = container.find('p')
-             articles.append(story[4:])
-         time.sleep(1)
-     return articles
- # Returns a list of articles

- # Takes list of articles and assigns each articles' text to an int for some reason....
- #
- ## *** Dictionary might shuffle articles?
- #
- def art_prep(retrieved):
-     a = []
-     for each in retrieved:
-         if type(each) is not list:
-             a.append(each.strip())
-         else:
-             a.append(''.join([art.strip() for art in each]))
-     return a

- # User choices is the list of user-chosen entities.
- def seek_and_sum(user_choices, cnn_dict, npr_dict):
-     # If no topics are selected return nothing
-     if len(user_choices) == 0:
-         return []
-     digs = []
-     prepped=art_prep(retrieve(get_articles(user_choices, cnn_dict, npr_dict)))
-     # Final is the output...the digest.
-     for piece in prepped:
-         digs.append(create_summaries(piece, 'sshleifer/distilbart-cnn-12-6'))
-     # Opportunity for processing here
-     return digs

- # Chunks
- def chunk_piece(piece, limit):
-     words = len(piece.split(' ')) # rough estimate of words. # words <= number tokens generally.
-     perchunk = words//limit
-     base_range = [i*limit for i in range(perchunk+1)]
-     range_list = [i for i in zip(base_range,base_range[1:])]
-     #range_list.append((range_list[-1][1],words)) try leaving off the end (or pad it?)
-     chunked_pieces = [' '.join(piece.split(' ')[i:j]).replace('\n','').replace('.','. ') for i,j in range_list]
-     return chunked_pieces

- # Summarizes
- def create_summaries(piece, chkpnt, lim=400):
-     tokenizer = AutoTokenizer.from_pretrained(chkpnt)
-     limit = lim
-     count = -1
-     summary = []
-     words = len(piece.split(' '))
-     if words >= limit:
-         # chunk the piece
-         #print(f'Chunking....')
-         proceed = False
-         while not proceed:
-             try:
-                 chunked_pieces = chunk_piece(piece, limit)
-                 for chunk in chunked_pieces:
-                     token_length = len(tokenizer(chunk))
-                     # Perform summarization
-                     if token_length <= 512:
-                         data = query({ "inputs": str(chunk), "parameters": {"do_sample": False} }) # The way I'm passing the chunk could be the problem? In a loop by ref?
-                         summary.append(data[0]['summary_text'])
-                         proceed = True
-                     else:
-                         proceed = False
-                         limit -= 2 # Try to back off as little as possible.
-                         summary = [] # empty summary we're starting again.
-             except IndexError: # Caused when 400 words get tokenized to > 512 tokens. Rare.
-                 proceed = False
-                 # lower the limit
-                 limit -= 2 # Try to back off as little as possible.
-                 summary = [] # empty summary we're starting again.
-         days_summary = ' '.join(summary) # Concatenate partial summaries
-     else:
-         #print(f'Summarizing whole piece')
-         proceed = False
-         while not proceed:
-             try:
-                 # Perform summarization
-                 data = query({ "inputs": str(piece), "parameters": {"do_sample": False} })
-                 days_summary = data[0]['summary_text']
-                 proceed= True
-             except IndexError:
-                 proceed = False
-                 piece = piece[:-4]
-                 days_summary = ''
-     return days_summary

- # This function creates a nice output from the dictionary the NER pipeline returns.
- # It works for grouped_entities = True or False.
- def ner_results(ner_object, indent=False, groups=True, NER_THRESHOLD=0.5):
      # empty lists to collect our entities
      people, places, orgs, misc = [], [], [], []

      # 'ent' and 'designation' handle the difference between dictionary keys
      # for aggregation strategy grouped vs ungrouped
      ent = 'entity' if not groups else 'entity_group'
      designation = 'I-' if not groups else ''

      # Define actions -- this is a switch-case dictionary.
      actions = {designation+'PER':people.append,
-                designation+'LOC':places.append,
-                designation+'ORG':orgs.append,
-                designation+'MISC':misc.append}
-     # For each dictionary in the ner result list, if it doesn't contain a '#'
-     # and the confidence is > 90%, add the entity name to the list for its type.
      readable = [ actions[d[ent]](d['word']) for d in ner_object if '#' not in d['word'] and d['score'] > NER_THRESHOLD ]

      # create list of all entities to return
      ner_list = [i for i in set(people) if len(i) > 2] + [i for i in set(places) if len(i) > 2] + [i for i in set(orgs) if len(i) > 2] + [i for i in set(misc) if len(i) > 2]

      return ner_list

- @st.cache(hash_funcs={tokenizers.Tokenizer: id})
- def create_ner_dicts(state=True):
-     # Changing this will run the method again, refreshing the topics
-     status = state
-     url1 = 'https://lite.cnn.com/en'
-     soup_cnn = Soup(get(url1))
-     # extract each headline from the div containing the links.
-     cnn_text = [i.text for i in soup_cnn.find('div', {'class': 'afe4286c'}).find('a')]
-     cnn_links = [i.attrs['href'] for i in soup_cnn.find('div', {'class': 'afe4286c'}).find('a')]
-     cnn = [i for i in cnn_text if 'Analysis:' not in i and 'Opinion:' not in i]
-     # Get current links...in the future you'll have to check for overlaps.
-     url2 = 'https://text.npr.org/1001'
-     soup = Soup(get(url2))
-     # extract each headline
-     npr_text = [i.text for i in soup.find('div', {'class': 'topic-container'}).find('ul').find('a')]
-     npr_links = [i.attrs['href'] for i in soup.find('div', {'class': 'topic-container'}).find('ul').find('a')]
-     npr = [i for i in npr_text if 'Opinion:' not in i]
-     cnn_dict = {k[0]:k[1] for k in zip(cnn_text,cnn_links)}
-     npr_dict = {k[0]:k[1] for k in zip(npr_text,npr_links)}
-     # START Perform NER
-     cnn_ner = {x:ner_results(ner_query(x)) for x in cnn}
-     npr_ner = {x:ner_results(ner_query(x)) for x in npr}
-     return cnn_dict, npr_dict, cnn_ner, npr_ner

- ## A function to change a state variable in create_dicts() above
- ## that then runs it and creates updated clusters.
- def get_news_topics(cnn_ner, npr_ner):
-     ## END Perform NER
-     # Select from articles.
-     ## Select from articles that are clusterable only. (Entities were recognized.)
-     cnn_final = {x:npr_ner[x] for x in npr_ner.keys() if len(npr_ner[x]) != 0}
-     npr_final = {y:cnn_ner[y] for y in cnn_ner.keys() if len(cnn_ner[y]) != 0 }
-     # What's in the news?
-     # Get entities named in the pool of articles we're drawing from
-     e_list = []
-     for i in [i for i in cnn_final.values()]:
-         for j in i:
-             e_list.append(j)
-     for k in [k for k in npr_final.values()]:
-         for j in k:
-             e_list.append(j)
-     # This is a dictionary with keys: the list items....
-     clusters = {k.lower():[] for k in e_list}
-     ## Perform Clustering
-     for hed in cnn_final.keys():
-         for item in cnn_final[hed]:
-             clusters[item.lower()].append(hed) # placing the headline in the list corresponding to the dictionary key for each entity.
-     for hed in npr_final.keys():
-         for item in npr_final[hed]:
-             clusters[item.lower()].append(hed)
-     return clusters

- def update_topics():
-     st.legacy_caching.clear_cache()
-     dicts = [i for i in create_ner_dicts()]
-     clusters = get_news_topics(cnn_ner, npr_ner)
-     return clusters, dicts

- #############
- # SETUP #
- #############
- # Auth for HF Inference API and URL to the model we're using -- distilbart-cnn-12-6
- headers = {"Authorization": f"""Bearer {st.secrets["ato"]}"""}
- API_URL = "https://api-inference.huggingface.co/models/sshleifer/distilbart-cnn-12-6"
  NER_API_URL = "https://api-inference.huggingface.co/models/dbmdz/bert-large-cased-finetuned-conll03-english"

- #############
- #PROCESSING #
- #############
- st.write(f"""**Welcome!**\nThis app lets you generate digests of topics currently in the news. Select up to three current news topics and the digest lets you know what the latest news on those topics is!""") # Can I make this disappear?
- cnn_dict, npr_dict, cnn_ner, npr_ner = create_ner_dicts()
- clusters = get_news_topics(cnn_ner, npr_ner)

  selections = []
- choices = [None]
- for i in list(clusters.keys()):
-     choices.append(i)

- # button to refresh topics
- if st.button("Refresh topics!"):
-     new_data = update_topics()
-     clusters = new_data[0]
-     cnn_dict, npr_dict, cnn_ner, npr_ner = new_data[1][0], new_data[1][1], new_data[1][2], new_data[1][3]

  # Form used to take 3 menu inputs
  with st.form(key='columns_in_form'):
      cols = st.columns(3)
      for i, col in enumerate(cols):
          selections.append(col.selectbox(f'Make a Selection', choices, key=i))
      submitted = st.form_submit_button('Submit')
      if submitted:
          selections = [i for i in selections if i is not None]
-         with st.spinner(text="Digesting...please wait, this may take up to 20 seconds..."):
-             digest = seek_and_sum(selections, cnn_dict, npr_dict)
          if len(digest) == 0:
              st.write("You didn't select a topic!")
          else:
              st.write("Your digest is ready:\n")
-             count = 0
-             for each in digest:
-                 count += 1
-                 st.write(each)
  import requests
+ import json
+ from typing import List, Set
+ from collections import namedtuple
+ from functools import lru_cache
+ from datetime import datetime as dt
+ import os, os.path

+ from codetiming import Timer
+ import streamlit as st

+ # local code
+ from digestor import Digestor
+ from source import Source
+ from scrape_sources import NPRLite, CNNText, stub
+ import random
+
+ # EDIT: before doing NER check time of last scrape and just read in from JSON store instead of rescraping
+ # can force rescrape
+ # This may take a config to get sources as input
+
+ def initialize(limit, rando, use_cache=True):
+     clusters: dict[str:List[namedtuple]] = dict()
+     # This is a container for the source classes.
+     # Make sure you handle this. Whats the deal.
+     sources:List[Source]= [] # Write them and import? Read a config?
+     # FOR NOW ONLY add this explicitly here.
+     # MUST read in final version though.
+     sources.append(NPRLite(
+         'npr',
+         'https://text.npr.org/1001',
+         'sshleifer/distilbart-cnn-12-6',
+         'dbmdz/bert-large-cased-finetuned-conll03-english'
+     ))
+     sources.append(CNNText(
+         'cnn',
+         'https://lite.cnn.com',
+         'sshleifer/distilbart-cnn-12-6',
+         'dbmdz/bert-large-cased-finetuned-conll03-english'
+     ))
+
+     # initialize list to hold cluster data namedtuples
+     cluster_data: List[namedtuple('article', ['link','hed','entities', 'source'])]
+     article_dict : dict[str:namedtuple]
+
+     # For all sources retrieve_cluster_data
+     # returns List[namedtuples] with empty entity lists
+     # TEST THIS ALL V V V
+     cluster_data = []
+     article_meta = namedtuple('article_meta',['source', 'count'])
+     cluster_meta : List[article_meta] = []
+     print("Calling data source retrieve cluster data....")
+     for data_source in sources:
+         if limit is not None:
+             c_data, c_meta = data_source.retrieve_cluster_data(limit//len(sources))
+         else:
+             c_data, c_meta = data_source.retrieve_cluster_data()
+         cluster_data.append(c_data)
+         cluster_meta.append(article_meta(data_source.source_name, c_meta))
+     print("Finished...moving on to clustering...")
+     cluster_data = cluster_data[0] + cluster_data[1]
+     # NER
+     # iterate the list of namedtuples,
+     for tup in cluster_data:
+         # pass each hed to the api query method, return the dict
+         # through the ner_results function to the 'entities' list.
+         # Populate stub entities list
+         perform_ner(tup, cache=use_cache)
+         generate_clusters(clusters, tup)
+     st.write(f"""Total number of clusters: {len(clusters)}""")
+
+     # Article stubs tracks all stubs
+     # If cluster is unsummarized, its hed's value is the namedtuple stub.
+     # Else reference digestor instance so summary can be found.
+     article_dict = {stub.hed: stub for stub in cluster_data}
+
+     return article_dict, clusters
+
+ # Am I going to use this for those two lines?
+ def perform_ner(tup:namedtuple('article',['link','hed','entities', 'source']), cache=True):
+     with Timer(name="ner_query_time", logger=None):
+         result = ner_results(ner_query(
+             {
+                 "inputs":tup.hed,
+                 "parameters":
+                 {
+                     "use_cache": cache,
+                 },
+             }
+         ))
+     for i in result:
+         tup.entities.append(i)

  def ner_query(payload):
+     print("making a query....")
      data = json.dumps(payload)
      response = requests.request("POST", NER_API_URL, headers=headers, data=data)
      return json.loads(response.content.decode("utf-8"))

+ def generate_clusters(
+         the_dict: dict,
+         tup : namedtuple('article_stub',[ 'link','hed','entities', 'source'])
+     ) -> dict:
+     for entity in tup.entities:
+         # Add cluster if entity not already in dict
+         if entity not in the_dict:
+             the_dict[entity] = []
+         # Add this article's link to the cluster dict
+         the_dict[entity].append(tup)
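+     # For illustration only: after two stubs that both mention 'Ukraine' pass through,
+     # the_dict would look roughly like {'Ukraine': [stub_a, stub_b], ...}
+     # (hypothetical entity and stub names, not real output).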

+ def ner_results(ner_object, groups=True, NER_THRESHOLD=0.5) -> List[str]:
      # empty lists to collect our entities
      people, places, orgs, misc = [], [], [], []

      # 'ent' and 'designation' handle the difference between dictionary keys
      # for aggregation strategy grouped vs ungrouped
      ent = 'entity' if not groups else 'entity_group'
      designation = 'I-' if not groups else ''

      # Define actions -- this is a switch-case dictionary.
+     # keys are the identifiers used in the return dict from
+     # the ner_query.
+     # values are list.append() for each of the lists
+     # created at the top of the function. They hold sorted entities.
+     # actions is used to pass entities into the lists.
+     # Why I called it actions I have no idea; rename it.
      actions = {designation+'PER':people.append,
+                designation+'LOC':places.append,
+                designation+'ORG':orgs.append,
+                designation+'MISC':misc.append
+                } # Is this an antipattern?
+
+     # For each dictionary in the ner result list, if the entity str doesn't contain a '#'
+     # and the confidence is above NER_THRESHOLD, add the entity to the list for its type.
+
+     # actions[d[ent]](d['word']) accesses the key of actions that is returned
+     # from d[ent] and then passes the entity name, returned by d['word'] to
+     # the 'list.append' waiting to be called in the dict actions.
+     # Note the (). We access actions to call its append...
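+     # For example, a grouped result item such as
+     # {'entity_group': 'PER', 'score': 0.99, 'word': 'Angela Merkel'}
+     # would call people.append('Angela Merkel') here (illustrative values, not real output).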
      readable = [ actions[d[ent]](d['word']) for d in ner_object if '#' not in d['word'] and d['score'] > NER_THRESHOLD ]

      # create list of all entities to return
      ner_list = [i for i in set(people) if len(i) > 2] + [i for i in set(places) if len(i) > 2] + [i for i in set(orgs) if len(i) > 2] + [i for i in set(misc) if len(i) > 2]

      return ner_list

+ # These could be passed through the command line
+ # or read from a config file.
+ # One of these is needed here for NER and one in Digestor for summarization.
  NER_API_URL = "https://api-inference.huggingface.co/models/dbmdz/bert-large-cased-finetuned-conll03-english"
+ headers = {"Authorization": f"""Bearer {st.secrets['ato']}"""}
+
+ LIMIT = None # Controls time and number of clusters.
+ USE_CACHE = True
+
+ if not USE_CACHE:
+     print("NOT USING CACHE--ARE YOU GATHERING DATA?")
+ if LIMIT is not None:
+     print(f"LIMIT: {LIMIT}")
+
+ # digest store
+ digests = dict() # key is cluster, value is digestor object
+ out_dicts = []
+ # list to accept user choices
+ # retrieve cluster data and create dict to track each article (articleStubs)
+ # and create topic clusters by performing ner.
+ print("Initializing....")
+ article_dict, clusters = initialize(LIMIT, USE_CACHE)
+ # We now have clusters and cluster data. Redundancy.
+ # We call a display function and get the user input.
+ # For this it's still streamlit.

  selections = []
+ choices = list(clusters.keys())
+ choices.insert(0,'None')

  # Form used to take 3 menu inputs
  with st.form(key='columns_in_form'):
      cols = st.columns(3)
      for i, col in enumerate(cols):
          selections.append(col.selectbox(f'Make a Selection', choices, key=i))
      submitted = st.form_submit_button('Submit')
      if submitted:
          selections = [i for i in selections if i is not None]
+         with st.spinner(text="Digesting...please wait, this will take a few moments...Maybe check some messages or start reading the latest papers on summarization with transformers...."):
+             found = False
+             # Check if we already have this digest.
+             for i in digests:
+                 if set(selections) == set(list(i)):
+                     digestor = digests[i]
+                     found = True
+                     break
+
+             # If we need a new digest
+             if not found:
+                 chosen = []
+                 # Why not just use answers.values()?
+                 for i in selections: # i is supposed to be a list of stubs, mostly one
+                     if i != 'None':
+                         for j in clusters[i]:
+                             if j not in chosen:
+                                 chosen.append(j) # j is supposed to be a stub.
+
+                 # Article dict contains stubs for unprocessed articles and lists of summarized chunks for processed ones.
+                 # Here we put together a list of article stubs and/or summary chunks and let the digestor sort out what it does with them.
+                 chosen = [i if isinstance(article_dict[i.hed], stub) else article_dict[i.hed] for i in chosen]
+                 # Digestor uses 'chosen', passed through 'stubs', to create the digest.
+                 # 'user_choices' is passed for reference.
+                 # Passing list(answers.values()) includes 'None' choices.
+                 digestor = Digestor(timer=Timer(), cache = USE_CACHE, stubs=chosen, user_choices=list(selections))
+                 # happens internally but may be used differently so it isn't automatic upon digestor creation.
+                 # Easily turn caching off for testing.
+                 digestor.digest() # creates summaries and stores them associated with the digest
+
+             # Get displayable digest and digest data
+             digestor.build_digest() # only returns for data collection
+
+             digest = digestor.text
          if len(digest) == 0:
              st.write("You didn't select a topic!")
          else:
              st.write("Your digest is ready:\n")
+
+             st.write(digest)