m. polinsky committed on
Commit
86e83a7
1 Parent(s): e14ef00

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +227 -0
app.py ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py manages the whole TopicDig process
2
+ from typing import List, Set
3
+ from collections import namedtuple
4
+ import random
5
+ import requests
6
+ import json
7
+
8
+ from codetiming import Timer
9
+ import streamlit as st
10
+
11
+ from digestor import Digestor
12
+ from source import Source
13
+ from scrape_sources import NPRLite, CNNText, stub
14
+
15
+
16
+
17
def initialize(limit, rando, use_cache=True):
    """Collect article stubs from every source and group them by named entity.

    Each configured source is asked for its cluster data, every returned stub
    is run through NER (``perform_ner``) and then filed under each of its
    entities (``generate_clusters``).

    Args:
        limit: Total number of articles to request, split evenly across the
            sources with integer division. ``None`` lets each source use its
            own default.
        rando: Currently unused by this function; kept so existing callers
            keep working.
        use_cache: Forwarded to ``perform_ner`` as its ``cache`` argument.

    Returns:
        A tuple ``(article_dict, clusters)`` where ``article_dict`` maps each
        stub's ``hed`` to the stub and ``clusters`` maps an entity string to
        the list of stubs that mention it.

    Side effects:
        Writes one status string per source into ``st.session_state`` (keyed
        by the source name) plus ``st.session_state['num_clusters']``, and
        prints a progress message.
    """
    clusters: dict = {}

    # This is a container for the source classes.
    # FOR NOW the sources are added explicitly here; the final version is
    # meant to read them from configuration instead.
    sources: List[Source] = [
        NPRLite(
            'npr',
            'https://text.npr.org/1001',
            'sshleifer/distilbart-cnn-12-6',
            'dbmdz/bert-large-cased-finetuned-conll03-english'
        ),
        CNNText(
            'cnn',
            'https://lite.cnn.com',
            'sshleifer/distilbart-cnn-12-6',
            'dbmdz/bert-large-cased-finetuned-conll03-english'
        ),
    ]

    # Flat list of every stub from every source. Extending one list per
    # source (instead of concatenating exactly two sub-lists by index) keeps
    # this correct for any number of sources.
    cluster_data = []
    for data_source in sources:
        if limit is not None:
            c_data, _ = data_source.retrieve_cluster_data(limit // len(sources))
        else:
            c_data, _ = data_source.retrieve_cluster_data()
        # The second value returned by retrieve_cluster_data (per-source
        # metadata) is not used anywhere yet.
        cluster_data.extend(c_data)
        st.session_state[data_source.source_name] = f"Number of clusters from source: {data_source.source_name}\n\t{len(c_data)}"
    print("Finished...moving on to clustering...")

    # NER: fill each stub's entity list from its headline, then register the
    # stub under every entity it mentions.
    for tup in cluster_data:
        perform_ner(tup, cache=use_cache)
        generate_clusters(clusters, tup)
    st.session_state['num_clusters'] = f"Total number of clusters: {len(clusters)}"

    # article_dict tracks all stubs, keyed by headline. Stubs that share a
    # headline collapse to the last one seen.
    article_dict = {article.hed: article for article in cluster_data}

    return article_dict, clusters
75
+
76
+
77
+ # Am I going to use this for those two lines?
78
def perform_ner(tup: namedtuple('article', ['link', 'hed', 'entities', 'source']), cache=True):
    """Run NER on a stub's headline and append the entities to the stub.

    Args:
        tup: Article stub; its ``hed`` is sent to the NER endpoint and its
            ``entities`` list is extended in place with the results.
        cache: Whether the inference API may serve a cached answer.

    Returns:
        None. The stub's ``entities`` list is mutated.
    """
    payload = {
        "inputs": tup.hed,
        # The inference API reads use_cache from the "options" object; the
        # previous misspelled "paramters" key was ignored by the server, so
        # the cache flag never took effect.
        "options": {
            "use_cache": cache,
        },
    }
    with Timer(name="ner_query_time", logger=None):
        result = ner_results(ner_query(payload))
    tup.entities.extend(result)
91
+
92
+
93
@st.cache()
def ner_query(payload):
    """POST ``payload`` as JSON to ``NER_API_URL`` and return the decoded reply.

    The request is sent with the module-level ``headers``; the response body
    is decoded as UTF-8 and parsed as JSON. The result is memoised by
    Streamlit's cache decorator.
    """
    print("making a query....")
    body = json.dumps(payload)
    reply = requests.request(method="POST", url=NER_API_URL, headers=headers, data=body)
    raw_text = reply.content.decode("utf-8")
    return json.loads(raw_text)
99
+
100
+
101
+
102
def generate_clusters(
    the_dict: dict,
    tup: namedtuple('article_stub', ['link', 'hed', 'entities', 'source'])
) -> dict:
    """File an article stub under every entity it mentions.

    Args:
        the_dict: Cluster mapping of entity -> list of stubs; updated in
            place.
        tup: Article stub whose ``entities`` are used as cluster keys.

    Returns:
        ``the_dict`` itself, so the call matches its ``-> dict`` annotation.
        Callers that ignore the return value and rely on the in-place update
        are unaffected.
    """
    for entity in tup.entities:
        # Create the cluster on first sight of the entity, then store the
        # whole stub (not just its link) in it.
        the_dict.setdefault(entity, []).append(tup)
    return the_dict
112
+
113
+
114
def ner_results(ner_object, groups=True, NER_THRESHOLD=0.5) -> List[str]:
    """Extract the distinct, confident entity names from an NER response.

    Args:
        ner_object: Decoded response of the NER query. Expected to be a list
            of dicts, each with a ``'word'``, a ``'score'`` and a label stored
            under ``'entity_group'`` (grouped) or ``'entity'`` (ungrouped).
        groups: ``True`` when the response uses grouped aggregation (labels
            ``PER``/``LOC``/``ORG``/``MISC``); ``False`` for ungrouped output
            (labels ``I-PER``/``I-LOC``/``I-ORG``/``I-MISC``).
        NER_THRESHOLD: An entity is kept only when its score is strictly
            greater than this value.

    Returns:
        Entity strings longer than two characters, de-duplicated within each
        category and ordered people, places, organisations, misc; within a
        category the order of first appearance is kept. An empty list is
        returned when ``ner_object`` is not a list.
    """
    # The endpoint is expected to return a list of entity dicts; anything
    # else (such as an error payload dict) carries no entities, and iterating
    # it would fail on the string keys.
    if not isinstance(ner_object, list):
        return []

    # The label key and label prefix differ between grouped and ungrouped
    # aggregation.
    label_key = 'entity_group' if groups else 'entity'
    prefix = '' if groups else 'I-'

    # One bucket per entity category; dict order fixes the output order.
    buckets = {prefix + kind: [] for kind in ('PER', 'LOC', 'ORG', 'MISC')}

    for item in ner_object:
        word = item['word']
        # Skip word-piece fragments (they contain '#') and low-confidence hits.
        if '#' in word or item['score'] <= NER_THRESHOLD:
            continue
        # Labels outside the four known categories are ignored rather than
        # raising KeyError.
        bucket = buckets.get(item[label_key])
        if bucket is not None:
            bucket.append(word)

    ner_list = []
    for bucket in buckets.values():
        # dict.fromkeys de-duplicates while keeping first-seen order, which
        # makes the result deterministic (set() iteration order is not).
        ner_list.extend(word for word in dict.fromkeys(bucket) if len(word) > 2)

    return ner_list
149
+
150
# Endpoint and credentials for the NER inference call.
# These could be passed through the command line or read from a config file.
# One of these is needed here for NER and one in Digestor for summarization.
NER_API_URL = "https://api-inference.huggingface.co/models/dbmdz/bert-large-cased-finetuned-conll03-english"
headers = {"Authorization": "Bearer {}".format(st.secrets['ato'])}

# Run-time knobs.
LIMIT = 20  # Controls time and number of clusters.
USE_CACHE = True

# Report the active settings on the console.
if not USE_CACHE:
    print("NOT USING CACHE--ARE YOU GATHERING DATA?")
if LIMIT is not None:
    print("LIMIT: {}".format(LIMIT))
163
+
164
# digest store
digests = dict()  # key is cluster, value is digestor object
out_dicts = []

# Retrieve cluster data, build the dict that tracks each article stub and
# create the topic clusters by performing NER.
print("Initializing....")
# use_cache must be passed by keyword: the second positional parameter of
# initialize() is `rando` (which initialize does not use), so passing
# USE_CACHE positionally never reached `use_cache`.
article_dict, clusters = initialize(LIMIT, False, use_cache=USE_CACHE)

# Button to refresh topics. A click makes Streamlit rerun this script from
# the top, which already re-executes the initialize() call above, so no
# second explicit call is made here.
st.button("Refresh topics!")

# list to accept user choices
selections = []
# 'None' is the first (default) entry of every menu.
choices = ['None'] + list(clusters)

st.write(st.session_state['cnn'])
st.write(st.session_state['npr'])
st.write(st.session_state['num_clusters'])

# Form used to take 3 menu inputs
# NOTE(review): the nesting of the `if submitted:` section relative to the
# form is reconstructed; confirm against the deployed layout.
with st.form(key='columns_in_form'):
    cols = st.columns(3)
    for i, col in enumerate(cols):
        selections.append(col.selectbox('Make a Selection', choices, key=i))
    submitted = st.form_submit_button('Submit')
    if submitted:
        selections = [i for i in selections if i is not None]
        with st.spinner(text="Digesting...please wait, this will take a few moments...Maybe check some messages or start reading the latest papers on summarization with transformers...."):
            # Gather the stubs of every selected cluster, without duplicates.
            # Membership is checked on a list because `chosen` holds stubs,
            # and the visible code never establishes that they are hashable.
            chosen = []
            for topic in selections:
                if topic != 'None':
                    for article in clusters[topic]:
                        if article not in chosen:
                            chosen.append(article)

            # Digestor uses 'chosen' to create the digest.
            # 'user_choices' is passed for reference.
            digestor = Digestor(timer=Timer(), cache=USE_CACHE, stubs=chosen, user_choices=list(selections))
            # Digesting is a separate step so it is not automatic upon
            # digestor creation; creates summaries and stores them
            # associated with the digest.
            digestor.digest()

            # Get displayable digest and digest data
            digestor.build_digest()

        if not digestor.text:
            st.write("You didn't select a topic!")
        else:
            st.write("Your digest is ready:\n")

            st.write(digestor.text)

# Bare expression: rendered by Streamlit's "magic" as debug output.
# NOTE(review): placed at top level; confirm the intended nesting.
"st.session_state object:", st.session_state