# streamlit_app.py manages the whole TopicDig process
from typing import List, Set
from collections import namedtuple
import random
import requests
import json
import re
from datetime import datetime as dt
from codetiming import Timer
import streamlit as st
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from digestor import Digestor
from source import Source
from scrape_sources import NPRLite, CNNText, stub
@st.cache()
def initialize(limit, use_cache=True):
    clusters: dict[str, List[namedtuple]] = dict()
    # Container for the source classes.
    # Sources are added explicitly here for now; a future version should
    # read them from a config file instead.
    sources: List[Source] = []
    sources.append(NPRLite(
        'npr',
        'https://text.npr.org/1001',
        'sshleifer/distilbart-cnn-12-6',
        #'google/pegasus-multi_news',
        'dbmdz/bert-large-cased-finetuned-conll03-english'
    ))
    sources.append(CNNText(
        'cnn',
        'https://lite.cnn.com',
        'sshleifer/distilbart-cnn-12-6',
        #'google/pegasus-multi_news',
        'dbmdz/bert-large-cased-finetuned-conll03-english'
    ))
    # Containers for the article stubs and per-source metadata.
    cluster_data: List[namedtuple('article', ['link', 'hed', 'entities', 'source'])]
    article_dict: dict[str, namedtuple]
    # For each source, retrieve_cluster_data returns a list of article
    # namedtuples with empty entity lists, plus the article count.
    cluster_data = []
    article_meta = namedtuple('article_meta', ['source', 'count'])
    cluster_meta: List[article_meta] = []
    for data_source in sources:
        if limit is not None:
            # c_data is the list of article namedtuples for this source;
            # c_meta is the number of articles retrieved from it.
            c_data, c_meta = data_source.retrieve_cluster_data(limit // len(sources))
        else:
            c_data, c_meta = data_source.retrieve_cluster_data()
        cluster_data.append(c_data)
        cluster_meta.append(article_meta(data_source.source_name, c_meta))
        st.session_state[data_source.source_name] = f"Number of articles from source: {c_meta}"
    # Flatten the per-source lists into a single list of article stubs.
    cluster_data = [article for source_list in cluster_data for article in source_list]
    # NER: pass each headline to the API query method, feed the result through
    # ner_results to populate the stub's 'entities' list, then group the stubs
    # into clusters keyed by entity.
    for tup in cluster_data:
        perform_ner(tup, cache=use_cache)
        generate_clusters(clusters, tup)
    st.session_state['num_clusters'] = f"""Total number of clusters: {len(clusters)}"""
    # article_dict tracks all stubs by headline.
    # If a cluster is unsummarized, its headline maps to the namedtuple stub;
    # otherwise it references the Digestor instance so the summary can be found.
    article_dict = {stub.hed: stub for stub in cluster_data}
    return article_dict, clusters
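# For orientation, the returned structures look roughly like this (headlines,
# links, and entity names below are purely illustrative, not real output):
#   article_dict = {"Senate passes budget bill": article(link=..., hed="Senate passes budget bill",
#                                                        entities=["Senate"], source="npr"), ...}
#   clusters     = {"Senate": [article(...), article(...)], "Ukraine": [article(...)], ...}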
# Queries the NER model for a single article stub and fills its entity list in place.
def perform_ner(tup: namedtuple('article', ['link', 'hed', 'entities', 'source']), cache=True):
    with Timer(name="ner_query_time", logger=None):
        result = ner_results(ner_query(
            {
                "inputs": tup.hed,
                # Note: depending on the Inference API version, cache control may
                # need to go under "options" rather than "parameters".
                "parameters": {
                    "use_cache": cache,
                },
            }
        ))
    tup.entities.extend(result)
def ner_query(payload):
    data = json.dumps(payload)
    response = requests.request("POST", NER_API_URL, headers=headers, data=data)
    return json.loads(response.content.decode("utf-8"))
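# The token-classification endpoint normally returns a list of dicts; with a
# grouped aggregation strategy each dict looks roughly like the following
# (values here are illustrative only):
#   [{"entity_group": "PER", "word": "Biden", "score": 0.998, "start": 0, "end": 5}, ...]
# ner_results() below relies only on the 'entity_group'/'entity', 'word', and 'score' keys.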
def generate_clusters(
        the_dict: dict,
        tup: namedtuple('article_stub', ['link', 'hed', 'entities', 'source'])
) -> None:
    # Mutates the_dict in place: one cluster (list of stubs) per entity.
    for entity in tup.entities:
        # Add a cluster if the entity is not already in the dict.
        if entity not in the_dict:
            the_dict[entity] = []
        # Add this article's stub to the entity's cluster.
        the_dict[entity].append(tup)
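# Example: a stub whose headline yielded the entities ["NATO", "Ukraine"] is
# appended to both the_dict["NATO"] and the_dict["Ukraine"], creating either
# cluster on first use (entity names here are hypothetical).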
def ner_results(ner_object, groups=True, NER_THRESHOLD=0.5) -> List[str]:
    # Empty lists to collect the entities by type.
    people, places, orgs, misc = [], [], [], []
    # 'ent' and 'designation' handle the difference between dictionary keys
    # for the grouped vs. ungrouped aggregation strategy.
    ent = 'entity' if not groups else 'entity_group'
    designation = 'I-' if not groups else ''
    # Dispatch table: keys are the entity labels used in the ner_query result;
    # values are the append methods of the per-type lists above, so each
    # recognized entity is routed to the list for its type.
    actions = {designation + 'PER': people.append,
               designation + 'LOC': places.append,
               designation + 'ORG': orgs.append,
               designation + 'MISC': misc.append
               }
    # For each dict in the NER result list, if the entity string doesn't contain
    # a '#' (i.e. it isn't a sub-word fragment) and its confidence exceeds
    # NER_THRESHOLD, append the entity to the list for its type.
    for d in ner_object:
        if '#' not in d['word'] and d['score'] > NER_THRESHOLD:
            actions[d[ent]](d['word'])
    # Collect all unique entities longer than two characters into one list.
    ner_list = ([i for i in set(people) if len(i) > 2]
                + [i for i in set(places) if len(i) > 2]
                + [i for i in set(orgs) if len(i) > 2]
                + [i for i in set(misc) if len(i) > 2])
    return ner_list
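# Illustrative example: for the headline "Biden meets Macron in Paris", grouped
# results above the threshold would yield something like ['Biden', 'Macron', 'Paris']
# (entity order follows the person/place/org/misc grouping, not headline order).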
def show_length_graph(outdata):
    # Paired bars of original vs. summarized length for each source article.
    labels = list(range(outdata['article_count']))
    original_length = [outdata['summaries'][i]['original_length'] for i in outdata['summaries']]
    summarized_length = [outdata['summaries'][i]['summary_length'] for i in outdata['summaries']]
    x = np.arange(len(labels))  # the label locations
    width = 0.35  # the width of the bars
    fig, ax = plt.subplots(figsize=(14, 8))
    # Filled bars for the data, outlined bars drawn on top for contrast.
    ax.bar(x - width/2, original_length, width, label='Original', color='lightgreen', zorder=0)
    ax.bar(x + width/2, summarized_length, width, label='Summary', color='lightblue', zorder=0)
    ax.bar(x - width/2, original_length, width, color='none', edgecolor='black', lw=1.25, zorder=1)
    ax.bar(x + width/2, summarized_length, width, color='none', edgecolor='black', lw=1.25, zorder=1)
    # Labels, title, and custom axis ticks.
    ax.set_ylabel('Text Length')
    ax.set_xticks(x)
    ax.set_yticks(list(range(0, max(original_length), max(summarized_length))))
    ax.set_xticklabels(labels)
    ax.set_xlabel('Source article')
    ax.legend(loc='upper right')
    plt.title('Original to Summarized Text Compression (space-separated tokens)')
    st.pyplot(fig)
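# Illustrative reading of the chart: if article 3 had an original_length of 540
# and a summary_length of 120 space-separated tokens, its pair of bars would show
# roughly 4.5x compression (numbers here are hypothetical).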
def check_for_word_and_word(in_string):
    # Returns the first "<word> and <word>" repetition found, else None.
    m = re.search(r'(\w\w+)\sand\s\1', in_string)
    if m is not None:
        return m.group()
    return None
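# Example: check_for_word_and_word("the senate and senate voted") returns
# "senate and senate", while check_for_word_and_word("cats and dogs") returns None.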
# These could be passed through the command line
# or read from a config file.
# One of these is needed here for NER and one in Digestor for summarization.
NER_API_URL = "https://api-inference.huggingface.co/models/dbmdz/bert-large-cased-finetuned-conll03-english"
headers = {"Authorization": f"""Bearer {st.secrets['ato']}"""}
LIMIT = 30 # Controls time and number of clusters.
USE_CACHE = True
if not USE_CACHE:
    print("NOT USING CACHE")
if LIMIT is not None:
    print(f"LIMIT: {LIMIT}")
# Digest store: key is cluster, value is Digestor object (not currently used elsewhere in this script).
digests = dict()
out_dicts = []
# Retrieve cluster data, create a dict to track each article stub,
# and build topic clusters by performing NER on the headlines.
print("Initializing....")
article_dict, clusters = initialize(LIMIT, USE_CACHE)
# We now have the article dict and the topic clusters (both reference the same stubs).
# Welcome and explainer
st.title("Welcome to TopicDig")
st.subheader("Automatic news article summarization with transformers!")
st.success("You select the topics, we summarize the relevant news and show you a digest, plus some info to help contextualize what the machine did.")
st.write("On the left you'll find a list of topics recently gleaned from current news headlines. TopicDig lets you assemble digests of these stories using transformers!")
st.warning("Enjoy, and remember, these summaries contain a few kinds of issues, from untruths to missing attribution or topic sentences. For more information on truthfulness in automatic summarization with transformers see https://arxiv.org/abs/2109.07958.")
st.subheader("How it works:")
st.write("""Select 1 to 3 topics from the drop down menus and click 'submit' to start generating your digest. \n\n Extra options include refreshing the topics and changing the length of summaries and consequently of the digest.""")
# Provides expandable container for refresh and summarization parameters, currently only chunk size
with st.expander("See extra options"):
    st.subheader("Refresh topics: ")
    st.write("You may want to refresh the topic lists if the app loaded several hours ago or you get no summary.")
    # Button to refresh topics.
    if st.button("Refresh topics!"):
        article_dict, clusters = initialize(LIMIT, USE_CACHE)
    st.subheader("Select chunk size: ")
    st.write("Smaller chunks mean more of the article is included in the summary, and a longer digest.")
    chunk_size = st.select_slider(label="Chunk size", options=list(range(50, 801, 50)), value=400)
selections = []
choices = list(clusters.keys())
choices.insert(0,'None')
# May be desired in sidebar - april 15 2022
# st.write(f"CNN articles: {st.session_state['cnn']}")
# st.write(f"NPR articles: {st.session_state['npr']}")
# st.write(f"Number of clusters {st.session_state['num_clusters']}")
# Display topics to user currently in sidebar - april 15 2022
st.sidebar.subheader("Topics")
st.sidebar.write("Here are the current news topics and the number of articles whose headlines featured those topics.")
show_clusters = {i:len(clusters[i]) for i in clusters.keys()}
cdf = pd.DataFrame(data={"Cluster":list(show_clusters.keys()), "Articles":list(show_clusters.values())} ).sort_values(by='Articles', ascending=False)
styler = cdf.style.hide_index()
st.sidebar.write(styler.to_html(), unsafe_allow_html=True)
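# The sidebar table then looks something like this (topics and counts are hypothetical):
#   Cluster    Articles
#   Ukraine    7
#   Congress   3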
# Get session time
st.session_state['dt'] = dt.now()
# Form used to take 3 menu inputs
with st.form(key='columns_in_form'):
    cols = st.columns(3)
    for i, col in enumerate(cols):
        selections.append(col.selectbox('Make a Selection', choices, key=i))
    submitted = st.form_submit_button('Submit')
if submitted:
    # Drop placeholder selections.
    selections = [i for i in selections if i != 'None']
    with st.spinner(text="Creating your digest: this will take a few moments."):
        chosen = []
        for i in selections:  # i is a selected topic; clusters[i] is its list of article stubs.
            for j in clusters[i]:
                if j not in chosen:
                    chosen.append(j)  # j is a stub.
        # Digestor uses 'chosen' to create the digest.
        # 'user_choices' is passed for reference.
        digestor = Digestor(timer=Timer(), cache=USE_CACHE, stubs=chosen, user_choices=selections,
                            token_limit=1024, word_limit=chunk_size)
        # Summarization happens in digest() below rather than automatically on
        # Digestor creation, which also makes it easy to turn caching off for testing.
        st.subheader("What you'll see:")
        st.write("First you'll see a list of links appear below. These are the links to the original articles being summarized for your digest, so you can get the full story if you're interested, or check the summary against the source.")
        st.write("In a few moments, your machine-generated digest will appear below the links, and below that you'll see an approximate word count of your digest and the time in seconds that the whole process took!")
        st.write("You'll also see a graph showing, for each article and summary, the original and summarized lengths.")
        st.error("Remember: This only demos news article summarization. It is not yet completely reliable, and may distort some facts. An analysis of factfulness is in progress by the app creator.")
        # st.write("Finally, you will see some possible errors detected in the summaries. This area of NLP is far from perfection and always developing. Hopefully this is an interesting step in the path!")
        digestor.digest()  # Creates summaries and stores them on the digestor.
        # Get the displayable digest and digest data.
        outdata = digestor.build_digest()
    if len(digestor.text) == 0:
        st.write("No text to return...very sorry. Please hit 'refresh topics' in the options panel!")
    else:
        st.subheader("Your digest:")
        st.info(digestor.text.replace("$", "\\$"))
        st.subheader("Summarization stats:")
        col1, col2, col3 = st.columns(3)
        col1.metric("Digest Time", f"""{digestor.timer.timers['digest_time']:.2f}""", "seconds")
        col2.metric("Digest Length", str(len(digestor.text.split(" "))), 'space-sep tokens')
        col3.metric("Article Count", str(outdata['article_count']), "articles")
        st.subheader("Article Compression:")
        # Plot original vs. summarized lengths for all articles.
        show_length_graph(outdata)
# Issues section: search for known problems with summaries
# st.header("Things to look for: ")
# st.subheader("Factfulness:")
# st.write("Automatically checking the truthfulness of a document isn't a trivial task, and is not implemented here. Users are encouraged to use their own wider knowledge to look for possible falsehoods. In the normal news a reader is understood to have a certain amount of understanding to comprehend the news. This experimental application requires a bit more, but seems promising.")
#st.subheader("Repetition:")
#rep_check = check_for_word_and_word(digestor.text)
#if rep_check is not None:
# st.write(f"Following phrases repeat: {rep_check}")
# found_index = digestor.text.find(rep_check)
# st.write("Sample:")
# st.write(f"{text[found_index-40:found_index+40]}")
#else:
# st.write("No repetition detected.")
#
# Same article from different sources
#st.subheader("Text redundancy: ")
# for each in selections:
# if each != 'None':
# # check if two source articles share a cluster and not a source.
# sources = {}
# for i in clusters[each]:
### if i[3].source_name not in sources:
# st.write(f"i[3].source_name: {i[3].source_name}")
# sources[i[3].source_name] = 0
# else:
# print("One or more articles on the same topic may have come from different sources. \n\n This may cause redundancy in the digest, though it can also add further clarity, if the two articles are significantly different.")
# break
# st.write("If more than one source have their own versions of the same topic from the same perspective, the result may be repetitive, or it may add nuance and the two summaries may complement each other.")