Spaces:
Running
Running
File size: 15,039 Bytes
9a8d079 939697d 65c7ea9 9cd5d9e 317f161 11067be 68b374e 256f7c8 9a8d079 939697d 317f161 b831af7 6c9d6d5 b831af7 6c9d6d5 b831af7 6c9d6d5 b831af7 939697d b831af7 6c9d6d5 b831af7 317f161 9a8d079 b831af7 9a8d079 b831af7 9a8d079 939697d 9a8d079 b831af7 9a8d079 939697d 65c7ea9 d736278 65c7ea9 d736278 65c7ea9 d736278 65c7ea9 d736278 65c7ea9 d736278 65c7ea9 d736278 65c7ea9 317f161 68b374e b831af7 939697d 9a8d079 11067be d736278 afdf108 68b374e d736278 9f23379 7f477b2 9a8d079 9f23379 b34ed5e 9f23379 9a8d079 9f23379 68b374e 9f23379 256f7c8 9f23379 256f7c8 9f23379 68b374e 9f23379 9a8d079 9f23379 9a8d079 9f23379 9a8d079 9f23379 9a8d079 9f23379 b831af7 9f23379 68b374e 9f23379 9a8d079 9f23379 68b374e b831af7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 |
from collections import Counter
import math
import os
import gradio as gr
from datasets import load_dataset
from nltk.util import ngrams
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from matplotlib import pyplot as plt
from wordcloud import WordCloud
from huggingface_hub import InferenceClient
import matplotlib
matplotlib.use("agg")
def _automated_readability_index(text):
    """Return the Automated Readability Index (ARI) of ``text``.

    ARI = 4.71 * (characters / words) + 0.5 * (words / sentences) - 21.43

    Characters are everything except the space character, words are
    whitespace-separated tokens, and sentences are approximated by the number
    of "."-separated segments.

    Returns NaN when ``text`` contains no words, so an empty address does not
    raise ZeroDivisionError.
    """
    words = len(text.split())
    if words == 0:
        return float("nan")
    characters = len(text.replace(" ", ""))
    # str.split(".") always returns at least one segment, so this is never 0.
    sentences = len(text.split("."))
    return (4.71 * (characters / words)) + (0.5 * (words / sentences)) - 21.43


def load_transform_dataset():
    """Load the State of the Union dataset and add the derived columns.

    Added columns:
        word_count: number of whitespace-separated tokens in ``speech_html``.
        ari: Automated Readability Index computed from ``no-contractions``.
        year: year taken from the ``date`` column (needs a datetime-like column).
        speech_key: ``"<potus> - <year> (<categories>)"``.

    Returns:
        A tuple ``(all_addresses, written, spoken)`` of dataframes sorted by
        date; the last two are the rows whose ``categories`` value is
        "Written" and "Spoken" respectively.
    """
    # Load the dataset and convert it to a Pandas dataframe
    sotu_dataset = "jsulz/state-of-the-union-addresses"
    dataset = load_dataset(sotu_dataset)
    _df = dataset["train"].to_pandas()

    # Do some on-the-fly calculations
    # calculate the number of words in each address
    _df["word_count"] = _df["speech_html"].apply(lambda x: len(x.split()))
    # calculate the automated readability index for each address
    _df["ari"] = _df["no-contractions"].apply(_automated_readability_index)
    # create a column that is the year the speech was given from the date column
    _df["year"] = _df["date"].dt.year
    # create a column that is a concatenation of the president's name, year, and category
    _df["speech_key"] = (
        _df["potus"] + " - " + _df["year"].astype(str) + " (" + _df["categories"] + ")"
    )

    # Sort the dataframe by date because Plotly doesn't do any of this automatically
    _df = _df.sort_values(by="date")
    _written = _df[_df["categories"] == "Written"]
    _spoken = _df[_df["categories"] == "Spoken"]
    return _df, _written, _spoken
"""
Helper functions for Plotly charts
"""
def filter_potus(potus, _df):
    """Return the rows of ``_df`` for one president.

    The sentinel value "All" disables filtering and returns ``_df`` itself;
    any other value keeps only the rows whose ``potus`` column equals it.
    """
    if potus == "All":
        return _df
    return _df[_df["potus"] == potus]
def plotly_ngrams(n_grams, potus, _df):
    """Build a horizontal bar chart of the ten most common n-grams.

    Args:
        n_grams: size of the n-grams to count (1 = single words).
        potus: president name, "All" for every president, or ``None``
            (treated like "All").
        _df: dataframe with ``potus`` and ``tokens-nostop`` columns, where
            each ``tokens-nostop`` value is a sequence of string tokens.

    Returns:
        A Plotly bar figure. It is empty when the selection has no n-grams.
    """
    # A missing selection used to leave ``potus_df`` unbound and crash;
    # fall back to the unfiltered data instead.
    if potus is None:
        potus = "All"
    potus_df = filter_potus(potus, _df)

    # Tally every address into one Counter rather than building a Counter per
    # row and summing them, which copies the running total for each row.
    ngram_counts = Counter()
    for tokens in potus_df["tokens-nostop"]:
        ngram_counts.update(ngrams(tokens, n_grams))

    # get the most common n-grams; building the two columns separately keeps
    # this working when there are none (unpacking zip(*[]) would fail)
    common_ngrams = ngram_counts.most_common(10)
    labels = [" ".join(gram) for gram, _ in common_ngrams]
    counts = [count for _, count in common_ngrams]
    # the column names are kept as-is because they are shown as axis labels
    trigrams_df = pd.DataFrame({"trigrams": labels, "counts": counts})

    if potus == "All":
        potus = "All Presidents"
    fig4 = px.bar(
        trigrams_df,
        x="counts",
        y="trigrams",
        title=f"{potus}'s top {n_grams}-grams",
        orientation="h",
        height=400,
    )
    return fig4
def plotly_word_and_ari(president, _df):
    """Plot word count and ARI over time for one president (or "All").

    Word count is drawn against the primary y-axis and ARI against the
    secondary y-axis of a single subplot; both share the address date on x.
    """
    addresses = filter_potus(president, _df)
    figure = make_subplots(specs=[[{"secondary_y": True}]])

    # (dataframe column, trace name / axis title, drawn on the secondary axis?)
    series = (
        ("word_count", "Word Count", False),
        ("ari", "ARI", True),
    )
    for column, label, on_secondary in series:
        figure.add_trace(
            go.Scatter(x=addresses["date"], y=addresses[column], name=label),
            secondary_y=on_secondary,
        )

    # Figure title, then the shared x-axis title, then one title per y-axis
    figure.update_layout(title_text="Address Word Count and ARI")
    figure.update_xaxes(title_text="Date of Address")
    for _, label, on_secondary in series:
        figure.update_yaxes(title_text=label, secondary_y=on_secondary)
    return figure
def plt_wordcloud(president, _df):
    """Render a word cloud of the lemmatized text for one president (or "All").

    Args:
        president: president name, or "All" for every address.
        _df: dataframe with ``potus`` and ``lemmatized`` columns, where each
            ``lemmatized`` value is a sequence of string tokens.

    Returns:
        A matplotlib ``Figure`` containing the word cloud image.
    """
    # Local import: build the figure directly instead of through pyplot.
    # pyplot keeps every figure it creates in a global registry until it is
    # explicitly closed, so creating one per request accumulates figures in a
    # long-running app.
    from matplotlib.figure import Figure

    potus_df = filter_potus(president, _df)
    # build a single string from the lemmatized tokens of every address
    lemmatized = " ".join(" ".join(tokens) for tokens in potus_df["lemmatized"])
    # create a wordcloud from the lemmatized text
    wordcloud = WordCloud(background_color="white", width=800, height=400).generate(
        lemmatized
    )

    fig6 = Figure(figsize=(8, 4))
    ax = fig6.subplots()
    ax.imshow(wordcloud, interpolation="bilinear")
    ax.axis("off")
    # lay out after the axes exist so the call actually has something to fit
    fig6.tight_layout()
    return fig6
def summarization(speech_key, _df, chunk_len=4000):
    """Summarize a speech chunk by chunk with facebook/bart-large-cnn.

    The speech text is cut into consecutive slices of ``chunk_len``
    characters, each slice is summarized separately, and the summaries are
    joined with blank lines.

    Args:
        speech_key: value of the ``speech_key`` column identifying the speech.
        _df: dataframe with ``speech_key`` and ``speech_html`` columns.
        chunk_len: maximum number of characters sent per request.

    Returns:
        The joined chunk summaries. Chunks whose request failed are skipped.

    Raises:
        IndexError: if no row matches ``speech_key``.
    """
    client = InferenceClient(model="facebook/bart-large-cnn")
    speech = _df[_df["speech_key"] == speech_key]["speech_html"].values[0]
    response = []
    # Slicing past the end of a string is safe, so the final (shorter) chunk
    # needs no special case.
    for start in range(0, len(speech), chunk_len):
        chunk_text = speech[start : start + chunk_len]
        try:
            summarization_chunk = client.summarization(
                chunk_text, parameters={"truncation": "do_not_truncate"}
            )
        except Exception as e:
            # Best effort: report the failure and move on. Falling through
            # here would reuse the previous chunk's result, or hit an unbound
            # name if the very first request failed.
            print(e)
            continue
        response.append(summarization_chunk.summary_text)
    return "\n\n".join(response)
def streaming(speech_key, _df):
    """Summarize a speech with Qwen/Qwen2.5-72B-Instruct via chat completion.

    Despite the name, the streamed response is accumulated and returned as a
    single string rather than yielded piece by piece.

    Args:
        speech_key: value of the ``speech_key`` column, expected to look like
            ``"<president> - <year> (<category>)"``; falsy when nothing has
            been selected.
        _df: dataframe with ``speech_key`` and ``speech_html`` columns.

    Returns:
        The generated summary, or a short prompt when no speech was selected.

    Raises:
        KeyError: if the ``HF_TOKEN`` environment variable is not set.
        IndexError: if no row matches ``speech_key`` or the key has no
            " - " separator.
    """
    if not speech_key:
        # Nothing selected: the lookup below would fail with IndexError.
        return "Please select a speech to summarize."

    client = InferenceClient(token=os.environ["HF_TOKEN"])
    speech = _df[_df["speech_key"] == speech_key]["speech_html"].values[0]
    # Split on the last separator only, so a " - " inside the president's
    # name cannot shift the year out of position.
    speech_info = speech_key.rsplit(" - ", 1)
    messages = []
    for message in client.chat_completion(
        model="Qwen/Qwen2.5-72B-Instruct",
        messages=[
            {
                "role": "system",
                "content": "You are a political scholar with a deep knowledge of State of the Union addresses. You are tasked with summarizing a speech from a given president. The speech is a mix of written and spoken addresses. The goal is to provide a concise summary of the speech with the proper historical and political context.",
            },
            {
                "role": "user",
                "content": f"The following speech is a State of the Union address from {speech_info[0]} on {speech_info[1]}. Summarize it: {speech}",
            },
        ],
        max_tokens=700,
        stream=True,
    ):
        if not message.choices:
            continue
        content = message.choices[0].delta.content
        # Stream chunks may carry no text (content is None); joining a None
        # would raise TypeError.
        if content:
            messages.append(content)
    return "".join(messages)
# Create a Gradio interface with blocks.
# Layout: an intro, a "Speech Data" tab with static and per-president charts,
# and a "Summarize a Speech" tab wired to the `streaming` helper.
# The Markdown text blocks are kept flush-left so the string contents carry no
# leading indentation.
with gr.Blocks() as demo:
    # `written` and `spoken` are not used further in this block
    df, written, spoken = load_transform_dataset()
    # store the dataframe in a state object before passing to component functions
    df_state = gr.State(df)
    # Build out the top level static charts and content
    gr.Markdown(
        """
# An Interactive Dashboard for State of the Union Addresses
This dashboard provides an analysis of all State of the Union (SOTU) addresses from 1790 to 2020 including written and spoken addresses. The data is sourced from the [State of the Union Addresses dataset](https://huggingface.co/datasets/jsulz/state-of-the-union-addresses) on the Hugging Face Datasets Hub. You can read more about how the data was gathered and cleaned on the dataset card. To read the speeches, you can visit the [The American Presidency Project's State of the Union page](https://www.presidency.ucsb.edu/documents/presidential-documents-archive-guidebook/annual-messages-congress-the-state-the-union) where this data was sourced.
"""
    )
    gr.Markdown(
        "In addition to analyzing the content, this space also leverages the [Qwen/Qwen2.5-72B-Instruct](https://deepinfra.com/Qwen/Qwen2.5-72B-Instruct) model to summarize a speech. The model is tasked with providing a concise summary of a speech from a given president. To get a summary, go to the 'Summarize a Speech' tab."
    )
    with gr.Tab(label="Speech Data"):
        # Basic line chart showing the total number of words in each address
        gr.Markdown(
            """
## The shape of words
The line chart to the right shows the total number of words in each address. However, not all SOTUs are created equally. From 1801 to 1912, each address was a written message to Congress. In 1913, Woodrow Wilson broke with tradition and delivered his address in person. Since then, the addresses have been a mix of written and spoken (mostly spoken).
The spikes you see in the early 1970's and early 1980's are from written addresses by Richard Nixon and Jimmy Carter respectively.
Now that we have a little historical context, what does this data look like if we split things out by president? The bar chart below shows the average number of words in each address by president. The bars are grouped by written and spoken addresses.
"""
        )
        fig1 = px.line(
            df,
            x="date",
            y="word_count",
            title="Total Number of Words in Addresses",
            line_shape="spline",
        )
        fig1.update_layout(
            xaxis=dict(title="Date of Address"),
            yaxis=dict(title="Word Count"),
        )
        # NOTE(review): the text above says "to the right" and this plot sets
        # `scale`, but no gr.Row() wraps them here - confirm intended layout.
        gr.Plot(fig1, scale=2)
        # group by president and category and calculate the average word count
        avg_word_count = (
            df.groupby(["potus", "categories"])["word_count"].mean().reset_index()
        )
        # Build a bar chart showing the average number of words in each address by president
        fig2 = px.bar(
            avg_word_count,
            x="potus",
            y="word_count",
            title="Average Number of Words in Addresses by President",
            color="categories",
            barmode="group",
        )
        fig2.update_layout(
            xaxis=dict(
                title="President",
                tickangle=-45,  # Rotate labels 45 degrees counterclockwise
            ),
            yaxis=dict(
                title="Average Word Count",
                tickangle=0,  # Default label angle (horizontal)
            ),
            legend=dict(
                orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1
            ),
        )
        gr.Plot(fig2)
        # Create a line chart showing the Automated Readability Index in each address
        with gr.Row():
            ari = df[["potus", "date", "ari", "categories"]]
            fig3 = px.line(
                ari,
                x="date",
                y="ari",
                title="Automated Readability Index in each Address",
                line_shape="spline",
            )
            fig3.update_layout(
                xaxis=dict(title="Date of Address"),
                yaxis=dict(title="ARI Score"),
            )
            gr.Plot(fig3, scale=2)
            gr.Markdown(
                """
The line chart to the left shows the Automated Readability Index (ARI) for each speech by year. The ARI is calculated using the formula: 4.71 * (characters/words) + 0.5 * (words/sentences) - 21.43. In general, ARI scores correspond to U.S. grade levels. For example, an ARI of 8.0 corresponds to an 8th grade reading level.
While there are other scores that are more representative of attributes we might want to measure, they require values like syllables. The ARI is a simple score to compute with our data.
The drop off is quite noticeable, don't you think? ;)
"""
            )
        gr.Markdown(
            """
## Dive Deeper on Each President
Use the dropdown to select a president and go a little deeper.
To begin with, there is an [n-gram](https://en.wikipedia.org/wiki/N-gram) bar chart built from all of the given president's addresses. An n-gram is a contiguous sequence of n items from a given sample of text or speech. Because written and spoken speech is littered with so-called "stop words" such as "and", "the", and "but", they've been removed to provide a more rich (albeit sometimes more difficult to read) view of the text.
The slider only goes up to 4-grams because the data is sparse beyond that. I personally found the n-grams from our last three presidents to be less than inspiring and full of platitudes. Earlier presidents have more interesting n-grams.
Next up is a word cloud of the lemmatized text from the president's addresses. [Lemmatization](https://en.wikipedia.org/wiki/Lemmatization) is the process of grouping together the inflected forms of a word so they can be analyzed as a single item. Think of this as a more advanced version of [stemming](https://en.wikipedia.org/wiki/Stemming) where we can establish novel links between words like "better" and "good" that might otherwise be overlooked in stemming.
You can also see a line chart of word count and ARI for each address.
"""
        )
        # get all unique president names, plus an "All" option
        presidents = df["potus"].unique()
        presidents = presidents.tolist()
        presidents.append("All")
        # create a dropdown to select a president
        president = gr.Dropdown(
            label="Select a President", choices=presidents, value="All"
        )
        # create a slider for number of word grams
        grams = gr.Slider(
            minimum=1, maximum=4, step=1, label="N-grams", interactive=True, value=1
        )
        # Each plot below is given a function plus `inputs`, so its value is
        # computed from the current dropdown/slider values.
        # show a bar chart of the top n-grams for a selected president
        gr.Plot(plotly_ngrams, inputs=[grams, president, df_state])
        # show a word cloud for a selected president
        gr.Plot(plt_wordcloud, scale=2, inputs=[president, df_state])
        # show a line chart of word count and ARI for a selected president
        gr.Plot(plotly_word_and_ari, inputs=[president, df_state])
    with gr.Tab(label="Summarize a Speech"):
        gr.Markdown("## Summarize a Speech")
        gr.Markdown(
            """
Context is king; get a summary of a State of the Union now that you've seen a bit more. Use the dropdown to select a speech from a president and click the button to summarize the speech. [Qwen/Qwen2.5-72B-Instruct](https://deepinfra.com/Qwen/Qwen2.5-72B-Instruct) will provide a concise summary of the speech with the proper historical and political context.
"""
        )
        # create a dropdown to select a speech from a president
        speeches = df["speech_key"].unique()
        speeches = speeches.tolist()
        speech = gr.Dropdown(label="Select a Speech", choices=speeches)
        run_summarization = gr.Button(value="Summarize")
        # create a text area to display the summarized speech
        fin_speech = gr.Textbox(label="Summarized Speech", type="text", lines=10)
        run_summarization.click(
            streaming, inputs=[speech, df_state], outputs=[fin_speech]
        )
demo.launch()
|