from collections import Counter
import math
import os
import gradio as gr
from datasets import load_dataset
from nltk.util import ngrams
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from matplotlib import pyplot as plt
from wordcloud import WordCloud
from huggingface_hub import InferenceClient
import matplotlib

# Use a non-interactive backend so Matplotlib can render figures server-side
matplotlib.use("agg")


def load_transform_dataset():
    # Load the dataset and convert it to a Pandas dataframe
    sotu_dataset = "jsulz/state-of-the-union-addresses"
    dataset = load_dataset(sotu_dataset)
    _df = dataset["train"].to_pandas()
    # Do some on-the-fly calculations
    # calculate the number of words in each address
    _df["word_count"] = _df["speech_html"].apply(lambda x: len(x.split()))
    # calculate the Automated Readability Index (ARI) for each address:
    # ARI = 4.71 * (characters/words) + 0.5 * (words/sentences) - 21.43
    # (splitting on "." is a rough sentence count; abbreviations will inflate it)
    _df["ari"] = _df["no-contractions"].apply(
        lambda x: (4.71 * (len(x.replace(" ", "")) / len(x.split())))
        + (0.5 * (len(x.split()) / len(x.split("."))))
        - 21.43
    )
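    # Worked example (hypothetical numbers): 100 characters, 20 words, 2 sentences
    # gives ARI = 4.71*(100/20) + 0.5*(20/2) - 21.43 = 23.55 + 5.0 - 21.43 ≈ 7.1,
    # i.e. roughly a 7th-grade reading level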
    # create a column with the year the speech was given, from the date column
    _df["year"] = _df["date"].dt.year
    # create a column that is a concatenation of the president's name, year, and category
    _df["speech_key"] = (
        _df["potus"] + " - " + _df["year"].astype(str) + " (" + _df["categories"] + ")"
    )
    # Sort the dataframe by date because Plotly doesn't do any of this automatically
    _df = _df.sort_values(by="date")
    _written = _df[_df["categories"] == "Written"]
    _spoken = _df[_df["categories"] == "Spoken"]
    return _df, _written, _spoken


"""
Helper functions for Plotly charts
"""


def filter_potus(potus, _df):
    if potus != "All":
        # Filter on the potus
        potus_df = _df[_df["potus"] == potus]
    else:
        potus_df = _df
    return potus_df


def plotly_ngrams(n_grams, potus, _df):
    if potus is not None:
        potus_df = filter_potus(potus, _df)
        # Build a single Counter of n-grams across the selected president's addresses
        ngram_counts = (
            potus_df["tokens-nostop"]
            .apply(lambda x: list(ngrams(x, n_grams)))
            .apply(Counter)
            .sum()
        )
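        # .sum() reduces the Series of Counters with +; Counter addition merges
        # counts, e.g. Counter(a=1) + Counter(a=2) == Counter(a=3)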
        # get the 10 most common n-grams
        common_ngrams = ngram_counts.most_common(10)
        # unzip the list of (ngram, count) tuples to plot as a bar chart
        grams, counts = zip(*common_ngrams)
        # join each n-gram tuple into a single string
        grams = [" ".join(gram) for gram in grams]
        # create a dataframe from the n-grams and counts
        ngrams_df = pd.DataFrame({"ngrams": grams, "counts": counts})
        if potus == "All":
            potus = "All Presidents"
        fig4 = px.bar(
            ngrams_df,
            x="counts",
            y="ngrams",
            title=f"{potus}'s top {n_grams}-grams",
            orientation="h",
            height=400,
        )
        return fig4


def plotly_word_and_ari(president, _df):
    potus_df = filter_potus(president, _df)
    fig5 = make_subplots(specs=[[{"secondary_y": True}]])
    fig5.add_trace(
        go.Scatter(
            x=potus_df["date"],
            y=potus_df["word_count"],
            name="Word Count",
        ),
        secondary_y=False,
    )
    fig5.add_trace(
        go.Scatter(
            x=potus_df["date"],
            y=potus_df["ari"],
            name="ARI",
        ),
        secondary_y=True,
    )
    # Add figure title
    fig5.update_layout(title_text="Address Word Count and ARI")

    # Set x-axis title
    fig5.update_xaxes(title_text="Date of Address")

    # Set y-axes titles
    fig5.update_yaxes(title_text="Word Count", secondary_y=False)
    fig5.update_yaxes(title_text="ARI", secondary_y=True)
    return fig5


def plt_wordcloud(president, _df):
    potus_df = filter_potus(president, _df)
    lemmatized = potus_df["lemmatized"].apply(lambda x: " ".join(x))
    # build a single string from lemmatized
    lemmatized = " ".join(lemmatized)
    # create a wordcloud from the lemmatized column of the dataframe
    wordcloud = WordCloud(background_color="white", width=800, height=400).generate(
        lemmatized
    )
    # create a matplotlib figure and draw the wordcloud into it
    fig6 = plt.figure(figsize=(8, 4))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    # tighten the layout after the image is drawn so it actually takes effect
    plt.tight_layout()
    return fig6


def summarization(speech_key, _df):
    client = InferenceClient(model="facebook/bart-large-cnn")
    chunk_len = 4000
    speech = _df[_df["speech_key"] == speech_key]["speech_html"].values[0]
    sotu_chunks = int(math.ceil(len(speech) / chunk_len))
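    # e.g., a 10,000-character speech needs ceil(10000 / 4000) = 3 chunks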
    response = []
    for chunk in range(1, sotu_chunks + 1):
        if chunk * chunk_len < len(speech):
            chunk_text = speech[(chunk - 1) * chunk_len : chunk * chunk_len]
        else:
            chunk_text = speech[(chunk - 1) * chunk_len :]
        try:
            summarization_chunk = client.summarization(
                chunk_text, parameters={"truncation": "do_not_truncate"}
            )
        except Exception as e:
            # skip a failed chunk instead of hitting an unbound variable below
            print(e)
            continue
        response.append(summarization_chunk.summary_text)

    return "\n\n".join(response)


def streaming(speech_key, _df):
    client = InferenceClient(token=os.environ["HF_TOKEN"])
    speech = _df[_df["speech_key"] == speech_key]["speech_html"].values[0]
    speech_info = speech_key.split(" - ")
    messages = []
    for message in client.chat_completion(
        model="Qwen/Qwen2.5-72B-Instruct",
        messages=[
            {
                "role": "system",
                "content": "You are a political scholar with a deep knowledge of State of the Union addresses. You are tasked with summarizing a speech from a given president. The speech is a mix of written and spoken addresses. The goal is to provide a concise summary of the speech with the proper historical and political context.",
            },
            {
                "role": "user",
                "content": f"The following speech is a State of the Union address from {speech_info[0]} on {speech_info[1]}. Summarize it: {speech}",
            },
        ],
        max_tokens=700,
        stream=True,
    ):
        # the final stream chunk can carry no text, so guard against None
        if message.choices[0].delta.content is not None:
            messages.append(message.choices[0].delta.content)
    return "".join(messages)


# Create a Gradio interface with blocks
with gr.Blocks() as demo:
    df, written, spoken = load_transform_dataset()
    # store the dataframe in a state object before passing to component functions
    df_state = gr.State(df)
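    # (gr.State keeps a per-session copy, so callbacks can take the dataframe
    # as an input without re-loading the dataset or relying on a global)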
    # Build out the top level static charts and content
    gr.Markdown(
        """
        # An Interactive Dashboard for State of the Union Addresses
        This dashboard provides an analysis of all State of the Union (SOTU) addresses from 1790 to 2020 including written and spoken addresses. The data is sourced from the [State of the Union Addresses dataset](https://huggingface.co/datasets/jsulz/state-of-the-union-addresses) on the Hugging Face Datasets Hub. You can read more about how the data was gathered and cleaned on the dataset card. To read the speeches, you can visit the [The American Presidency Project's State of the Union page](https://www.presidency.ucsb.edu/documents/presidential-documents-archive-guidebook/annual-messages-congress-the-state-the-union) where this data was sourced.
        """
    )

    gr.Markdown(
        "In addition to analyzing the content, this space also leverages the [Qwen/2.5-72B-Instruct](https://deepinfra.com/Qwen/Qwen2.5-72B-Instruct) model to summarize a speech. The model is tasked with providing a concise summary of a speech from a given president. To get a summary, go to the 'Summarize a Speech' tab."
    )

    with gr.Tab(label="Speech Data"):
        # Basic line chart showing the total number of words in each address
        gr.Markdown(
            """
                    ## The shape of words
                    The line chart to the right shows the total number of words in each address. However, not all SOTUs are created equal. From 1801 to 1912, each address was a written message to Congress. In 1913, Woodrow Wilson broke with tradition and delivered his address in person. Since then, the addresses have been a mix of written and spoken (mostly spoken).

                    The spikes you see in the early 1970s and early 1980s are from written addresses by Richard Nixon and Jimmy Carter, respectively.

                    Now that we have a little historical context, what does this data look like if we split things out by president? The bar chart below shows the average number of words in each address by president. The bars are grouped by written and spoken addresses.
                    """
        )
        fig1 = px.line(
            df,
            x="date",
            y="word_count",
            title="Total Number of Words in Addresses",
            line_shape="spline",
        )
        fig1.update_layout(
            xaxis=dict(title="Date of Address"),
            yaxis=dict(title="Word Count"),
        )
        gr.Plot(fig1, scale=2)
        # group by president and category and calculate the average word count sort by date
        avg_word_count = (
            df.groupby(["potus", "categories"])["word_count"].mean().reset_index()
        )
        # Build a bar chart showing the average number of words in each address by president
        fig2 = px.bar(
            avg_word_count,
            x="potus",
            y="word_count",
            title="Average Number of Words in Addresses by President",
            color="categories",
            barmode="group",
        )
        fig2.update_layout(
            xaxis=dict(
                title="President",
                tickangle=-45,  # Rotate labels 45 degrees counterclockwise
            ),
            yaxis=dict(
                title="Average Word Count",
                tickangle=0,  # Default label angle (horizontal)
            ),
            legend=dict(
                orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1
            ),
        )
        gr.Plot(fig2)

        # Create a line chart showing the Automated Readability Index in each address
        with gr.Row():
            ari = df[["potus", "date", "ari", "categories"]]
            fig3 = px.line(
                ari,
                x="date",
                y="ari",
                title="Automated Readability Index in each Address",
                line_shape="spline",
            )
            fig3.update_layout(
                xaxis=dict(title="Date of Address"),
                yaxis=dict(title="ARI Score"),
            )
            gr.Plot(fig3, scale=2)
            gr.Markdown(
                """
                    The line chart to the left shows the Automated Readability Index (ARI) for each speech by year. The ARI is calculated using the formula: 4.71 * (characters/words) + 0.5 * (words/sentences) - 21.43. In general, ARI scores correspond to U.S. grade levels. For example, an ARI of 8.0 corresponds to an 8th-grade reading level.

                    While there are other scores that are more representative of attributes we might want to measure, they require values like syllable counts. The ARI is a simple score to compute with our data.

                    The drop-off is quite noticeable, don't you think? ;)
                """
            )
        gr.Markdown(
            """
                ## Dive Deeper on Each President

                Use the dropdown to select a president and go a little deeper.
                
                To begin with, there is an [n-gram](https://en.wikipedia.org/wiki/N-gram) bar chart built from all of the given president's addresses. An n-gram is a contiguous sequence of n items from a given sample of text or speech. Because written and spoken language is littered with so-called "stop words" such as "and", "the", and "but", they've been removed to provide a richer (albeit sometimes harder to read) view of the text.
                
                The slider only goes up to 4-grams because the data is sparse beyond that. I personally found the n-grams from our last three presidents to be less than inspiring and full of platitudes. Earlier presidents have more interesting n-grams.

                Next up is a word cloud of the lemmatized text from the president's addresses. [Lemmatization](https://en.wikipedia.org/wiki/Lemmatization) is the process of grouping together the inflected forms of a word so they can be analyzed as a single item. Think of it as a more advanced version of [stemming](https://en.wikipedia.org/wiki/Stemming) that can establish links between words like "better" and "good" that stemming would overlook.
                
                You can also see a line chart of word count and ARI for each address.
        """
        )
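        # Illustrative only: nltk.util.ngrams slides a window over a token list,
        # e.g. list(ngrams(["we", "the", "people"], 2)) -> [("we", "the"), ("the", "people")],
        # while lemmatization maps inflected forms to a base, e.g. "geese" -> "goose"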
        # get all unique president names
        presidents = df["potus"].unique()
        presidents = presidents.tolist()
        presidents.append("All")

        # create a dropdown to select a president
        president = gr.Dropdown(
            label="Select a President", choices=presidents, value="All"
        )
        # create a slider for the number of word n-grams
        grams = gr.Slider(
            minimum=1, maximum=4, step=1, label="N-grams", interactive=True, value=1
        )

        # show a bar chart of the top n-grams for a selected president
        gr.Plot(plotly_ngrams, inputs=[grams, president, df_state])

        gr.Plot(plt_wordcloud, scale=2, inputs=[president, df_state])

        # show a line chart of word count and ARI for a selected president
        gr.Plot(plotly_word_and_ari, inputs=[president, df_state])

    with gr.Tab(label="Summarize a Speech"):
        gr.Markdown("## Summarize a Speech")
        gr.Markdown(
            """
            Context is king; now that you've seen a bit more, get a summary of a State of the Union. Use the dropdown to select a speech from a president and click the button to summarize it. [Qwen2.5-72B-Instruct](https://deepinfra.com/Qwen/Qwen2.5-72B-Instruct) will provide a concise summary of the speech with the proper historical and political context.
            """
        )
        speeches = df["speech_key"].unique()
        speeches = speeches.tolist()
        # create a dropdown to select a speech from a president
        speech = gr.Dropdown(label="Select a Speech", choices=speeches)
        run_summarization = gr.Button(value="Summarize")
        fin_speech = gr.Textbox(label="Summarized Speech", type="text", lines=10)
        run_summarization.click(
            streaming, inputs=[speech, df_state], outputs=[fin_speech]
        )
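        # Note: the button wires up the Qwen chat-based `streaming` helper; the
        # bart-large-cnn `summarization` helper above is an unused alternative path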

demo.launch()