Spaces:
Running
Running
added initial summarization
Browse files- app.py +50 -3
- poetry.lock +4 -4
- pyproject.toml +1 -0
app.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
from collections import Counter
|
|
|
2 |
import gradio as gr
|
3 |
from datasets import load_dataset
|
4 |
from nltk.util import ngrams
|
@@ -8,6 +9,10 @@ import plotly.graph_objects as go
|
|
8 |
from plotly.subplots import make_subplots
|
9 |
from matplotlib import pyplot as plt
|
10 |
from wordcloud import WordCloud
|
|
|
|
|
|
|
|
|
11 |
|
12 |
|
13 |
def load_transform_dataset():
|
@@ -25,6 +30,12 @@ def load_transform_dataset():
|
|
25 |
+ (0.5 * (len(x.split()) / len(x.split("."))))
|
26 |
- 21.43
|
27 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
# Sort the dataframe by date because Plotly doesn't do any of this automatically
|
29 |
_df = _df.sort_values(by="date")
|
30 |
_written = _df[_df["categories"] == "Written"]
|
@@ -64,6 +75,8 @@ def plotly_ngrams(n_grams, potus, _df):
|
|
64 |
trigrams = [" ".join(trigram) for trigram in trigrams]
|
65 |
# create a dataframe from the trigrams and counts
|
66 |
trigrams_df = pd.DataFrame({"trigrams": trigrams, "counts": counts})
|
|
|
|
|
67 |
fig4 = px.bar(
|
68 |
trigrams_df,
|
69 |
x="counts",
|
@@ -124,9 +137,34 @@ def plt_wordcloud(president, _df):
|
|
124 |
return fig6
|
125 |
|
126 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
127 |
# Create a Gradio interface with blocks
|
128 |
with gr.Blocks() as demo:
|
129 |
df, written, spoken = load_transform_dataset()
|
|
|
|
|
|
|
130 |
# Build out the top level static charts and content
|
131 |
gr.Markdown(
|
132 |
"""
|
@@ -208,6 +246,16 @@ with gr.Blocks() as demo:
|
|
208 |
The drop off is quite noticeable, don't you think? ;)
|
209 |
"""
|
210 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
211 |
gr.Markdown(
|
212 |
"""
|
213 |
## Dive Deeper on Each President
|
@@ -227,16 +275,15 @@ with gr.Blocks() as demo:
|
|
227 |
presidents = df["potus"].unique()
|
228 |
presidents = presidents.tolist()
|
229 |
presidents.append("All")
|
|
|
230 |
# create a dropdown to select a president
|
231 |
president = gr.Dropdown(label="Select a President", choices=presidents, value="All")
|
|
|
232 |
# create a slider for number of word grams
|
233 |
grams = gr.Slider(
|
234 |
minimum=1, maximum=4, step=1, label="N-grams", interactive=True, value=1
|
235 |
)
|
236 |
|
237 |
-
# store the dataframe in a state object before passing to plots
|
238 |
-
df_state = gr.State(df)
|
239 |
-
|
240 |
# show a bar chart of the top n-grams for a selected president
|
241 |
gr.Plot(plotly_ngrams, inputs=[grams, president, df_state])
|
242 |
|
|
|
1 |
from collections import Counter
|
2 |
+
import math
|
3 |
import gradio as gr
|
4 |
from datasets import load_dataset
|
5 |
from nltk.util import ngrams
|
|
|
9 |
from plotly.subplots import make_subplots
|
10 |
from matplotlib import pyplot as plt
|
11 |
from wordcloud import WordCloud
|
12 |
+
from huggingface_hub import InferenceClient
|
13 |
+
import matplotlib
|
14 |
+
|
15 |
+
matplotlib.use("agg")
|
16 |
|
17 |
|
18 |
def load_transform_dataset():
|
|
|
30 |
+ (0.5 * (len(x.split()) / len(x.split("."))))
|
31 |
- 21.43
|
32 |
)
|
33 |
+
# create a column that is the year the speech was given from the date column
|
34 |
+
_df["year"] = _df["date"].dt.year
|
35 |
+
# create a column that is a concatenation of the president's name, year, and category
|
36 |
+
_df["speech_key"] = (
|
37 |
+
_df["potus"] + " - " + _df["year"].astype(str) + " (" + _df["categories"] + ")"
|
38 |
+
)
|
39 |
# Sort the dataframe by date because Plotly doesn't do any of this automatically
|
40 |
_df = _df.sort_values(by="date")
|
41 |
_written = _df[_df["categories"] == "Written"]
|
|
|
75 |
trigrams = [" ".join(trigram) for trigram in trigrams]
|
76 |
# create a dataframe from the trigrams and counts
|
77 |
trigrams_df = pd.DataFrame({"trigrams": trigrams, "counts": counts})
|
78 |
+
if potus == "All":
|
79 |
+
potus = "All Presidents"
|
80 |
fig4 = px.bar(
|
81 |
trigrams_df,
|
82 |
x="counts",
|
|
|
137 |
return fig6
|
138 |
|
139 |
|
140 |
+
def summarization(speech_key, _df):
    """Summarize one speech by sending it to a hosted model in fixed-size chunks.

    The speech text is looked up in ``_df`` by its ``speech_key`` value, taken
    from the ``speech_html`` column, cut into consecutive slices of at most
    ``chunk_len`` characters, and each slice is summarized separately with the
    ``facebook/bart-large-cnn`` model through ``InferenceClient``.

    Args:
        speech_key: Value to match against the ``speech_key`` column of ``_df``.
        _df: DataFrame providing ``speech_key`` and ``speech_html`` columns.

    Returns:
        The per-chunk summaries joined by blank lines. An empty string is
        returned when no row matches ``speech_key``, when the matched speech
        is empty, or when every chunk failed to summarize.
    """
    # Maximum number of characters sent to the model per request.
    chunk_len = 4000

    matches = _df[_df["speech_key"] == speech_key]["speech_html"].values
    # Nothing selected (e.g. the dropdown is still empty) or unknown key:
    # return an empty summary instead of raising IndexError on ``[0]``.
    if len(matches) == 0:
        return ""
    speech = matches[0]
    if not speech:
        return ""
    # NOTE(review): the column name suggests this text may contain HTML
    # markup - confirm whether it should be stripped before summarizing.

    # Only create the client once we know there is something to summarize.
    client = InferenceClient(model="facebook/bart-large-cnn")

    response = []
    # Stepping by chunk_len yields every full slice plus the shorter tail,
    # so no separate "last chunk" branch is needed.
    for start in range(0, len(speech), chunk_len):
        chunk_text = speech[start : start + chunk_len]
        try:
            summarization_chunk = client.summarization(
                chunk_text, parameters={"truncation": "do_not_truncate"}
            )
        except Exception as e:
            # Best effort: report the failure and skip this chunk. Falling
            # through would either hit an undefined name (first chunk) or
            # re-append the previous chunk's summary (later chunks).
            print(f"Summarization failed for chunk starting at {start}: {e}")
            continue
        response.append(summarization_chunk.summary_text)

    return "\n\n".join(response)
|
160 |
+
|
161 |
+
|
162 |
# Create a Gradio interface with blocks
|
163 |
with gr.Blocks() as demo:
|
164 |
df, written, spoken = load_transform_dataset()
|
165 |
+
# store the dataframe in a state object before passing to component functions
|
166 |
+
df_state = gr.State(df)
|
167 |
+
|
168 |
# Build out the top level static charts and content
|
169 |
gr.Markdown(
|
170 |
"""
|
|
|
246 |
The drop off is quite noticeable, don't you think? ;)
|
247 |
"""
|
248 |
)
|
249 |
+
gr.Markdown("## Summarize a Speech")
|
250 |
+
speeches = df["speech_key"].unique()
|
251 |
+
speeches = speeches.tolist()
|
252 |
+
speech = gr.Dropdown(label="Select a Speech", choices=speeches)
|
253 |
+
# create a dropdown to select a speech from a president
|
254 |
+
run_summarization = gr.Button(value="Summarize")
|
255 |
+
fin_speech = gr.Textbox(label="Summarized Speech", type="text", lines=10)
|
256 |
+
run_summarization.click(
|
257 |
+
summarization, inputs=[speech, df_state], outputs=[fin_speech]
|
258 |
+
)
|
259 |
gr.Markdown(
|
260 |
"""
|
261 |
## Dive Deeper on Each President
|
|
|
275 |
presidents = df["potus"].unique()
|
276 |
presidents = presidents.tolist()
|
277 |
presidents.append("All")
|
278 |
+
|
279 |
# create a dropdown to select a president
|
280 |
president = gr.Dropdown(label="Select a President", choices=presidents, value="All")
|
281 |
+
# create a text area to display the summarized speech
|
282 |
# create a slider for number of word grams
|
283 |
grams = gr.Slider(
|
284 |
minimum=1, maximum=4, step=1, label="N-grams", interactive=True, value=1
|
285 |
)
|
286 |
|
|
|
|
|
|
|
287 |
# show a bar chart of the top n-grams for a selected president
|
288 |
gr.Plot(plotly_ngrams, inputs=[grams, president, df_state])
|
289 |
|
poetry.lock
CHANGED
@@ -851,13 +851,13 @@ zstd = ["zstandard (>=0.18.0)"]
|
|
851 |
|
852 |
[[package]]
|
853 |
name = "huggingface-hub"
|
854 |
-
version = "0.24.
|
855 |
description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub"
|
856 |
optional = false
|
857 |
python-versions = ">=3.8.0"
|
858 |
files = [
|
859 |
-
{file = "huggingface_hub-0.24.
|
860 |
-
{file = "huggingface_hub-0.24.
|
861 |
]
|
862 |
|
863 |
[package.dependencies]
|
@@ -2738,4 +2738,4 @@ multidict = ">=4.0"
|
|
2738 |
[metadata]
|
2739 |
lock-version = "2.0"
|
2740 |
python-versions = "^3.12"
|
2741 |
-
content-hash = "
|
|
|
851 |
|
852 |
[[package]]
|
853 |
name = "huggingface-hub"
|
854 |
+
version = "0.24.7"
|
855 |
description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub"
|
856 |
optional = false
|
857 |
python-versions = ">=3.8.0"
|
858 |
files = [
|
859 |
+
{file = "huggingface_hub-0.24.7-py3-none-any.whl", hash = "sha256:a212c555324c8a7b1ffdd07266bb7e7d69ca71aa238d27b7842d65e9a26ac3e5"},
|
860 |
+
{file = "huggingface_hub-0.24.7.tar.gz", hash = "sha256:0ad8fb756e2831da0ac0491175b960f341fe06ebcf80ed6f8728313f95fc0207"},
|
861 |
]
|
862 |
|
863 |
[package.dependencies]
|
|
|
2738 |
[metadata]
|
2739 |
lock-version = "2.0"
|
2740 |
python-versions = "^3.12"
|
2741 |
+
content-hash = "6140858cd5057fd978c2f09d1ec90bfda474a4ff6cb96ae17e67d134dae2bc4d"
|
pyproject.toml
CHANGED
@@ -14,6 +14,7 @@ nltk = "^3.9.1"
|
|
14 |
plotly = "^5.23.0"
|
15 |
matplotlib = "^3.9.2"
|
16 |
wordcloud = "^1.9.3"
|
|
|
17 |
|
18 |
[build-system]
|
19 |
requires = ["poetry-core"]
|
|
|
14 |
plotly = "^5.23.0"
|
15 |
matplotlib = "^3.9.2"
|
16 |
wordcloud = "^1.9.3"
|
17 |
+
huggingface-hub = "^0.24.7"
|
18 |
|
19 |
[build-system]
|
20 |
requires = ["poetry-core"]
|