jsulz HF staff committed on
Commit
939697d
1 Parent(s): 6c9d6d5

added initial summarization

Browse files
Files changed (3) hide show
  1. app.py +50 -3
  2. poetry.lock +4 -4
  3. pyproject.toml +1 -0
app.py CHANGED
@@ -1,4 +1,5 @@
1
  from collections import Counter
 
2
  import gradio as gr
3
  from datasets import load_dataset
4
  from nltk.util import ngrams
@@ -8,6 +9,10 @@ import plotly.graph_objects as go
8
  from plotly.subplots import make_subplots
9
  from matplotlib import pyplot as plt
10
  from wordcloud import WordCloud
 
 
 
 
11
 
12
 
13
  def load_transform_dataset():
@@ -25,6 +30,12 @@ def load_transform_dataset():
25
  + (0.5 * (len(x.split()) / len(x.split("."))))
26
  - 21.43
27
  )
 
 
 
 
 
 
28
  # Sort the dataframe by date because Plotly doesn't do any of this automatically
29
  _df = _df.sort_values(by="date")
30
  _written = _df[_df["categories"] == "Written"]
@@ -64,6 +75,8 @@ def plotly_ngrams(n_grams, potus, _df):
64
  trigrams = [" ".join(trigram) for trigram in trigrams]
65
  # create a dataframe from the trigrams and counts
66
  trigrams_df = pd.DataFrame({"trigrams": trigrams, "counts": counts})
 
 
67
  fig4 = px.bar(
68
  trigrams_df,
69
  x="counts",
@@ -124,9 +137,34 @@ def plt_wordcloud(president, _df):
124
  return fig6
125
 
126
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
  # Create a Gradio interface with blocks
128
  with gr.Blocks() as demo:
129
  df, written, spoken = load_transform_dataset()
 
 
 
130
  # Build out the top level static charts and content
131
  gr.Markdown(
132
  """
@@ -208,6 +246,16 @@ with gr.Blocks() as demo:
208
  The drop off is quite noticeable, don't you think? ;)
209
  """
210
  )
 
 
 
 
 
 
 
 
 
 
211
  gr.Markdown(
212
  """
213
  ## Dive Deeper on Each President
@@ -227,16 +275,15 @@ with gr.Blocks() as demo:
227
  presidents = df["potus"].unique()
228
  presidents = presidents.tolist()
229
  presidents.append("All")
 
230
  # create a dropdown to select a president
231
  president = gr.Dropdown(label="Select a President", choices=presidents, value="All")
 
232
  # create a slider for number of word grams
233
  grams = gr.Slider(
234
  minimum=1, maximum=4, step=1, label="N-grams", interactive=True, value=1
235
  )
236
 
237
- # store the dataframe in a state object before passing to plots
238
- df_state = gr.State(df)
239
-
240
  # show a bar chart of the top n-grams for a selected president
241
  gr.Plot(plotly_ngrams, inputs=[grams, president, df_state])
242
 
 
1
  from collections import Counter
2
+ import math
3
  import gradio as gr
4
  from datasets import load_dataset
5
  from nltk.util import ngrams
 
9
  from plotly.subplots import make_subplots
10
  from matplotlib import pyplot as plt
11
  from wordcloud import WordCloud
12
+ from huggingface_hub import InferenceClient
13
+ import matplotlib
14
+
15
+ matplotlib.use("agg")
16
 
17
 
18
  def load_transform_dataset():
 
30
  + (0.5 * (len(x.split()) / len(x.split("."))))
31
  - 21.43
32
  )
33
+ # create a column that is the year the speech was given from the date column
34
+ _df["year"] = _df["date"].dt.year
35
+ # create a column that is a concatenation of the president's name, year, and category
36
+ _df["speech_key"] = (
37
+ _df["potus"] + " - " + _df["year"].astype(str) + " (" + _df["categories"] + ")"
38
+ )
39
  # Sort the dataframe by date because Plotly doesn't do any of this automatically
40
  _df = _df.sort_values(by="date")
41
  _written = _df[_df["categories"] == "Written"]
 
75
  trigrams = [" ".join(trigram) for trigram in trigrams]
76
  # create a dataframe from the trigrams and counts
77
  trigrams_df = pd.DataFrame({"trigrams": trigrams, "counts": counts})
78
+ if potus == "All":
79
+ potus = "All Presidents"
80
  fig4 = px.bar(
81
  trigrams_df,
82
  x="counts",
 
137
  return fig6
138
 
139
 
140
+ def summarization(speech_key, _df):
141
+ client = InferenceClient(model="facebook/bart-large-cnn")
142
+ chunk_len = 4000
143
+ speech = _df[_df["speech_key"] == speech_key]["speech_html"].values[0]
144
+ sotu_chunks = int(math.ceil(len(speech) / chunk_len))
145
+ response = []
146
+ for chunk in range(1, sotu_chunks + 1):
147
+ if chunk * 4000 < len(speech):
148
+ chunk_text = speech[(chunk - 1) * chunk_len : chunk * chunk_len]
149
+ else:
150
+ chunk_text = speech[(chunk - 1) * chunk_len :]
151
+ try:
152
+ summarization_chunk = client.summarization(
153
+ chunk_text, parameters={"truncation": "do_not_truncate"}
154
+ )
155
+ except Exception as e:
156
+ print(e)
157
+ response.append(summarization_chunk.summary_text)
158
+
159
+ return "\n\n".join(response)
160
+
161
+
162
  # Create a Gradio interface with blocks
163
  with gr.Blocks() as demo:
164
  df, written, spoken = load_transform_dataset()
165
+ # store the dataframe in a state object before passing to component functions
166
+ df_state = gr.State(df)
167
+
168
  # Build out the top level static charts and content
169
  gr.Markdown(
170
  """
 
246
  The drop off is quite noticeable, don't you think? ;)
247
  """
248
  )
249
+ gr.Markdown("## Summarize a Speech")
250
+ speeches = df["speech_key"].unique()
251
+ speeches = speeches.tolist()
252
+ speech = gr.Dropdown(label="Select a Speech", choices=speeches)
253
+ # create a dropdown to select a speech from a president
254
+ run_summarization = gr.Button(value="Summarize")
255
+ fin_speech = gr.Textbox(label="Summarized Speech", type="text", lines=10)
256
+ run_summarization.click(
257
+ summarization, inputs=[speech, df_state], outputs=[fin_speech]
258
+ )
259
  gr.Markdown(
260
  """
261
  ## Dive Deeper on Each President
 
275
  presidents = df["potus"].unique()
276
  presidents = presidents.tolist()
277
  presidents.append("All")
278
+
279
  # create a dropdown to select a president
280
  president = gr.Dropdown(label="Select a President", choices=presidents, value="All")
281
+ # create a text area to display the summarized speech
282
  # create a slider for number of word grams
283
  grams = gr.Slider(
284
  minimum=1, maximum=4, step=1, label="N-grams", interactive=True, value=1
285
  )
286
 
 
 
 
287
  # show a bar chart of the top n-grams for a selected president
288
  gr.Plot(plotly_ngrams, inputs=[grams, president, df_state])
289
 
poetry.lock CHANGED
@@ -851,13 +851,13 @@ zstd = ["zstandard (>=0.18.0)"]
851
 
852
  [[package]]
853
  name = "huggingface-hub"
854
- version = "0.24.6"
855
  description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub"
856
  optional = false
857
  python-versions = ">=3.8.0"
858
  files = [
859
- {file = "huggingface_hub-0.24.6-py3-none-any.whl", hash = "sha256:a990f3232aa985fe749bc9474060cbad75e8b2f115f6665a9fda5b9c97818970"},
860
- {file = "huggingface_hub-0.24.6.tar.gz", hash = "sha256:cc2579e761d070713eaa9c323e3debe39d5b464ae3a7261c39a9195b27bb8000"},
861
  ]
862
 
863
  [package.dependencies]
@@ -2738,4 +2738,4 @@ multidict = ">=4.0"
2738
  [metadata]
2739
  lock-version = "2.0"
2740
  python-versions = "^3.12"
2741
- content-hash = "8542704f2fdef8c09d10c94785620326a8e2c72112368ee6f2e25fa45aeeb75a"
 
851
 
852
  [[package]]
853
  name = "huggingface-hub"
854
+ version = "0.24.7"
855
  description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub"
856
  optional = false
857
  python-versions = ">=3.8.0"
858
  files = [
859
+ {file = "huggingface_hub-0.24.7-py3-none-any.whl", hash = "sha256:a212c555324c8a7b1ffdd07266bb7e7d69ca71aa238d27b7842d65e9a26ac3e5"},
860
+ {file = "huggingface_hub-0.24.7.tar.gz", hash = "sha256:0ad8fb756e2831da0ac0491175b960f341fe06ebcf80ed6f8728313f95fc0207"},
861
  ]
862
 
863
  [package.dependencies]
 
2738
  [metadata]
2739
  lock-version = "2.0"
2740
  python-versions = "^3.12"
2741
+ content-hash = "6140858cd5057fd978c2f09d1ec90bfda474a4ff6cb96ae17e67d134dae2bc4d"
pyproject.toml CHANGED
@@ -14,6 +14,7 @@ nltk = "^3.9.1"
14
  plotly = "^5.23.0"
15
  matplotlib = "^3.9.2"
16
  wordcloud = "^1.9.3"
 
17
 
18
  [build-system]
19
  requires = ["poetry-core"]
 
14
  plotly = "^5.23.0"
15
  matplotlib = "^3.9.2"
16
  wordcloud = "^1.9.3"
17
+ huggingface-hub = "^0.24.7"
18
 
19
  [build-system]
20
  requires = ["poetry-core"]