jsulz HF staff committed on
Commit
1fd02a5
1 Parent(s): 68b374e

trying something out

Browse files
Files changed (2) hide show
  1. app.py +46 -33
  2. requirements.txt +1 -1
app.py CHANGED
@@ -11,6 +11,7 @@ sotu_dataset = "jsulz/state-of-the-union-addresses"
11
  dataset = load_dataset(sotu_dataset)
12
  df = dataset["train"].to_pandas()
13
  # decode the tokens-nostop column from a byte array to a list of strings
 
14
  df["tokens-nostop"] = df["tokens-nostop"].apply(
15
  lambda x: x.decode("utf-8")
16
  .replace('"', "")
@@ -18,6 +19,7 @@ df["tokens-nostop"] = df["tokens-nostop"].apply(
18
  .replace("]", "")
19
  .split(",")
20
  )
 
21
  df["word_count"] = df["speech_html"].apply(lambda x: len(x.split()))
22
  # calculate the automated readability index reading ease score for each address
23
  # automated readability index = 4.71 * (characters/words) + 0.5 * (words/sentences) - 21.43
@@ -101,40 +103,51 @@ with gr.Blocks() as demo:
101
  )
102
 
103
  with gr.Row():
 
104
 
105
- @gr.render(inputs=[president, grams])
106
- def ngram_bar(potus, n_grams):
107
- if potus != "All" and potus is not None:
108
- if type(n_grams) is not int:
109
- n_grams = 1
110
- print(n_grams)
111
- # create a Counter object from the trigrams
112
- potus_df = df[df["potus"] == potus]
113
- # decode the tokens-nostop column from a byte array to a list of string
114
- trigrams = (
115
- potus_df["tokens-nostop"]
116
- .apply(lambda x: list(ngrams(x, n_grams)))
117
- .apply(Counter)
118
- .sum()
119
- )
120
- # get the most common trigrams
121
- common_trigrams = trigrams.most_common(20)
122
- # unzip the list of tuples and plot the trigrams and counts as a bar chart
123
- trigrams, counts = zip(*common_trigrams)
124
- # join the trigrams into a single string
125
- trigrams = [" ".join(trigram) for trigram in trigrams]
126
- # create a dataframe from the trigrams and counts
127
- trigrams_df = pd.DataFrame({"trigrams": trigrams, "counts": counts})
128
- # plot the trigrams and counts as a bar chart from matplotlib
129
- fig, ax = plt.subplots(figsize=(12, 4))
130
- ax.barh(trigrams_df["trigrams"], trigrams_df["counts"])
131
- ax.set_title("Top 20 Trigrams")
132
- ax.set_ylabel("Count")
133
- ax.set_xlabel("Trigrams")
134
- plt.xticks(rotation=45)
135
- # make it tight layout
136
- plt.tight_layout()
137
- gr.Plot(value=fig, container=True)
 
 
 
 
 
 
 
 
 
 
138
 
139
 
140
  demo.launch()
 
11
  dataset = load_dataset(sotu_dataset)
12
  df = dataset["train"].to_pandas()
13
  # decode the tokens-nostop column from a byte array to a list of strings
14
+ """
15
  df["tokens-nostop"] = df["tokens-nostop"].apply(
16
  lambda x: x.decode("utf-8")
17
  .replace('"', "")
 
19
  .replace("]", "")
20
  .split(",")
21
  )
22
+ """
23
  df["word_count"] = df["speech_html"].apply(lambda x: len(x.split()))
24
  # calculate the automated readability index reading ease score for each address
25
  # automated readability index = 4.71 * (characters/words) + 0.5 * (words/sentences) - 21.43
 
103
  )
104
 
105
  with gr.Row():
106
+ with gr.Column():
107
 
108
+ @gr.render(inputs=[president, grams])
109
+ def ngram_bar(potus, n_grams):
110
+ if potus != "All" and potus is not None:
111
+ if type(n_grams) is not int:
112
+ n_grams = 1
113
+ print(n_grams)
114
+ # create a Counter object from the trigrams
115
+ potus_df = df[df["potus"] == potus]
116
+ # decode the tokens-nostop column from a byte array to a list of string
117
+ trigrams = (
118
+ potus_df["tokens-nostop"]
119
+ .apply(lambda x: list(ngrams(x, n_grams)))
120
+ .apply(Counter)
121
+ .sum()
122
+ )
123
+ # get the most common trigrams
124
+ common_trigrams = trigrams.most_common(20)
125
+ # unzip the list of tuples and plot the trigrams and counts as a bar chart
126
+ trigrams, counts = zip(*common_trigrams)
127
+ # join the trigrams into a single string
128
+ trigrams = [" ".join(trigram) for trigram in trigrams]
129
+ # create a dataframe from the trigrams and counts
130
+ trigrams_df = pd.DataFrame({"trigrams": trigrams, "counts": counts})
131
+ # plot the trigrams and counts as a bar chart from matplotlib
132
+ """
133
+ fig, ax = plt.subplots(figsize=(12, 4))
134
+ ax.barh(trigrams_df["trigrams"], trigrams_df["counts"])
135
+ ax.set_title("Top 20 Trigrams")
136
+ ax.set_ylabel("Count")
137
+ ax.set_xlabel("Trigrams")
138
+ plt.xticks(rotation=45)
139
+ # make it tight layout
140
+ plt.tight_layout()
141
+ """
142
+ fig = px.scatter(
143
+ trigrams_df,
144
+ x="counts",
145
+ y="trigrams",
146
+ title="Top 20 Trigrams",
147
+ orientation="h",
148
+ )
149
+ print(fig)
150
+ gr.Plot(value=fig, container=True)
151
 
152
 
153
  demo.launch()
requirements.txt CHANGED
@@ -42,7 +42,7 @@ orjson==3.10.7
42
  packaging==24.1
43
  pandas==2.2.2
44
  pillow==10.4.0
45
- plotly==5.23.0
46
  pyarrow==17.0.0
47
  pydantic-core==2.20.1
48
  pydantic==2.8.2
 
42
  packaging==24.1
43
  pandas==2.2.2
44
  pillow==10.4.0
45
+ plotly
46
  pyarrow==17.0.0
47
  pydantic-core==2.20.1
48
  pydantic==2.8.2