Spaces:
Running
Running
trying something out
Browse files- app.py +46 -33
- requirements.txt +1 -1
app.py
CHANGED
@@ -11,6 +11,7 @@ sotu_dataset = "jsulz/state-of-the-union-addresses"
|
|
11 |
dataset = load_dataset(sotu_dataset)
|
12 |
df = dataset["train"].to_pandas()
|
13 |
# decode the tokens-nostop column from a byte array to a list of strings
|
|
|
14 |
df["tokens-nostop"] = df["tokens-nostop"].apply(
|
15 |
lambda x: x.decode("utf-8")
|
16 |
.replace('"', "")
|
@@ -18,6 +19,7 @@ df["tokens-nostop"] = df["tokens-nostop"].apply(
|
|
18 |
.replace("]", "")
|
19 |
.split(",")
|
20 |
)
|
|
|
21 |
df["word_count"] = df["speech_html"].apply(lambda x: len(x.split()))
|
22 |
# calculate the automated readability index (ARI) score for each address
|
23 |
# automated readability index = 4.71 * (characters/words) + 0.5 * (words/sentences) - 21.43
|
@@ -101,40 +103,51 @@ with gr.Blocks() as demo:
|
|
101 |
)
|
102 |
|
103 |
with gr.Row():
|
|
|
104 |
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
138 |
|
139 |
|
140 |
demo.launch()
|
|
|
11 |
dataset = load_dataset(sotu_dataset)
|
12 |
df = dataset["train"].to_pandas()
|
13 |
# decode the tokens-nostop column from a byte array to a list of strings
|
14 |
+
"""
|
15 |
df["tokens-nostop"] = df["tokens-nostop"].apply(
|
16 |
lambda x: x.decode("utf-8")
|
17 |
.replace('"', "")
|
|
|
19 |
.replace("]", "")
|
20 |
.split(",")
|
21 |
)
|
22 |
+
"""
|
23 |
df["word_count"] = df["speech_html"].apply(lambda x: len(x.split()))
|
24 |
# calculate the automated readability index (ARI) score for each address
|
25 |
# automated readability index = 4.71 * (characters/words) + 0.5 * (words/sentences) - 21.43
|
|
|
103 |
)
|
104 |
|
105 |
with gr.Row():
|
106 |
+
with gr.Column():
|
107 |
|
108 |
+
@gr.render(inputs=[president, grams])
|
109 |
+
def ngram_bar(potus, n_grams):
|
110 |
+
if potus != "All" and potus is not None:
|
111 |
+
if type(n_grams) is not int:
|
112 |
+
n_grams = 1
|
113 |
+
print(n_grams)
|
114 |
+
# keep only the addresses given by the selected president
|
115 |
+
potus_df = df[df["potus"] == potus]
|
116 |
+
# build the n-grams for each address, count them, and sum the counts across addresses
|
117 |
+
trigrams = (
|
118 |
+
potus_df["tokens-nostop"]
|
119 |
+
.apply(lambda x: list(ngrams(x, n_grams)))
|
120 |
+
.apply(Counter)
|
121 |
+
.sum()
|
122 |
+
)
|
123 |
+
# get the most common trigrams
|
124 |
+
common_trigrams = trigrams.most_common(20)
|
125 |
+
# unzip the list of tuples and plot the trigrams and counts as a bar chart
|
126 |
+
trigrams, counts = zip(*common_trigrams)
|
127 |
+
# join the trigrams into a single string
|
128 |
+
trigrams = [" ".join(trigram) for trigram in trigrams]
|
129 |
+
# create a dataframe from the trigrams and counts
|
130 |
+
trigrams_df = pd.DataFrame({"trigrams": trigrams, "counts": counts})
|
131 |
+
# plot the trigrams and counts as a bar chart from matplotlib
|
132 |
+
"""
|
133 |
+
fig, ax = plt.subplots(figsize=(12, 4))
|
134 |
+
ax.barh(trigrams_df["trigrams"], trigrams_df["counts"])
|
135 |
+
ax.set_title("Top 20 Trigrams")
|
136 |
+
ax.set_ylabel("Count")
|
137 |
+
ax.set_xlabel("Trigrams")
|
138 |
+
plt.xticks(rotation=45)
|
139 |
+
# make it tight layout
|
140 |
+
plt.tight_layout()
|
141 |
+
"""
|
142 |
+
fig = px.scatter(
|
143 |
+
trigrams_df,
|
144 |
+
x="counts",
|
145 |
+
y="trigrams",
|
146 |
+
title="Top 20 Trigrams",
|
147 |
+
orientation="h",
|
148 |
+
)
|
149 |
+
print(fig)
|
150 |
+
gr.Plot(value=fig, container=True)
|
151 |
|
152 |
|
153 |
demo.launch()
|
requirements.txt
CHANGED
@@ -42,7 +42,7 @@ orjson==3.10.7
|
|
42 |
packaging==24.1
|
43 |
pandas==2.2.2
|
44 |
pillow==10.4.0
|
45 |
-
plotly
|
46 |
pyarrow==17.0.0
|
47 |
pydantic-core==2.20.1
|
48 |
pydantic==2.8.2
|
|
|
42 |
packaging==24.1
|
43 |
pandas==2.2.2
|
44 |
pillow==10.4.0
|
45 |
+
plotly
|
46 |
pyarrow==17.0.0
|
47 |
pydantic-core==2.20.1
|
48 |
pydantic==2.8.2
|