Spaces:
Running
Running
Update pages/5 Burst Detection.py
Browse files- pages/5 Burst Detection.py +184 -88
pages/5 Burst Detection.py
CHANGED
@@ -8,10 +8,12 @@ import spacy
|
|
8 |
from burst_detection import burst_detection, enumerate_bursts, burst_weights
|
9 |
import matplotlib.pyplot as plt
|
10 |
import os
|
|
|
11 |
import math
|
12 |
import numpy as np
|
13 |
import plotly.graph_objects as go
|
14 |
from plotly.subplots import make_subplots
|
|
|
15 |
import sys
|
16 |
|
17 |
#===config===
|
@@ -46,7 +48,7 @@ st.subheader('Put your file here...', anchor=False)
|
|
46 |
|
47 |
#===clear cache===
|
48 |
def reset_all():
|
49 |
-
|
50 |
|
51 |
# Initialize NLP model
|
52 |
nlp = spacy.load("en_core_web_md")
|
@@ -144,7 +146,6 @@ def clean_data(df):
|
|
144 |
excluded_words = [word.strip() for word in excluded_words_input.split(',')]
|
145 |
|
146 |
# Identify top words, excluding specified words
|
147 |
-
#top_words = [word for word in yearly_term_frequency.sum().nlargest(top_n).index if word not in excluded_words]
|
148 |
filtered_words = [word for word in yearly_term_frequency.columns if word not in excluded_words]
|
149 |
top_words = yearly_term_frequency[filtered_words].sum().nlargest(top_n).index.tolist()
|
150 |
|
@@ -189,7 +190,7 @@ def apply_burst_detection(top_words, data):
|
|
189 |
|
190 |
num_unique_labels = len(all_bursts['label'].unique())
|
191 |
|
192 |
-
num_rows = math.ceil(top_n /
|
193 |
|
194 |
if running_total == "Running total":
|
195 |
all_freq_data = all_freq_data.cumsum()
|
@@ -199,19 +200,164 @@ def apply_burst_detection(top_words, data):
|
|
199 |
@st.cache_data(ttl=3600)
|
200 |
def convert_df(df):
|
201 |
return df.to_csv().encode("utf-8")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
202 |
|
203 |
-
# Streamlit UI for file upload
|
204 |
uploaded_file = st.file_uploader('', type=['csv', 'txt'], on_change=reset_all)
|
205 |
|
206 |
if uploaded_file is not None:
|
207 |
try:
|
208 |
-
c1, c2, c3 = st.columns([
|
209 |
-
top_n = c1.number_input("Number of top words to analyze", min_value=
|
210 |
-
|
|
|
211 |
running_total = c3.selectbox("Option for counting words",
|
212 |
("Running total", "By occurrences each year"), on_change=reset_all)
|
213 |
|
214 |
-
d1, d2 = st.columns([
|
215 |
df, coldf, MIN, MAX, GAP = load_data(uploaded_file)
|
216 |
col_name = d1.selectbox("Select column to analyze",
|
217 |
(coldf), on_change=reset_all)
|
@@ -220,9 +366,9 @@ if uploaded_file is not None:
|
|
220 |
if (GAP != 0):
|
221 |
YEAR = st.slider('Year', min_value=MIN, max_value=MAX, value=(MIN, MAX), on_change=reset_all)
|
222 |
else:
|
223 |
-
|
224 |
sys.exit(1)
|
225 |
-
|
226 |
yearly_term_frequency, top_words = clean_data(df)
|
227 |
|
228 |
bursts, freq_data, num_unique_labels, num_rows = apply_burst_detection(top_words, yearly_term_frequency)
|
@@ -238,84 +384,34 @@ if uploaded_file is not None:
|
|
238 |
st.info(f'We detect a burst on {num_unique_labels} word(s)', icon="ℹ️")
|
239 |
elif num_unique_labels < top_n:
|
240 |
st.info(f'We only detect a burst on {num_unique_labels} word(s), which is {top_n - num_unique_labels} fewer than the top word(s)', icon="ℹ️")
|
241 |
-
|
242 |
-
fig = make_subplots(rows=num_rows, cols=num_columns, subplot_titles=freq_data.columns[:top_n])
|
243 |
-
|
244 |
-
row, col = 1, 1
|
245 |
-
for i, column in enumerate(freq_data.columns[:top_n]):
|
246 |
-
fig.add_trace(go.Scatter(
|
247 |
-
x=freq_data.index, y=freq_data[column], mode='lines+markers+text', name=column,
|
248 |
-
line_shape='linear',
|
249 |
-
hoverinfo='text',
|
250 |
-
hovertext=[f"Year: {index}<br>Frequency: {freq}" for index, freq in zip(freq_data.index, freq_data[column])],
|
251 |
-
text=freq_data[column],
|
252 |
-
textposition='top center'
|
253 |
-
), row=row, col=col)
|
254 |
-
|
255 |
-
# Add area charts
|
256 |
-
for _, row_data in bursts[bursts['label'] == column].iterrows():
|
257 |
-
x_values = freq_data.index[row_data['begin']:row_data['end']+1]
|
258 |
-
y_values = freq_data[column][row_data['begin']:row_data['end']+1]
|
259 |
-
|
260 |
-
#middle_y = sum(y_values) / len(y_values)
|
261 |
-
y_post = min(freq_data[column]) + 1 if running_total == "Running total" else sum(y_values) / len(y_values)
|
262 |
-
x_offset = 0.1
|
263 |
-
|
264 |
-
# Add area chart
|
265 |
-
fig.add_trace(go.Scatter(
|
266 |
-
x=x_values,
|
267 |
-
y=y_values,
|
268 |
-
fill='tozeroy', mode='lines', fillcolor='rgba(0,100,80,0.2)',
|
269 |
-
), row=row, col=col)
|
270 |
-
|
271 |
-
align_value = "left" if running_total == "Running total" else "center"
|
272 |
-
valign_value = "bottom" if running_total == "Running total" else "middle"
|
273 |
-
|
274 |
-
# Add annotation for weight at the bottom
|
275 |
-
fig.add_annotation(
|
276 |
-
x=x_values[0] + x_offset,
|
277 |
-
y=y_post,
|
278 |
-
text=f"Weight: {row_data['weight']:.2f}",
|
279 |
-
showarrow=False,
|
280 |
-
font=dict(
|
281 |
-
color="black",
|
282 |
-
size=10
|
283 |
-
),
|
284 |
-
align=align_value,
|
285 |
-
valign=valign_value,
|
286 |
-
textangle=270,
|
287 |
-
row=row, col=col
|
288 |
-
)
|
289 |
-
|
290 |
-
col += 1
|
291 |
-
if col > num_columns:
|
292 |
-
col = 1
|
293 |
-
row += 1
|
294 |
-
|
295 |
-
fig.update_layout(
|
296 |
-
title_text="Scattertext",
|
297 |
-
showlegend=False,
|
298 |
-
height=num_rows * 400
|
299 |
-
)
|
300 |
-
|
301 |
-
st.plotly_chart(fig, theme="streamlit", use_container_width=True)
|
302 |
-
|
303 |
-
csv1 = convert_df(freq_data)
|
304 |
-
csv2 = convert_df(bursts)
|
305 |
-
|
306 |
-
e1, e2 = st.columns(2)
|
307 |
-
e1.download_button(
|
308 |
-
"Press to download list of top keywords 👈",
|
309 |
-
csv1,
|
310 |
-
"top-keywords.csv",
|
311 |
-
"text/csv")
|
312 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
313 |
e2.download_button(
|
314 |
-
|
315 |
-
|
316 |
-
|
317 |
-
|
318 |
|
|
|
|
|
|
|
|
|
|
|
|
|
319 |
with tab2:
|
320 |
st.markdown('**Kleinberg, J. (2002). Bursty and hierarchical structure in streams. Knowledge Discovery and Data Mining.** https://doi.org/10.1145/775047.775061')
|
321 |
|
@@ -324,6 +420,6 @@ if uploaded_file is not None:
|
|
324 |
st.markdown('**Domicián Máté, Ni Made Estiyanti and Novotny, A. (2024) ‘How to support innovative small firms? Bibliometric analysis and visualization of start-up incubation’, Journal of Innovation and Entrepreneurship, 13(1).** https://doi.org/10.1186/s13731-024-00361-z')
|
325 |
st.markdown('**Lamba, M., Madhusudhan, M. (2022). Burst Detection. In: Text Mining for Information Professionals. Springer, Cham.** https://doi.org/10.1007/978-3-030-85085-2_6')
|
326 |
|
327 |
-
except
|
328 |
-
st.error("
|
329 |
-
|
|
|
8 |
from burst_detection import burst_detection, enumerate_bursts, burst_weights
|
9 |
import matplotlib.pyplot as plt
|
10 |
import os
|
11 |
+
import io
|
12 |
import math
|
13 |
import numpy as np
|
14 |
import plotly.graph_objects as go
|
15 |
from plotly.subplots import make_subplots
|
16 |
+
import plotly.io as pio
|
17 |
import sys
|
18 |
|
19 |
#===config===
|
|
|
48 |
|
49 |
#===clear cache===
|
50 |
def reset_all():
    """Flush every @st.cache_data entry; wired to each widget's on_change."""
    st.cache_data.clear()
|
52 |
|
53 |
# Initialize NLP model
|
54 |
nlp = spacy.load("en_core_web_md")
|
|
|
146 |
excluded_words = [word.strip() for word in excluded_words_input.split(',')]
|
147 |
|
148 |
# Identify top words, excluding specified words
|
|
|
149 |
filtered_words = [word for word in yearly_term_frequency.columns if word not in excluded_words]
|
150 |
top_words = yearly_term_frequency[filtered_words].sum().nlargest(top_n).index.tolist()
|
151 |
|
|
|
190 |
|
191 |
num_unique_labels = len(all_bursts['label'].unique())
|
192 |
|
193 |
+
num_rows = math.ceil(top_n / 2)
|
194 |
|
195 |
if running_total == "Running total":
|
196 |
all_freq_data = all_freq_data.cumsum()
|
|
|
200 |
@st.cache_data(ttl=3600)
def convert_df(df):
    """Serialise *df* to UTF-8 CSV bytes for a download button (cached 1 h)."""
    csv_text = df.to_csv()
    return csv_text.encode("utf-8")
|
203 |
+
|
204 |
+
@st.cache_data(ttl=3600)
def scattervis(bursts, freq_data):
    """Render the burst "heat grid": one row per top word, one column per year.

    Blue squares encode the yearly frequency (value printed in the cell);
    red squares are overlaid on years inside a detected burst, with the
    burst weight on hover.  The figure is written to ``scatter_plot.png``
    (shown inline via st.image) and ``result.png`` (scale=4, consumed by the
    high-resolution download button elsewhere in the app).

    NOTE(review): mutates ``freq_data`` (reset_index/rename) and ``bursts``
    (begin/end remapped from positions to years) in place, so downstream CSV
    exports see the mutated frames — confirm this is intended.
    """
    # Move the year index into a regular column so the frame can be melted.
    freq_data.reset_index(inplace=True)
    freq_data.rename(columns={"index": "Year"}, inplace=True)

    # Long format: one (Year, Category, Value) row per non-zero frequency.
    freq_data_melted = freq_data.melt(id_vars=["Year"], var_name="Category", value_name="Value")
    freq_data_melted = freq_data_melted[freq_data_melted["Value"] > 0]
    wordlist = freq_data_melted["Category"].unique()

    # burst_detection returns positional indices; map them to calendar years,
    # clamping to the final year.  (The previous guard returned None for
    # out-of-range indices, which then raised TypeError in the range() call
    # below — clamping unconditionally fixes that edge case and is identical
    # for all in-range indices.)
    years = freq_data["Year"].tolist()
    last = len(years) - 1
    bursts["begin"] = bursts["begin"].apply(lambda x: years[min(x, last)])
    bursts["end"] = bursts["end"].apply(lambda x: years[min(x, last)])

    # One (year, word, weight) point per year covered by each burst.
    burst_points = []
    for _, row in bursts.iterrows():
        for year in range(row["begin"], row["end"] + 1):
            burst_points.append((year, row["label"], row["weight"]))

    burst_points_df = pd.DataFrame(burst_points, columns=["Year", "Category", "Weight"])

    fig = go.Figure()

    # Red squares marking burst periods (hover text = burst weight).
    fig.add_trace(go.Scatter(
        x=burst_points_df["Year"],
        y=burst_points_df["Category"],
        mode='markers',
        marker=dict(
            symbol='square',
            size=40,
            color='red',
            opacity=0.5),
        hoverinfo='text',
        text=burst_points_df["Weight"],
        showlegend=False
    ))

    # Blue squares with the raw frequency printed inside each cell.
    fig.add_trace(go.Scatter(
        x=freq_data_melted["Year"],
        y=freq_data_melted["Category"],
        mode='markers+text',
        marker=dict(
            symbol='square',
            size=30,
            color=freq_data_melted["Value"],
            colorscale='Blues',
            showscale=False),
        text=freq_data_melted["Value"],
        textposition="middle center",
        textfont=dict(
            size=16,
            # White text on dark (high-value) cells, black on light ones.
            color=['white' if value > freq_data_melted["Value"].max()/2 else 'black' for value in freq_data_melted["Value"]])
    ))

    min_year = min(years)
    max_year = max(years)

    fig.update_layout(
        xaxis=dict(tickmode='linear', dtick=1, range=[(min_year-1), (max_year+1)], tickfont=dict(size=16), automargin=True, showgrid=False, zeroline=False),
        yaxis=dict(tickvals=wordlist, ticktext=wordlist, tickmode='array', tickfont=dict(size=16), automargin=True, showgrid=False, zeroline=False),
        plot_bgcolor='white',
        paper_bgcolor='white',
        showlegend=False,
        margin=dict(l=1, r=1, t=1, b=1),
        # Cell-proportional canvas: ~50 px per word row, ~52 px per year column.
        height=top_n*50+2,
        width=(max_year-min_year)*52+100,
        autosize=False
    )

    fig.write_image("scatter_plot.png")
    st.image("scatter_plot.png")
    pio.write_image(fig, 'result.png', scale=4)
|
277 |
+
|
278 |
+
@st.cache_data(ttl=3600)
def linegraph(bursts, freq_data):
    """Draw one line chart per top word in a two-column subplot grid.

    Each subplot shows the word's yearly frequency; detected burst spans are
    shaded as area fills with a rotated "Weight: …" annotation.  The figure
    is saved to ``line_graph.png`` (shown inline) and ``result.png``
    (scale=4, for the high-resolution download button).

    Relies on the module-level ``num_rows``, ``top_n`` and ``running_total``
    values set in the main script flow.
    """
    fig = make_subplots(rows=num_rows, cols=2, subplot_titles=freq_data.columns[:top_n])

    cur_row, cur_col = 1, 1
    for i, word in enumerate(freq_data.columns[:top_n]):
        series = freq_data[word]

        # Frequency line with per-point labels and year/frequency hover text.
        fig.add_trace(go.Scatter(
            x=freq_data.index, y=series, mode='lines+markers+text', name=word,
            line_shape='linear',
            hoverinfo='text',
            hovertext=[f"Year: {yr}<br>Frequency: {freq}" for yr, freq in zip(freq_data.index, series)],
            text=series,
            textposition='top center'
        ), row=cur_row, col=cur_col)

        # Shade each burst interval for this word.
        for _, burst in bursts[bursts['label'] == word].iterrows():
            span_x = freq_data.index[burst['begin']:burst['end'] + 1]
            span_y = series[burst['begin']:burst['end'] + 1]

            #middle_y = sum(span_y) / len(span_y)
            y_post = min(series) + 1 if running_total == "Running total" else sum(span_y) / len(span_y)
            x_offset = 0.1

            # Area fill over the burst span.
            fig.add_trace(go.Scatter(
                x=span_x,
                y=span_y,
                fill='tozeroy', mode='lines', fillcolor='rgba(0,100,80,0.2)',
            ), row=cur_row, col=cur_col)

            if running_total == "Running total":
                align_value, valign_value = "left", "bottom"
            else:
                align_value, valign_value = "center", "middle"

            # Vertical weight label near the start of the burst span.
            fig.add_annotation(
                x=span_x[0] + x_offset,
                y=y_post,
                text=f"Weight: {burst['weight']:.2f}",
                showarrow=False,
                font=dict(
                    color="black",
                    size=12),
                align=align_value,
                valign=valign_value,
                textangle=270,
                row=cur_row, col=cur_col
            )

        # Advance left-to-right, top-to-bottom through the 2-column grid.
        cur_col += 1
        if cur_col > 2:
            cur_col = 1
            cur_row += 1

    fig.update_layout(
        showlegend=False,
        margin=dict(l=20, r=20, t=100, b=20),
        height=num_rows * 500,
        width=1500
    )

    fig.write_image("line_graph.png")
    st.image("line_graph.png")
    pio.write_image(fig, 'result.png', scale=4)
|
342 |
+
|
343 |
+
@st.cache_data(ttl=3600)
def download_result(freq_data, bursts):
    """Return (frequency CSV bytes, bursts CSV bytes) for the download buttons."""
    return convert_df(freq_data), convert_df(bursts)
|
348 |
|
|
|
349 |
uploaded_file = st.file_uploader('', type=['csv', 'txt'], on_change=reset_all)
|
350 |
|
351 |
if uploaded_file is not None:
|
352 |
try:
|
353 |
+
c1, c2, c3 = st.columns([3,3.5,3.5])
|
354 |
+
top_n = c1.number_input("Number of top words to analyze", min_value=5, value=10, step=1, on_change=reset_all)
|
355 |
+
viz_selected = c2.selectbox("Option for visualization",
|
356 |
+
("Line graph", "Scatter plot"), on_change=reset_all)
|
357 |
running_total = c3.selectbox("Option for counting words",
|
358 |
("Running total", "By occurrences each year"), on_change=reset_all)
|
359 |
|
360 |
+
d1, d2 = st.columns([3,7])
|
361 |
df, coldf, MIN, MAX, GAP = load_data(uploaded_file)
|
362 |
col_name = d1.selectbox("Select column to analyze",
|
363 |
(coldf), on_change=reset_all)
|
|
|
366 |
if (GAP != 0):
|
367 |
YEAR = st.slider('Year', min_value=MIN, max_value=MAX, value=(MIN, MAX), on_change=reset_all)
|
368 |
else:
|
369 |
+
e1.write('You only have data in ', (MAX))
|
370 |
sys.exit(1)
|
371 |
+
|
372 |
yearly_term_frequency, top_words = clean_data(df)
|
373 |
|
374 |
bursts, freq_data, num_unique_labels, num_rows = apply_burst_detection(top_words, yearly_term_frequency)
|
|
|
384 |
st.info(f'We detect a burst on {num_unique_labels} word(s)', icon="ℹ️")
|
385 |
elif num_unique_labels < top_n:
|
386 |
st.info(f'We only detect a burst on {num_unique_labels} word(s), which is {top_n - num_unique_labels} fewer than the top word(s)', icon="ℹ️")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
387 |
|
388 |
+
if viz_selected == "Line graph":
|
389 |
+
linegraph(bursts, freq_data)
|
390 |
+
|
391 |
+
elif viz_selected =="Scatter plot":
|
392 |
+
scattervis(bursts, freq_data)
|
393 |
+
|
394 |
+
csv1, csv2 = download_result(freq_data, bursts)
|
395 |
+
e1, e2, e3 = st.columns(3)
|
396 |
+
with open('result.png', "rb") as file:
|
397 |
+
btn = e1.download_button(
|
398 |
+
label="📊 Download high resolution image",
|
399 |
+
data=file,
|
400 |
+
file_name="burst.png",
|
401 |
+
mime="image/png")
|
402 |
+
|
403 |
e2.download_button(
|
404 |
+
"👉 Press to download list of top words",
|
405 |
+
csv1,
|
406 |
+
"top-keywords.csv",
|
407 |
+
"text/csv")
|
408 |
|
409 |
+
e3.download_button(
|
410 |
+
"👉 Press to download the list of detected bursts",
|
411 |
+
csv2,
|
412 |
+
"burst.csv",
|
413 |
+
"text/csv")
|
414 |
+
|
415 |
with tab2:
|
416 |
st.markdown('**Kleinberg, J. (2002). Bursty and hierarchical structure in streams. Knowledge Discovery and Data Mining.** https://doi.org/10.1145/775047.775061')
|
417 |
|
|
|
420 |
st.markdown('**Domicián Máté, Ni Made Estiyanti and Novotny, A. (2024) ‘How to support innovative small firms? Bibliometric analysis and visualization of start-up incubation’, Journal of Innovation and Entrepreneurship, 13(1).** https://doi.org/10.1186/s13731-024-00361-z')
|
421 |
st.markdown('**Lamba, M., Madhusudhan, M. (2022). Burst Detection. In: Text Mining for Information Professionals. Springer, Cham.** https://doi.org/10.1007/978-3-030-85085-2_6')
|
422 |
|
423 |
+
except:
|
424 |
+
st.error("Please ensure that your file is correct. Please contact us if you find that this is an error.", icon="🚨")
|
425 |
+
st.stop()
|