faizhalas committed on
Commit
33a4df1
1 Parent(s): cd16a9f

Update pages/5 Burst Detection.py

Browse files
Files changed (1) hide show
  1. pages/5 Burst Detection.py +184 -88
pages/5 Burst Detection.py CHANGED
@@ -8,10 +8,12 @@ import spacy
8
  from burst_detection import burst_detection, enumerate_bursts, burst_weights
9
  import matplotlib.pyplot as plt
10
  import os
 
11
  import math
12
  import numpy as np
13
  import plotly.graph_objects as go
14
  from plotly.subplots import make_subplots
 
15
  import sys
16
 
17
  #===config===
@@ -46,7 +48,7 @@ st.subheader('Put your file here...', anchor=False)
46
 
47
  #===clear cache===
48
  def reset_all():
49
- st.cache_data.clear()
50
 
51
  # Initialize NLP model
52
  nlp = spacy.load("en_core_web_md")
@@ -144,7 +146,6 @@ def clean_data(df):
144
  excluded_words = [word.strip() for word in excluded_words_input.split(',')]
145
 
146
  # Identify top words, excluding specified words
147
- #top_words = [word for word in yearly_term_frequency.sum().nlargest(top_n).index if word not in excluded_words]
148
  filtered_words = [word for word in yearly_term_frequency.columns if word not in excluded_words]
149
  top_words = yearly_term_frequency[filtered_words].sum().nlargest(top_n).index.tolist()
150
 
@@ -189,7 +190,7 @@ def apply_burst_detection(top_words, data):
189
 
190
  num_unique_labels = len(all_bursts['label'].unique())
191
 
192
- num_rows = math.ceil(top_n / num_columns)
193
 
194
  if running_total == "Running total":
195
  all_freq_data = all_freq_data.cumsum()
@@ -199,19 +200,164 @@ def apply_burst_detection(top_words, data):
199
  @st.cache_data(ttl=3600)
200
  def convert_df(df):
201
  return df.to_csv().encode("utf-8")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
 
203
- # Streamlit UI for file upload
204
  uploaded_file = st.file_uploader('', type=['csv', 'txt'], on_change=reset_all)
205
 
206
  if uploaded_file is not None:
207
  try:
208
- c1, c2, c3 = st.columns([4,4,2])
209
- top_n = c1.number_input("Number of top words to analyze", min_value=1, value=9, step=1, on_change=reset_all)
210
- num_columns = c2.number_input("Number of columns for visualization", min_value=1, value=3, step=1, on_change=reset_all)
 
211
  running_total = c3.selectbox("Option for counting words",
212
  ("Running total", "By occurrences each year"), on_change=reset_all)
213
 
214
- d1, d2 = st.columns([4,6])
215
  df, coldf, MIN, MAX, GAP = load_data(uploaded_file)
216
  col_name = d1.selectbox("Select column to analyze",
217
  (coldf), on_change=reset_all)
@@ -220,9 +366,9 @@ if uploaded_file is not None:
220
  if (GAP != 0):
221
  YEAR = st.slider('Year', min_value=MIN, max_value=MAX, value=(MIN, MAX), on_change=reset_all)
222
  else:
223
- st.write('You only have data in ', (MAX))
224
  sys.exit(1)
225
-
226
  yearly_term_frequency, top_words = clean_data(df)
227
 
228
  bursts, freq_data, num_unique_labels, num_rows = apply_burst_detection(top_words, yearly_term_frequency)
@@ -238,84 +384,34 @@ if uploaded_file is not None:
238
  st.info(f'We detect a burst on {num_unique_labels} word(s)', icon="ℹ️")
239
  elif num_unique_labels < top_n:
240
  st.info(f'We only detect a burst on {num_unique_labels} word(s), which is {top_n - num_unique_labels} fewer than the top word(s)', icon="ℹ️")
241
-
242
- fig = make_subplots(rows=num_rows, cols=num_columns, subplot_titles=freq_data.columns[:top_n])
243
-
244
- row, col = 1, 1
245
- for i, column in enumerate(freq_data.columns[:top_n]):
246
- fig.add_trace(go.Scatter(
247
- x=freq_data.index, y=freq_data[column], mode='lines+markers+text', name=column,
248
- line_shape='linear',
249
- hoverinfo='text',
250
- hovertext=[f"Year: {index}<br>Frequency: {freq}" for index, freq in zip(freq_data.index, freq_data[column])],
251
- text=freq_data[column],
252
- textposition='top center'
253
- ), row=row, col=col)
254
-
255
- # Add area charts
256
- for _, row_data in bursts[bursts['label'] == column].iterrows():
257
- x_values = freq_data.index[row_data['begin']:row_data['end']+1]
258
- y_values = freq_data[column][row_data['begin']:row_data['end']+1]
259
-
260
- #middle_y = sum(y_values) / len(y_values)
261
- y_post = min(freq_data[column]) + 1 if running_total == "Running total" else sum(y_values) / len(y_values)
262
- x_offset = 0.1
263
-
264
- # Add area chart
265
- fig.add_trace(go.Scatter(
266
- x=x_values,
267
- y=y_values,
268
- fill='tozeroy', mode='lines', fillcolor='rgba(0,100,80,0.2)',
269
- ), row=row, col=col)
270
-
271
- align_value = "left" if running_total == "Running total" else "center"
272
- valign_value = "bottom" if running_total == "Running total" else "middle"
273
-
274
- # Add annotation for weight at the bottom
275
- fig.add_annotation(
276
- x=x_values[0] + x_offset,
277
- y=y_post,
278
- text=f"Weight: {row_data['weight']:.2f}",
279
- showarrow=False,
280
- font=dict(
281
- color="black",
282
- size=10
283
- ),
284
- align=align_value,
285
- valign=valign_value,
286
- textangle=270,
287
- row=row, col=col
288
- )
289
-
290
- col += 1
291
- if col > num_columns:
292
- col = 1
293
- row += 1
294
-
295
- fig.update_layout(
296
- title_text="Scattertext",
297
- showlegend=False,
298
- height=num_rows * 400
299
- )
300
-
301
- st.plotly_chart(fig, theme="streamlit", use_container_width=True)
302
-
303
- csv1 = convert_df(freq_data)
304
- csv2 = convert_df(bursts)
305
-
306
- e1, e2 = st.columns(2)
307
- e1.download_button(
308
- "Press to download list of top keywords 👈",
309
- csv1,
310
- "top-keywords.csv",
311
- "text/csv")
312
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
313
  e2.download_button(
314
- "Press to download the list of detected bursts 👈",
315
- csv2,
316
- "burst.csv",
317
- "text/csv")
318
 
 
 
 
 
 
 
319
  with tab2:
320
  st.markdown('**Kleinberg, J. (2002). Bursty and hierarchical structure in streams. Knowledge Discovery and Data Mining.** https://doi.org/10.1145/775047.775061')
321
 
@@ -324,6 +420,6 @@ if uploaded_file is not None:
324
  st.markdown('**Domicián Máté, Ni Made Estiyanti and Novotny, A. (2024) ‘How to support innovative small firms? Bibliometric analysis and visualization of start-up incubation’, Journal of Innovation and Entrepreneurship, 13(1).** https://doi.org/10.1186/s13731-024-00361-z')
325
  st.markdown('**Lamba, M., Madhusudhan, M. (2022). Burst Detection. In: Text Mining for Information Professionals. Springer, Cham.** https://doi.org/10.1007/978-3-030-85085-2_6')
326
 
327
- except ValueError:
328
- st.error("An error occurred", icon="⚠️")
329
- sys.exit(1)
 
8
  from burst_detection import burst_detection, enumerate_bursts, burst_weights
9
  import matplotlib.pyplot as plt
10
  import os
11
+ import io
12
  import math
13
  import numpy as np
14
  import plotly.graph_objects as go
15
  from plotly.subplots import make_subplots
16
+ import plotly.io as pio
17
  import sys
18
 
19
  #===config===
 
48
 
49
  #===clear cache===
50
  def reset_all():
51
+ st.cache_data.clear()
52
 
53
  # Initialize NLP model
54
  nlp = spacy.load("en_core_web_md")
 
146
  excluded_words = [word.strip() for word in excluded_words_input.split(',')]
147
 
148
  # Identify top words, excluding specified words
 
149
  filtered_words = [word for word in yearly_term_frequency.columns if word not in excluded_words]
150
  top_words = yearly_term_frequency[filtered_words].sum().nlargest(top_n).index.tolist()
151
 
 
190
 
191
  num_unique_labels = len(all_bursts['label'].unique())
192
 
193
+ num_rows = math.ceil(top_n / 2)
194
 
195
  if running_total == "Running total":
196
  all_freq_data = all_freq_data.cumsum()
 
200
  @st.cache_data(ttl=3600)
201
  def convert_df(df):
202
  return df.to_csv().encode("utf-8")
203
+
204
+ @st.cache_data(ttl=3600)
205
+ def scattervis(bursts, freq_data):
206
+ freq_data.reset_index(inplace=True)
207
+ freq_data.rename(columns={"index": "Year"}, inplace=True)
208
+
209
+ freq_data_melted = freq_data.melt(id_vars=["Year"], var_name="Category", value_name="Value")
210
+ freq_data_melted = freq_data_melted[freq_data_melted["Value"] > 0]
211
+ wordlist = freq_data_melted["Category"].unique()
212
+
213
+ years = freq_data["Year"].tolist()
214
+ bursts["begin"] = bursts["begin"].apply(lambda x: years[min(x, len(years) - 1)] if x < len(years) else None)
215
+ bursts["end"] = bursts["end"].apply(lambda x: years[min(x, len(years) - 1)] if x < len(years) else None)
216
+ burst_points = []
217
+
218
+ for _, row in bursts.iterrows():
219
+ for year in range(row["begin"], row["end"] + 1):
220
+ burst_points.append((year, row["label"], row["weight"]))
221
+
222
+ burst_points_df = pd.DataFrame(burst_points, columns=["Year", "Category", "Weight"])
223
+
224
+ fig = go.Figure()
225
+
226
+ # scatter trace for burst points
227
+ fig.add_trace(go.Scatter(
228
+ x=burst_points_df["Year"],
229
+ y=burst_points_df["Category"],
230
+ mode='markers',
231
+ marker=dict(
232
+ symbol='square',
233
+ size=40,
234
+ color='red',
235
+ opacity=0.5),
236
+ hoverinfo='text',
237
+ text=burst_points_df["Weight"],
238
+ showlegend=False
239
+ ))
240
+
241
+ # scatter trace for freq_data
242
+ fig.add_trace(go.Scatter(
243
+ x=freq_data_melted["Year"],
244
+ y=freq_data_melted["Category"],
245
+ mode='markers+text',
246
+ marker=dict(
247
+ symbol='square',
248
+ size=30,
249
+ color=freq_data_melted["Value"],
250
+ colorscale='Blues',
251
+ showscale=False),
252
+ text=freq_data_melted["Value"],
253
+ textposition="middle center",
254
+ textfont=dict(
255
+ size=16,
256
+ color=['white' if value > freq_data_melted["Value"].max()/2 else 'black' for value in freq_data_melted["Value"]])
257
+ ))
258
+
259
+ min_year = min(years)
260
+ max_year = max(years)
261
+
262
+ fig.update_layout(
263
+ xaxis=dict(tickmode='linear', dtick=1, range=[(min_year-1), (max_year+1)], tickfont = dict(size=16), automargin=True, showgrid=False, zeroline=False),
264
+ yaxis=dict(tickvals=wordlist, ticktext=wordlist, tickmode='array', tickfont = dict(size=16), automargin=True, showgrid=False, zeroline=False),
265
+ plot_bgcolor='white',
266
+ paper_bgcolor='white',
267
+ showlegend=False,
268
+ margin=dict(l=1, r=1, t=1, b=1),
269
+ height=top_n*50+2,
270
+ width=(max_year-min_year)*52+100,
271
+ autosize=False
272
+ )
273
+
274
+ fig.write_image("scatter_plot.png")
275
+ st.image("scatter_plot.png")
276
+ pio.write_image(fig, 'result.png', scale=4)
277
+
278
+ @st.cache_data(ttl=3600)
279
+ def linegraph(bursts, freq_data):
280
+ fig = make_subplots(rows=num_rows, cols=2, subplot_titles=freq_data.columns[:top_n])
281
+
282
+ row, col = 1, 1
283
+ for i, column in enumerate(freq_data.columns[:top_n]):
284
+ fig.add_trace(go.Scatter(
285
+ x=freq_data.index, y=freq_data[column], mode='lines+markers+text', name=column,
286
+ line_shape='linear',
287
+ hoverinfo='text',
288
+ hovertext=[f"Year: {index}<br>Frequency: {freq}" for index, freq in zip(freq_data.index, freq_data[column])],
289
+ text=freq_data[column],
290
+ textposition='top center'
291
+ ), row=row, col=col)
292
+
293
+ # Add area charts
294
+ for _, row_data in bursts[bursts['label'] == column].iterrows():
295
+ x_values = freq_data.index[row_data['begin']:row_data['end']+1]
296
+ y_values = freq_data[column][row_data['begin']:row_data['end']+1]
297
+
298
+ #middle_y = sum(y_values) / len(y_values)
299
+ y_post = min(freq_data[column]) + 1 if running_total == "Running total" else sum(y_values) / len(y_values)
300
+ x_offset = 0.1
301
+
302
+ # Add area chart
303
+ fig.add_trace(go.Scatter(
304
+ x=x_values,
305
+ y=y_values,
306
+ fill='tozeroy', mode='lines', fillcolor='rgba(0,100,80,0.2)',
307
+ ), row=row, col=col)
308
+
309
+ align_value = "left" if running_total == "Running total" else "center"
310
+ valign_value = "bottom" if running_total == "Running total" else "middle"
311
+
312
+ # Add annotation for weight at the bottom
313
+ fig.add_annotation(
314
+ x=x_values[0] + x_offset,
315
+ y=y_post,
316
+ text=f"Weight: {row_data['weight']:.2f}",
317
+ showarrow=False,
318
+ font=dict(
319
+ color="black",
320
+ size=12),
321
+ align=align_value,
322
+ valign=valign_value,
323
+ textangle=270,
324
+ row=row, col=col
325
+ )
326
+
327
+ col += 1
328
+ if col > 2:
329
+ col = 1
330
+ row += 1
331
+
332
+ fig.update_layout(
333
+ showlegend=False,
334
+ margin=dict(l=20, r=20, t=100, b=20),
335
+ height=num_rows * 500,
336
+ width=1500
337
+ )
338
+
339
+ fig.write_image("line_graph.png")
340
+ st.image("line_graph.png")
341
+ pio.write_image(fig, 'result.png', scale=4)
342
+
343
+ @st.cache_data(ttl=3600)
344
+ def download_result(freq_data, bursts):
345
+ csv1 = convert_df(freq_data)
346
+ csv2 = convert_df(bursts)
347
+ return csv1, csv2
348
 
 
349
  uploaded_file = st.file_uploader('', type=['csv', 'txt'], on_change=reset_all)
350
 
351
  if uploaded_file is not None:
352
  try:
353
+ c1, c2, c3 = st.columns([3,3.5,3.5])
354
+ top_n = c1.number_input("Number of top words to analyze", min_value=5, value=10, step=1, on_change=reset_all)
355
+ viz_selected = c2.selectbox("Option for visualization",
356
+ ("Line graph", "Scatter plot"), on_change=reset_all)
357
  running_total = c3.selectbox("Option for counting words",
358
  ("Running total", "By occurrences each year"), on_change=reset_all)
359
 
360
+ d1, d2 = st.columns([3,7])
361
  df, coldf, MIN, MAX, GAP = load_data(uploaded_file)
362
  col_name = d1.selectbox("Select column to analyze",
363
  (coldf), on_change=reset_all)
 
366
  if (GAP != 0):
367
  YEAR = st.slider('Year', min_value=MIN, max_value=MAX, value=(MIN, MAX), on_change=reset_all)
368
  else:
369
+ e1.write('You only have data in ', (MAX))
370
  sys.exit(1)
371
+
372
  yearly_term_frequency, top_words = clean_data(df)
373
 
374
  bursts, freq_data, num_unique_labels, num_rows = apply_burst_detection(top_words, yearly_term_frequency)
 
384
  st.info(f'We detect a burst on {num_unique_labels} word(s)', icon="ℹ️")
385
  elif num_unique_labels < top_n:
386
  st.info(f'We only detect a burst on {num_unique_labels} word(s), which is {top_n - num_unique_labels} fewer than the top word(s)', icon="ℹ️")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
387
 
388
+ if viz_selected == "Line graph":
389
+ linegraph(bursts, freq_data)
390
+
391
+ elif viz_selected =="Scatter plot":
392
+ scattervis(bursts, freq_data)
393
+
394
+ csv1, csv2 = download_result(freq_data, bursts)
395
+ e1, e2, e3 = st.columns(3)
396
+ with open('result.png', "rb") as file:
397
+ btn = e1.download_button(
398
+ label="📊 Download high resolution image",
399
+ data=file,
400
+ file_name="burst.png",
401
+ mime="image/png")
402
+
403
  e2.download_button(
404
+ "👉 Press to download list of top words",
405
+ csv1,
406
+ "top-keywords.csv",
407
+ "text/csv")
408
 
409
+ e3.download_button(
410
+ "👉 Press to download the list of detected bursts",
411
+ csv2,
412
+ "burst.csv",
413
+ "text/csv")
414
+
415
  with tab2:
416
  st.markdown('**Kleinberg, J. (2002). Bursty and hierarchical structure in streams. Knowledge Discovery and Data Mining.** https://doi.org/10.1145/775047.775061')
417
 
 
420
  st.markdown('**Domicián Máté, Ni Made Estiyanti and Novotny, A. (2024) ‘How to support innovative small firms? Bibliometric analysis and visualization of start-up incubation’, Journal of Innovation and Entrepreneurship, 13(1).** https://doi.org/10.1186/s13731-024-00361-z')
421
  st.markdown('**Lamba, M., Madhusudhan, M. (2022). Burst Detection. In: Text Mining for Information Professionals. Springer, Cham.** https://doi.org/10.1007/978-3-030-85085-2_6')
422
 
423
+ except:
424
+ st.error("Please ensure that your file is correct. Please contact us if you find that this is an error.", icon="🚨")
425
+ st.stop()