faizhalas committed on
Commit 0acf386
1 Parent(s): 544dcf4

Create 5 Burst Detection.py

Files changed (1): pages/5 Burst Detection.py (+309, -0)
pages/5 Burst Detection.py ADDED
@@ -0,0 +1,309 @@
+ import streamlit as st
+ import pandas as pd
+ from sklearn.feature_extraction.text import CountVectorizer
+ from nltk.tokenize import word_tokenize
+ from nltk.corpus import stopwords
+ import nltk
+ import spacy
+ from burst_detection import burst_detection, enumerate_bursts, burst_weights
+ import matplotlib.pyplot as plt
+ import os
+ import math
+ import numpy as np
+ import plotly.graph_objects as go
+ from plotly.subplots import make_subplots
+ import sys
+
+ #===config===
+ st.set_page_config(
+     page_title="Coconut",
+     page_icon="🥥",
+     layout="wide",
+     initial_sidebar_state="collapsed"
+ )
+
+ hide_streamlit_style = """
+ <style>
+ #MainMenu {visibility: hidden;}
+ footer {visibility: hidden;}
+ [data-testid="collapsedControl"] {display: none}
+ </style>
+ """
+ st.markdown(hide_streamlit_style, unsafe_allow_html=True)
+
+ with st.popover("🔗 Menu"):
+     st.page_link("Home.py", label="Home", icon="🏠")
+     st.page_link("pages/1 Scattertext.py", label="Scattertext", icon="1️⃣")
+     st.page_link("pages/2 Topic Modeling.py", label="Topic Modeling", icon="2️⃣")
+     st.page_link("pages/3 Bidirected Network.py", label="Bidirected Network", icon="3️⃣")
+     st.page_link("pages/4 Sunburst.py", label="Sunburst", icon="4️⃣")
+     st.page_link("pages/5 Burst Detection.py", label="Burst Detection", icon="5️⃣")
+     st.page_link("pages/6 Keywords Stem.py", label="Keywords Stem", icon="6️⃣")
+
+ st.header("Burst Detection", anchor=False)
+ st.subheader('Put your file here...', anchor=False)
+
+ #===clear cache===
+ def reset_all():
+     st.cache_data.clear()
+
+ # Initialize NLP model
+ nlp = spacy.load("en_core_web_md")
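+ # NOTE: assumes the "en_core_web_md" model is already installed in the
+ # environment (e.g. via `python -m spacy download en_core_web_md`).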
+
+ @st.cache_data(ttl=3600)
+ def upload(extype):
+     df = pd.read_csv(uploaded_file)
+     #lens.org
+     if 'Publication Year' in df.columns:
+         df.rename(columns={'Publication Year': 'Year', 'Citing Works Count': 'Cited by',
+                            'Publication Type': 'Document Type', 'Source Title': 'Source title'}, inplace=True)
+     return df
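+ # upload() reads the module-level uploaded_file rather than its argument and
+ # maps lens.org export headers onto the column names the rest of the page
+ # expects ('Year', 'Cited by', 'Document Type', 'Source title').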
+
+ @st.cache_data(ttl=3600)
+ def get_ext(uploaded_file):
+     extype = uploaded_file.name
+     return extype
+
+ @st.cache_data(ttl=3600)
+ def get_minmax(df):
+     MIN = int(df['Year'].min())
+     MAX = int(df['Year'].max())
+     GAP = MAX - MIN
+     return MIN, MAX, GAP
+
+ @st.cache_data(ttl=3600)
+ def conv_txt(extype):
+     col_dict = {'TI': 'Title',
+                 'SO': 'Source title',
+                 'DT': 'Document Type',
+                 'AB': 'Abstract',
+                 'PY': 'Year'}
+     df = pd.read_csv(uploaded_file, sep='\t', lineterminator='\r')
+     df.rename(columns=col_dict, inplace=True)
+     return df
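+ # TI/SO/DT/AB/PY are Web of Science-style tab-delimited field tags; conv_txt
+ # normalizes them to the same column names used for CSV uploads.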
+
+ # Helper Functions
+ @st.cache_data(ttl=3600)
+ def get_column_name(df, possible_names):
+     """Find and return existing column names from a list of possible names."""
+     for name in possible_names:
+         if name in df.columns:
+             return name
+     raise ValueError(f"None of the possible names {possible_names} found in DataFrame columns.")
+
+ @st.cache_data(ttl=3600)
+ def preprocess_text(text):
+     """Lemmatize and remove stopwords from text."""
+     return ' '.join([token.lemma_.lower() for token in nlp(text) if token.is_alpha and not token.is_stop])
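+ # preprocess_text keeps only alphabetic, non-stopword tokens and lowercases
+ # their lemmas, so the vectorizer below only sees normalized content words.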
+
+ @st.cache_data(ttl=3600)
+ def load_data(uploaded_file):
+     """Load data from the uploaded file."""
+     extype = get_ext(uploaded_file)
+     if extype.endswith('.csv'):
+         df = upload(extype)
+     elif extype.endswith('.txt'):
+         df = conv_txt(extype)
+
+     df['Year'] = pd.to_numeric(df['Year'], errors='coerce')
+     df = df.dropna(subset=['Year'])
+     df['Year'] = df['Year'].astype(int)
+
+     if 'Title' in df.columns and 'Abstract' in df.columns:
+         coldf = ['Abstract', 'Title']
+     elif 'Title' in df.columns:
+         coldf = ['Title']
+     elif 'Abstract' in df.columns:
+         coldf = ['Abstract']
+     else:
+         coldf = sorted(df.select_dtypes(include=['object']).columns.tolist())
+
+     MIN, MAX, GAP = get_minmax(df)
+
+     return df, coldf, MIN, MAX, GAP
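+ # coldf holds the candidate text columns for the "Select column to analyze"
+ # dropdown, falling back to all object-dtype columns when neither Title nor
+ # Abstract is present.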
+
+ @st.cache_data(ttl=3600)
+ def clean_data(df):
+
+     years = list(range(YEAR[0], YEAR[1]+1))
+     df = df.loc[df['Year'].isin(years)]
+
+     # Preprocess text
+     df['processed'] = df.apply(lambda row: preprocess_text(f"{row.get(col_name, '')}"), axis=1)
+
+     # Vectorize processed text
+     vectorizer = CountVectorizer(lowercase=False, tokenizer=lambda x: x.split())
+     X = vectorizer.fit_transform(df['processed'].tolist())
+
+     # Create DataFrame from the Document-Term Matrix (DTM)
+     dtm = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out(), index=df['Year'].values)
+     yearly_term_frequency = dtm.groupby(dtm.index).sum()
+
+     # User inputs for top words analysis and exclusions
+     excluded_words = [word.strip() for word in excluded_words_input.split(',')]
+
+     # Identify top words, excluding specified words
+     #top_words = [word for word in yearly_term_frequency.sum().nlargest(top_n).index if word not in excluded_words]
+     filtered_words = [word for word in yearly_term_frequency.columns if word not in excluded_words]
+     top_words = yearly_term_frequency[filtered_words].sum().nlargest(top_n).index.tolist()
+
+     return yearly_term_frequency, top_words
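+ # Notes: the DTM rows are indexed by publication year, so groupby(dtm.index).sum()
+ # yields one term-frequency row per year. clean_data also reads Streamlit widget
+ # state from module-level globals (YEAR, col_name, excluded_words_input, top_n)
+ # defined further down, so it must only be called after those widgets exist.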
+
+ @st.cache_data(ttl=3600)
+ def apply_burst_detection(top_words, data):
+     all_bursts_list = []
+
+     start_year = int(data.index.min())
+     end_year = int(data.index.max())
+     all_years = range(start_year, end_year + 1)
+
+     continuous_years = pd.Series(index=all_years, data=0)  # Start with a series of zeros for all years
+
+     years = continuous_years.index.tolist()
+
+     all_freq_data = pd.DataFrame(index=years)
+
+     for i, word in enumerate(top_words, start=1):
+         # Update with actual counts where available
+         word_counts = data[word].reindex(continuous_years.index, fill_value=0)
+
+         # Convert years and counts to lists for burst detection
+         r = continuous_years.index.tolist()  # List of all years
+         r = np.array(r, dtype=int)
+         d = word_counts.values.tolist()  # yearly counts (zero-filled for missing years)
+         d = np.array(d, dtype=float)
+         y = r.copy()
+
+         if len(r) > 0 and len(d) > 0:
+             n = len(r)
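+             # Kleinberg two-state burst model (via the burst_detection package):
+             # s sets the ratio between burst-state and baseline rates, gamma the
+             # cost of entering the burst state, and smooth_win=1 applies no smoothing.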
+             q, d, r, p = burst_detection(d, r, n, s=2.0, gamma=1.0, smooth_win=1)
+             bursts = enumerate_bursts(q, word)
+             bursts = burst_weights(bursts, r, d, p)
+             all_bursts_list.append(bursts)
+
+         freq_data = data[word].reindex(years, fill_value=0)
+         all_freq_data[word] = freq_data
+
+     all_bursts = pd.concat(all_bursts_list, ignore_index=True)
+
+     num_unique_labels = len(all_bursts['label'].unique())
+
+     num_rows = math.ceil(top_n / num_columns)
+
+     if running_total == "Running total":
+         all_freq_data = all_freq_data.cumsum()
+
+     return all_bursts, all_freq_data, num_unique_labels, num_rows
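+ # With "Running total" the frequencies are accumulated year over year, so
+ # bursts appear as steeper segments of an increasing curve rather than peaks.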
+
+ # Streamlit UI for file upload
+ uploaded_file = st.file_uploader('', type=['csv', 'txt'], on_change=reset_all)
+
+ if uploaded_file is not None:
+     try:
+         c1, c2, c3 = st.columns([4,4,2])
+         top_n = c1.number_input("Number of top words to analyze", min_value=1, value=9, step=1, on_change=reset_all)
+         num_columns = c2.number_input("Number of columns for visualization", min_value=1, value=3, step=1, on_change=reset_all)
+         running_total = c3.selectbox("Option for counting words",
+             ("Running total", "By occurrences each year"), on_change=reset_all)
+
+         d1, d2 = st.columns([4,6])
+         df, coldf, MIN, MAX, GAP = load_data(uploaded_file)
+         col_name = d1.selectbox("Select column to analyze",
+             (coldf), on_change=reset_all)
+         excluded_words_input = d2.text_input("Words to exclude (comma-separated)", on_change=reset_all)
+
+         if (GAP != 0):
+             YEAR = st.slider('Year', min_value=MIN, max_value=MAX, value=(MIN, MAX), on_change=reset_all)
+         else:
+             st.write('You only have data for ', (MAX))
+             sys.exit(1)
+
+         yearly_term_frequency, top_words = clean_data(df)
+
+         bursts, freq_data, num_unique_labels, num_rows = apply_burst_detection(top_words, yearly_term_frequency)
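+         # clean_data builds the year-by-term frequency table from the selected
+         # column; apply_burst_detection then runs burst detection on each of
+         # the top_n words.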
+
+         tab1, tab2, tab3 = st.tabs(["📈 Generate visualization", "📃 Reference", "📓 Recommended Reading"])
+
+         with tab1:
+             if bursts.empty:
+                 st.warning('We cannot detect any bursts', icon='⚠️')
+
+             else:
+                 if num_unique_labels == top_n:
+                     st.info(f'We detected bursts for {num_unique_labels} word(s)', icon="ℹ️")
+                 elif num_unique_labels < top_n:
+                     st.info(f'We only detected bursts for {num_unique_labels} word(s), {top_n - num_unique_labels} fewer than the number of top words', icon="ℹ️")
+
+                 fig = make_subplots(rows=num_rows, cols=num_columns, subplot_titles=freq_data.columns[:top_n])
+
+                 row, col = 1, 1
+                 for i, column in enumerate(freq_data.columns[:top_n]):
+                     fig.add_trace(go.Scatter(
+                         x=freq_data.index, y=freq_data[column], mode='lines+markers+text', name=column,
+                         line_shape='linear',
+                         hoverinfo='text',
+                         hovertext=[f"Year: {index}<br>Frequency: {freq}" for index, freq in zip(freq_data.index, freq_data[column])],
+                         text=freq_data[column],
+                         textposition='top center'
+                     ), row=row, col=col)
+
+                     # Add area charts
+                     for _, row_data in bursts[bursts['label'] == column].iterrows():
+                         x_values = freq_data.index[row_data['begin']:row_data['end']+1]
+                         y_values = freq_data[column][row_data['begin']:row_data['end']+1]
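+                         # 'begin'/'end' from enumerate_bursts are positional
+                         # indices into the year axis, so the integer slicing
+                         # above recovers the burst window.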
+
+                         #middle_y = sum(y_values) / len(y_values)
+                         y_post = min(freq_data[column]) + 1 if running_total == "Running total" else sum(y_values) / len(y_values)
+                         x_offset = 0.1
+
+                         # Add area chart
+                         fig.add_trace(go.Scatter(
+                             x=x_values,
+                             y=y_values,
+                             fill='tozeroy', mode='lines', fillcolor='rgba(0,100,80,0.2)',
+                         ), row=row, col=col)
+
+                         align_value = "left" if running_total == "Running total" else "center"
+                         valign_value = "bottom" if running_total == "Running total" else "middle"
+
+                         # Add annotation for weight at the bottom
+                         fig.add_annotation(
+                             x=x_values[0] + x_offset,
+                             y=y_post,
+                             text=f"Weight: {row_data['weight']:.2f}",
+                             showarrow=False,
+                             font=dict(
+                                 color="black",
+                                 size=10
+                             ),
+                             align=align_value,
+                             valign=valign_value,
+                             textangle=270,
+                             row=row, col=col
+                         )
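+                         # The annotated weight (from burst_weights) reflects
+                         # the strength of the burst over its window;
+                         # textangle=270 renders the label vertically.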
+
+                     col += 1
+                     if col > num_columns:
+                         col = 1
+                         row += 1
+
+                 fig.update_layout(
+                     title_text="Burst Detection",
+                     showlegend=False,
+                     height=num_rows * 400
+                 )
+
+                 st.plotly_chart(fig, theme="streamlit", use_container_width=True)
+
+         with tab2:
+             st.markdown('**Kleinberg, J. (2002). Bursty and hierarchical structure in streams. Knowledge Discovery and Data Mining.** https://doi.org/10.1145/775047.775061')
+
+         with tab3:
+             st.markdown('**Li, M., Zheng, Z., & Yi, Q. (2024). The landscape of hot topics and research frontiers in Kawasaki disease: scientometric analysis. Heliyon, 10(8), e29680–e29680.** https://doi.org/10.1016/j.heliyon.2024.e29680')
+             st.markdown('**Máté, D., Estiyanti, N. M., & Novotny, A. (2024). How to support innovative small firms? Bibliometric analysis and visualization of start-up incubation. Journal of Innovation and Entrepreneurship, 13(1).** https://doi.org/10.1186/s13731-024-00361-z')
+             st.markdown('**Lamba, M., & Madhusudhan, M. (2022). Burst Detection. In: Text Mining for Information Professionals. Springer, Cham.** https://doi.org/10.1007/978-3-030-85085-2_6')
+
+     except ValueError:
+         st.error("An error occurred while processing the file", icon="⚠️")
+         sys.exit(1)