Create 5 Burst
Browse files- pages/5 Burst +309 -0
pages/5 Burst
@@ -0,0 +1,309 @@
1 |
import streamlit as st
2 |
import pandas as pd
3 |
from sklearn.feature_extraction.text import CountVectorizer
4 |
from nltk.tokenize import word_tokenize
5 |
from nltk.corpus import stopwords
6 |
import nltk
7 |
import spacy
8 |
from burst_detection import burst_detection, enumerate_bursts, burst_weights
9 |
import matplotlib.pyplot as plt
10 |
import os
11 |
import math
12 |
import numpy as np
13 |
import plotly.graph_objects as go
14 |
from plotly.subplots import make_subplots
15 |
import sys
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
# CSS injected into the page to hide parts of the default Streamlit chrome.
# NOTE(review): the opening/closing <style> tags and the "#MainMenu" selector
# were missing from this copy of the file and are restored here from the
# visible rule bodies — confirm against the deployed page.
hide_streamlit_style = """
            <style>
            #MainMenu
            {visibility: hidden;}
            footer {visibility: hidden;}
            [data-testid="collapsedControl"] {display: none}
            </style>
            """
st.markdown(hide_streamlit_style, unsafe_allow_html=True)

# Navigation menu linking to the other pages of the app.
# NOTE(review): the "Home" target is empty and "pages/1" / "pages/4" look
# truncated in this copy — confirm the real URLs/paths before shipping.
with st.popover("🔗 Menu"):
    st.page_link("", label="Home", icon="🏠")
    st.page_link("pages/1", label="Scattertext", icon="1️⃣")
    st.page_link("pages/2 Topic", label="Topic Modeling", icon="2️⃣")
    st.page_link("pages/3 Bidirected", label="Bidirected Network", icon="3️⃣")
    st.page_link("pages/4", label="Sunburst", icon="4️⃣")
    st.page_link("pages/5 Burst", label="Burst Detection", icon="5️⃣")
    st.page_link("pages/6 Keywords", label="Keywords Stem", icon="6️⃣")

st.header("Burst Detection", anchor=False)
st.subheader('Put your file here...', anchor=False)
46 |
47 |
#===clear cache===
def reset_all():
    """Clear Streamlit's data cache.

    Registered as the ``on_change`` callback of the page's widgets so that
    cached results are dropped whenever an input changes.
    """
    # NOTE(review): the body was missing in this copy; restored from the
    # "clear cache" section header — confirm this is the intended call.
    st.cache_data.clear()


# Initialize NLP model
# Loaded once at import time; preprocess_text() reads this module-level object.
nlp = spacy.load("en_core_web_md")
53 |
54 |
55 |
def upload(extype):
    """Read the uploaded CSV file into a DataFrame.

    The file object is taken from the module-level ``uploaded_file``;
    ``extype`` is accepted for the caller's convenience but is not used here.

    When the data has a 'Publication Year' column, that column and three
    related ones are renamed to the names the rest of the page uses
    ('Year', 'Cited by', 'Document Type', 'Source title').

    Returns:
        The loaded (and possibly renamed) DataFrame.
    """
    frame = pd.read_csv(uploaded_file)

    # Nothing to harmonise unless the file uses the 'Publication Year' header.
    if 'Publication Year' not in frame.columns:
        return frame

    header_map = {
        'Publication Year': 'Year',
        'Citing Works Count': 'Cited by',
        'Publication Type': 'Document Type',
        'Source Title': 'Source title',
    }
    frame.rename(columns=header_map, inplace=True)
    return frame
62 |
63 |
64 |
def get_ext(uploaded_file):
    """Return the name of the uploaded file.

    The caller checks the returned string with ``.endswith('.csv')`` /
    ``.endswith('.txt')`` to decide how to parse the file, so the full file
    name (not just the suffix) is returned.

    Args:
        uploaded_file: object exposing a ``name`` attribute, such as the value
            returned by ``st.file_uploader``.

    Returns:
        The file name as a string.
    """
    extype = uploaded_file.name
    return extype
67 |
68 |
69 |
def get_minmax(df):
    """Return the earliest year, the latest year and their difference.

    Args:
        df: DataFrame with a numeric 'Year' column.

    Returns:
        A tuple ``(MIN, MAX, GAP)`` of ints where ``GAP == MAX - MIN``.
        ``GAP`` is 0 when all rows share a single year; the page uses that
        to decide whether a year-range slider makes sense.
    """
    MIN = int(df['Year'].min())
    MAX = int(df['Year'].max())
    GAP = MAX - MIN
    return MIN, MAX, GAP
74 |
75 |
76 |
def conv_txt(extype):
    """Read the uploaded tab-separated text export into a DataFrame.

    The file object is taken from the module-level ``uploaded_file``;
    ``extype`` is not used by the body. Rows are split on carriage returns
    and fields on tabs. Two-letter field tags are renamed to the column
    names used by the rest of the page.

    Returns:
        The loaded DataFrame with renamed columns.
    """
    tag_to_column = dict(
        TI='Title',
        SO='Source title',
        DT='Document Type',
        AB='Abstract',
        PY='Year',
    )
    raw = pd.read_csv(uploaded_file, sep='\t', lineterminator='\r')
    return raw.rename(columns=tag_to_column)
85 |
86 |
# Helper Functions
87 |
88 |
def get_column_name(df, possible_names):
    """Return the first entry of ``possible_names`` that is a column of ``df``.

    Candidates are tried in the order given.

    Raises:
        ValueError: if none of the candidates is a column of ``df``.
    """
    not_found = object()
    match = next(
        (candidate for candidate in possible_names if candidate in df.columns),
        not_found,
    )
    if match is not_found:
        raise ValueError(f"None of the possible names {possible_names} found in DataFrame columns.")
    return match
94 |
95 |
96 |
def preprocess_text(text):
    """Lemmatize text and drop stopwords and non-alphabetic tokens.

    Args:
        text: the raw text. Anything that is not a non-blank string (for
            example ``None`` or a NaN from an empty cell) is treated as
            having no content.

    Returns:
        A single space-joined string of lower-cased lemmas, or ``''`` when
        there is nothing to process.
    """
    # Guard before calling the spaCy pipeline, which only accepts strings.
    if not isinstance(text, str) or not text.strip():
        return ''
    return ' '.join(
        token.lemma_.lower()
        for token in nlp(text)
        if token.is_alpha and not token.is_stop
    )
99 |
100 |
101 |
def load_data(uploaded_file):
    """Load the uploaded file and derive the inputs the page needs.

    Args:
        uploaded_file: the object returned by the file uploader.

    Returns:
        A tuple ``(df, coldf, MIN, MAX, GAP)``:
        ``df`` is the data restricted to rows with a numeric year ('Year' is
        converted to int); ``coldf`` is the list of columns offered for
        analysis; ``MIN``/``MAX``/``GAP`` come from ``get_minmax``.

    Raises:
        ValueError: if the file is neither ``.csv`` nor ``.txt``, or if it
            has no 'Year' column. The page's top-level handler catches
            ValueError, so both cases surface as a normal error message.
    """
    extype = get_ext(uploaded_file)
    suffix = extype.lower()  # accept upper-case extensions as well
    if suffix.endswith('.csv'):
        df = upload(extype)
    elif suffix.endswith('.txt'):
        df = conv_txt(extype)
    else:
        raise ValueError(f"Unsupported file type: {extype!r}")

    if 'Year' not in df.columns:
        raise ValueError("The uploaded file has no 'Year' column.")

    # Coerce unparsable years to NaN, drop those rows, then store ints.
    # .copy() makes the filtered frame independent so the assignment below
    # does not write into a slice of the original.
    df['Year'] = pd.to_numeric(df['Year'], errors='coerce')
    df = df.dropna(subset=['Year']).copy()
    df['Year'] = df['Year'].astype(int)

    # Prefer the well-known text columns; only fall back to "every text-like
    # column" when neither Title nor Abstract exists.
    if 'Title' in df.columns and 'Abstract' in df.columns:
        coldf = ['Abstract', 'Title']
    elif 'Title' in df.columns:
        coldf = ['Title']
    elif 'Abstract' in df.columns:
        coldf = ['Abstract']
    else:
        coldf = sorted(df.select_dtypes(include=['object']).columns.tolist())

    MIN, MAX, GAP = get_minmax(df)

    return df, coldf, MIN, MAX, GAP
125 |
126 |
127 |
def clean_data(df):
    """Build the per-year term-frequency table and pick the top words.

    Reads these module-level values set by the page's widgets: ``YEAR``
    (inclusive ``(start, end)`` pair), ``col_name`` (column to analyse),
    ``excluded_words_input`` (comma-separated words) and ``top_n``.

    Args:
        df: DataFrame with an integer 'Year' column and the text column.

    Returns:
        A tuple ``(yearly_term_frequency, top_words)`` where
        ``yearly_term_frequency`` is a DataFrame indexed by year with one
        column per term, and ``top_words`` is the list of the ``top_n`` most
        frequent terms that are not excluded.
    """
    # Keep only rows inside the selected year range; copy so that adding the
    # 'processed' column does not write into a slice of the caller's frame.
    df = df.loc[df['Year'].between(YEAR[0], YEAR[1])].copy()

    # Preprocess text. Missing cells become '' instead of the literal string
    # "nan", which would otherwise be counted as a word.
    if col_name in df.columns:
        texts = df[col_name].fillna('').astype(str)
    else:
        texts = pd.Series('', index=df.index)
    df['processed'] = texts.map(preprocess_text)

    # Vectorize processed text (already lower-cased and space-separated)
    vectorizer = CountVectorizer(lowercase=False, tokenizer=lambda x: x.split())
    X = vectorizer.fit_transform(df['processed'].tolist())

    # Create DataFrame from the Document-Term Matrix (DTM)
    dtm = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out(), index=df['Year'].values)
    yearly_term_frequency = dtm.groupby(dtm.index).sum()

    # User-supplied exclusions. The vocabulary is lower-cased by
    # preprocess_text, so the exclusions are lower-cased to match.
    excluded_words = {word.strip().lower() for word in excluded_words_input.split(',')}

    # Identify top words, excluding specified words
    filtered_words = [word for word in yearly_term_frequency.columns if word not in excluded_words]
    top_words = yearly_term_frequency[filtered_words].sum().nlargest(top_n).index.tolist()

    return yearly_term_frequency, top_words
152 |
153 |
154 |
def apply_burst_detection(top_words, data):
    """Run burst detection for each top word over a gap-free year axis.

    Reads the module-level widget values ``top_n``, ``num_columns`` and
    ``running_total``.

    Args:
        top_words: terms to analyse; each must be a column of ``data``.
        data: per-year term-frequency DataFrame indexed by year.

    Returns:
        A tuple ``(all_bursts, all_freq_data, num_unique_labels, num_rows)``:
        ``all_bursts`` is the concatenation of the per-word burst tables;
        ``all_freq_data`` holds one column of yearly counts per word
        (cumulative when ``running_total == "Running total"``);
        ``num_unique_labels`` is the number of distinct words with at least
        one burst; ``num_rows`` is the subplot row count for the grid.
    """
    start_year = int(data.index.min())
    end_year = int(data.index.max())
    # Every year between first and last, so years without data count as 0.
    years = list(range(start_year, end_year + 1))
    n = len(years)

    all_bursts_list = []
    freq_columns = {}

    for word in top_words:
        # Actual counts where available, 0 for the missing years.
        word_counts = data[word].reindex(years, fill_value=0)
        freq_columns[word] = word_counts

        r = np.array(years, dtype=int)
        d = np.array(word_counts.values.tolist(), dtype=float)

        # NOTE(review): the counts are passed first and the year values
        # second, exactly as the original call did — confirm this argument
        # order against the burst_detection package documentation.
        q, d, r, p = burst_detection(d, r, n, s=2.0, gamma=1.0, smooth_win=1)
        bursts = enumerate_bursts(q, word)
        bursts = burst_weights(bursts, r, d, p)
        all_bursts_list.append(bursts)

    all_freq_data = pd.DataFrame(freq_columns, index=years)

    if all_bursts_list:
        all_bursts = pd.concat(all_bursts_list, ignore_index=True)
    else:
        # No words analysed: return an empty table with the columns the page
        # reads, so the caller's ``bursts.empty`` check works.
        all_bursts = pd.DataFrame(columns=['label', 'begin', 'end', 'weight'])

    num_unique_labels = len(all_bursts['label'].unique())

    num_rows = math.ceil(top_n / num_columns)

    if running_total == "Running total":
        all_freq_data = all_freq_data.cumsum()

    return all_bursts, all_freq_data, num_unique_labels, num_rows
198 |
199 |
# Streamlit UI for file upload
uploaded_file = st.file_uploader('', type=['csv', 'txt'], on_change=reset_all)

if uploaded_file is not None:
    # The whole pipeline runs inside one try block so that a ValueError from
    # loading, vectorizing or burst detection becomes an on-page error message.
    try:
        # --- analysis options (read as module-level values by the helpers) ---
        c1, c2, c3 = st.columns([4, 4, 2])
        top_n = c1.number_input("Number of top words to analyze", min_value=1, value=9, step=1, on_change=reset_all)
        num_columns = c2.number_input("Number of columns for visualization", min_value=1, value=3, step=1, on_change=reset_all)
        running_total = c3.selectbox("Option for counting words",
                                     ("Running total", "By occurrences each year"), on_change=reset_all)

        d1, d2 = st.columns([4, 6])
        df, coldf, MIN, MAX, GAP = load_data(uploaded_file)
        col_name = d1.selectbox("Select column to analyze",
                                (coldf), on_change=reset_all)
        excluded_words_input = d2.text_input("Words to exclude (comma-separated)", on_change=reset_all)

        if (GAP != 0):
            YEAR = st.slider('Year', min_value=MIN, max_value=MAX, value=(MIN, MAX), on_change=reset_all)
        else:
            # A range slider needs two distinct bounds; with a single year
            # just report it and analyse that year.
            st.write('You only have data in ', (MAX))
            YEAR = (MIN, MAX)

        yearly_term_frequency, top_words = clean_data(df)

        bursts, freq_data, num_unique_labels, num_rows = apply_burst_detection(top_words, yearly_term_frequency)

        tab1, tab2, tab3 = st.tabs(["📈 Generate visualization", "📃 Reference", "📓 Recommended Reading"])

        with tab1:
            if bursts.empty:
                st.warning('We cannot detect any bursts', icon='⚠️')
            else:
                if num_unique_labels == top_n:
                    st.info(f'We detect a burst on {num_unique_labels} word(s)', icon="ℹ️")
                elif num_unique_labels < top_n:
                    st.info(f'We only detect a burst on {num_unique_labels} word(s), which is {top_n - num_unique_labels} fewer than the top word(s)', icon="ℹ️")

                fig = make_subplots(rows=num_rows, cols=num_columns, subplot_titles=freq_data.columns[:top_n])

                for i, column in enumerate(freq_data.columns[:top_n]):
                    # Fill the grid left-to-right, top-to-bottom (1-based).
                    grid_row, grid_col = divmod(i, num_columns)
                    row, col = grid_row + 1, grid_col + 1

                    # Frequency line for this word.
                    # NOTE(review): hoverinfo/text were missing in this copy and
                    # are inferred from the 'text' mode and the hovertext list.
                    fig.add_trace(go.Scatter(
                        x=freq_data.index, y=freq_data[column], mode='lines+markers+text', name=column,
                        hoverinfo='text',
                        hovertext=[f"Year: {index}<br>Frequency: {freq}" for index, freq in zip(freq_data.index, freq_data[column])],
                        text=freq_data[column],
                        textposition='top center'
                    ), row=row, col=col)

                    # Add area charts
                    for _, row_data in bursts[bursts['label'] == column].iterrows():
                        # begin/end are positions on the year axis (inclusive).
                        begin = int(row_data['begin'])
                        end = int(row_data['end'])
                        x_values = freq_data.index[begin:end + 1]
                        y_values = freq_data[column].iloc[begin:end + 1]

                        y_post = min(freq_data[column]) + 1 if running_total == "Running total" else sum(y_values) / len(y_values)
                        x_offset = 0.1

                        # Add area chart
                        fig.add_trace(go.Scatter(
                            x=x_values,
                            y=y_values,
                            fill='tozeroy', mode='lines', fillcolor='rgba(0,100,80,0.2)',
                        ), row=row, col=col)

                        align_value = "left" if running_total == "Running total" else "center"
                        valign_value = "bottom" if running_total == "Running total" else "middle"

                        # Add annotation for weight at the bottom
                        # NOTE(review): further styling arguments (font, angle, ...)
                        # may have been lost from this copy — confirm.
                        fig.add_annotation(
                            x=x_values[0] + x_offset,
                            y=y_post,
                            text=f"Weight: {row_data['weight']:.2f}",
                            showarrow=False,
                            align=align_value,
                            valign=valign_value,
                            row=row, col=col
                        )

                # NOTE(review): only the height setting survived in this copy;
                # other layout options may be missing — confirm.
                fig.update_layout(
                    height=num_rows * 400
                )

                st.plotly_chart(fig, theme="streamlit", use_container_width=True)

        with tab2:
            st.markdown('**Kleinberg, J. (2002). Bursty and hierarchical structure in streams. Knowledge Discovery and Data Mining.**')

        with tab3:
            st.markdown('**Li, M., Zheng, Z., & Yi, Q. (2024). The landscape of hot topics and research frontiers in Kawasaki disease: scientometric analysis. Heliyon, 10(8), e29680–e29680.**')
            st.markdown('**Domicián Máté, Ni Made Estiyanti and Novotny, A. (2024) ‘How to support innovative small firms? Bibliometric analysis and visualization of start-up incubation’, Journal of Innovation and Entrepreneurship, 13(1).**')
            st.markdown('**Lamba, M., Madhusudhan, M. (2022). Burst Detection. In: Text Mining for Information Professionals. Springer, Cham.**')

    except ValueError:
        st.error("An error occurred", icon="⚠️")
309 |