import streamlit as st
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import spacy
from burst_detection import burst_detection, enumerate_bursts, burst_weights
import math
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
import sys

#===config===
st.set_page_config(
    page_title="Coconut",
    page_icon="🥥",
    layout="wide",
    initial_sidebar_state="collapsed"
)

hide_streamlit_style = """
"""
st.markdown(hide_streamlit_style, unsafe_allow_html=True)

with st.popover("🔗 Menu"):
    st.page_link("https://www.coconut-libtool.com/", label="Home", icon="🏠")
    st.page_link("pages/1 Scattertext.py", label="Scattertext", icon="1️⃣")
    st.page_link("pages/2 Topic Modeling.py", label="Topic Modeling", icon="2️⃣")
    st.page_link("pages/3 Bidirected Network.py", label="Bidirected Network", icon="3️⃣")
    st.page_link("pages/4 Sunburst.py", label="Sunburst", icon="4️⃣")
    st.page_link("pages/5 Burst Detection.py", label="Burst Detection", icon="5️⃣")
    st.page_link("pages/6 Keywords Stem.py", label="Keywords Stem", icon="6️⃣")

st.header("Burst Detection", anchor=False)
st.subheader('Put your file here...', anchor=False)

#===clear cache===
def reset_all():
    st.cache_data.clear()

# Initialize the spaCy model used for lemmatization and stopword removal
nlp = spacy.load("en_core_web_md")

@st.cache_data(ttl=3600)
def upload(extype):
    df = pd.read_csv(uploaded_file)
    # lens.org exports use different column names; map them to the standard ones
    if 'Publication Year' in df.columns:
        df.rename(columns={
            'Publication Year': 'Year',
            'Citing Works Count': 'Cited by',
            'Publication Type': 'Document Type',
            'Source Title': 'Source title'}, inplace=True)
    return df

@st.cache_data(ttl=3600)
def get_ext(uploaded_file):
    extype = uploaded_file.name
    return extype

@st.cache_data(ttl=3600)
def get_minmax(df):
    MIN = int(df['Year'].min())
    MAX = int(df['Year'].max())
    GAP = MAX - MIN
    return MIN, MAX, GAP

@st.cache_data(ttl=3600)
def conv_txt(extype):
    # Web of Science tab-delimited exports use two-letter field tags
    col_dict = {'TI': 'Title',
                'SO': 'Source title',
                'DT': 'Document Type',
                'AB': 'Abstract',
                'PY': 'Year'}
    df = pd.read_csv(uploaded_file, sep='\t', lineterminator='\r')
    df.rename(columns=col_dict, inplace=True)
    return df

# Helper Functions
@st.cache_data(ttl=3600)
def get_column_name(df, possible_names):
    """Find and return the first existing column name from a list of possible names."""
    for name in possible_names:
        if name in df.columns:
            return name
    raise ValueError(f"None of the possible names {possible_names} found in DataFrame columns.")

@st.cache_data(ttl=3600)
def preprocess_text(text):
    """Lemmatize and remove stopwords from text."""
    return ' '.join([token.lemma_.lower() for token in nlp(text) if token.is_alpha and not token.is_stop])

@st.cache_data(ttl=3600)
def load_data(uploaded_file):
    """Load data from the uploaded file."""
    extype = get_ext(uploaded_file)
    if extype.endswith('.csv'):
        df = upload(extype)
    elif extype.endswith('.txt'):
        df = conv_txt(extype)
    df['Year'] = pd.to_numeric(df['Year'], errors='coerce')
    df = df.dropna(subset=['Year'])
    df['Year'] = df['Year'].astype(int)
    if 'Title' in df.columns and 'Abstract' in df.columns:
        coldf = ['Abstract', 'Title']
    elif 'Title' in df.columns:
        coldf = ['Title']
    elif 'Abstract' in df.columns:
        coldf = ['Abstract']
    else:
        coldf = sorted(df.select_dtypes(include=['object']).columns.tolist())
    MIN, MAX, GAP = get_minmax(df)
    return df, coldf, MIN, MAX, GAP
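# A rough sketch of what the preprocessing pipeline below produces. The exact
# lemmas depend on the en_core_web_md model, so treat the output here as an
# illustrative assumption rather than a guaranteed result:
#
#   >>> preprocess_text("The cats are running")
#   'cat run'    # stopwords removed, remaining tokens lemmatized and lowercased
#
# clean_data() then vectorizes these strings with CountVectorizer and sums the
# document-term matrix by year, giving one term-frequency row per year.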
@st.cache_data(ttl=3600)
def clean_data(df):
    years = list(range(YEAR[0], YEAR[1] + 1))
    df = df.loc[df['Year'].isin(years)].copy()

    # Preprocess text
    df['processed'] = df.apply(lambda row: preprocess_text(f"{row.get(col_name, '')}"), axis=1)

    # Vectorize processed text
    vectorizer = CountVectorizer(lowercase=False, tokenizer=lambda x: x.split())
    X = vectorizer.fit_transform(df['processed'].tolist())

    # Create DataFrame from the Document-Term Matrix (DTM)
    dtm = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out(), index=df['Year'].values)
    yearly_term_frequency = dtm.groupby(dtm.index).sum()

    # User inputs for top words analysis and exclusions
    excluded_words = [word.strip() for word in excluded_words_input.split(',')]

    # Identify top words, excluding specified words
    filtered_words = [word for word in yearly_term_frequency.columns if word not in excluded_words]
    top_words = yearly_term_frequency[filtered_words].sum().nlargest(top_n).index.tolist()

    return yearly_term_frequency, top_words

@st.cache_data(ttl=3600)
def apply_burst_detection(top_words, data):
    all_bursts_list = []

    start_year = int(data.index.min())
    end_year = int(data.index.max())
    all_years = range(start_year, end_year + 1)
    continuous_years = pd.Series(index=all_years, data=0)  # a series of zeros for all years

    years = continuous_years.index.tolist()
    all_freq_data = pd.DataFrame(index=years)

    for word in top_words:
        # Update with actual counts where available
        word_counts = data[word].reindex(continuous_years.index, fill_value=0)

        # Convert years and counts to arrays for burst detection
        r = np.array(continuous_years.index.tolist(), dtype=int)  # list of all years
        d = np.array(word_counts.values.tolist(), dtype=float)    # yearly counts (zero-filled)

        if len(r) > 0 and len(d) > 0:
            n = len(r)
            q, d, r, p = burst_detection(d, r, n, s=2.0, gamma=1.0, smooth_win=1)
            bursts = enumerate_bursts(q, word)
            bursts = burst_weights(bursts, r, d, p)
            all_bursts_list.append(bursts)

        freq_data = data[word].reindex(years, fill_value=0)
        all_freq_data[word] = freq_data

    all_bursts = pd.concat(all_bursts_list, ignore_index=True)
    num_unique_labels = len(all_bursts['label'].unique())
    num_rows = math.ceil(top_n / 2)

    if running_total == "Running total":
        all_freq_data = all_freq_data.cumsum()

    return all_bursts, all_freq_data, num_unique_labels, num_rows

@st.cache_data(ttl=3600)
def convert_df(df):
    return df.to_csv().encode("utf-8")
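# On the burst_detection() call above: it implements Kleinberg's two-state
# burst model, where `s` (2.0 here) sets how much the bursty state's rate
# exceeds the baseline rate and `gamma` (1.0 here) is the cost of entering
# the bursty state, so raising either yields fewer, stronger bursts. This
# reading of the parameters is paraphrased from the burst_detection package's
# documentation; verify against the library before relying on it.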
@st.cache_data(ttl=3600)
def scattervis(bursts, freq_data):
    freq_data.reset_index(inplace=True)
    freq_data.rename(columns={"index": "Year"}, inplace=True)

    freq_data_melted = freq_data.melt(id_vars=["Year"], var_name="Category", value_name="Value")
    freq_data_melted = freq_data_melted[freq_data_melted["Value"] > 0]

    wordlist = freq_data_melted["Category"].unique()
    years = freq_data["Year"].tolist()

    # Map burst begin/end indices to calendar years
    bursts["begin"] = bursts["begin"].apply(lambda x: years[min(x, len(years) - 1)] if x < len(years) else None)
    bursts["end"] = bursts["end"].apply(lambda x: years[min(x, len(years) - 1)] if x < len(years) else None)

    burst_points = []
    for _, row in bursts.iterrows():
        for year in range(row["begin"], row["end"] + 1):
            burst_points.append((year, row["label"], row["weight"]))
    burst_points_df = pd.DataFrame(burst_points, columns=["Year", "Category", "Weight"])

    fig = go.Figure()

    # scatter trace for burst points
    fig.add_trace(go.Scatter(
        x=burst_points_df["Year"],
        y=burst_points_df["Category"],
        mode='markers',
        marker=dict(
            symbol='square',
            size=40,
            color='red',
            opacity=0.5),
        hoverinfo='text',
        text=burst_points_df["Weight"],
        showlegend=False
    ))

    # scatter trace for freq_data
    fig.add_trace(go.Scatter(
        x=freq_data_melted["Year"],
        y=freq_data_melted["Category"],
        mode='markers+text',
        marker=dict(
            symbol='square',
            size=30,
            color=freq_data_melted["Value"],
            colorscale='Blues',
            showscale=False),
        text=freq_data_melted["Value"],
        textposition="middle center",
        textfont=dict(
            size=16,
            # white text on dark cells, black on light cells, for contrast
            color=['white' if value > freq_data_melted["Value"].max() / 2 else 'black'
                   for value in freq_data_melted["Value"]])
    ))

    min_year = min(years)
    max_year = max(years)

    fig.update_layout(
        xaxis=dict(tickmode='linear', dtick=1, range=[(min_year - 1), (max_year + 1)],
                   tickfont=dict(size=16), automargin=True, showgrid=False, zeroline=False),
        yaxis=dict(tickvals=wordlist, ticktext=wordlist, tickmode='array',
                   tickfont=dict(size=16), automargin=True, showgrid=False, zeroline=False),
        plot_bgcolor='white',
        paper_bgcolor='white',
        showlegend=False,
        margin=dict(l=1, r=1, t=1, b=1),
        height=top_n * 50 + 2,
        width=(max_year - min_year) * 52 + 100,
        autosize=False
    )

    fig.write_image("scatter_plot.png")
    st.image("scatter_plot.png")
    pio.write_image(fig, 'result.png', scale=4)
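# linegraph() below arranges one subplot per top word on a two-column grid,
# so e.g. top_n = 10 gives num_rows = ceil(10 / 2) = 5 rows. Detected bursts
# are overlaid as shaded areas, each annotated with its burst weight.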
Frequency: {freq}" for index, freq in zip(freq_data.index, freq_data[column])], text=freq_data[column], textposition='top center' ), row=row, col=col) # Add area charts for _, row_data in bursts[bursts['label'] == column].iterrows(): x_values = freq_data.index[row_data['begin']:row_data['end']+1] y_values = freq_data[column][row_data['begin']:row_data['end']+1] #middle_y = sum(y_values) / len(y_values) y_post = min(freq_data[column]) + 1 if running_total == "Running total" else sum(y_values) / len(y_values) x_offset = 0.1 # Add area chart fig.add_trace(go.Scatter( x=x_values, y=y_values, fill='tozeroy', mode='lines', fillcolor='rgba(0,100,80,0.2)', ), row=row, col=col) align_value = "left" if running_total == "Running total" else "center" valign_value = "bottom" if running_total == "Running total" else "middle" # Add annotation for weight at the bottom fig.add_annotation( x=x_values[0] + x_offset, y=y_post, text=f"Weight: {row_data['weight']:.2f}", showarrow=False, font=dict( color="black", size=12), align=align_value, valign=valign_value, textangle=270, row=row, col=col ) col += 1 if col > 2: col = 1 row += 1 fig.update_layout( showlegend=False, margin=dict(l=20, r=20, t=100, b=20), height=num_rows * 500, width=1500 ) fig.write_image("line_graph.png") st.image("line_graph.png") pio.write_image(fig, 'result.png', scale=4) @st.cache_data(ttl=3600) def download_result(freq_data, bursts): csv1 = convert_df(freq_data) csv2 = convert_df(bursts) return csv1, csv2 uploaded_file = st.file_uploader('', type=['csv', 'txt'], on_change=reset_all) if uploaded_file is not None: try: c1, c2, c3 = st.columns([3,3.5,3.5]) top_n = c1.number_input("Number of top words to analyze", min_value=5, value=10, step=1, on_change=reset_all) viz_selected = c2.selectbox("Option for visualization", ("Line graph", "Scatter plot"), on_change=reset_all) running_total = c3.selectbox("Option for counting words", ("Running total", "By occurrences each year"), on_change=reset_all) d1, d2 = st.columns([3,7]) df, coldf, MIN, MAX, GAP = load_data(uploaded_file) col_name = d1.selectbox("Select column to analyze", (coldf), on_change=reset_all) excluded_words_input = d2.text_input("Words to exclude (comma-separated)", on_change=reset_all) if (GAP != 0): YEAR = st.slider('Year', min_value=MIN, max_value=MAX, value=(MIN, MAX), on_change=reset_all) else: e1.write('You only have data in ', (MAX)) sys.exit(1) yearly_term_frequency, top_words = clean_data(df) bursts, freq_data, num_unique_labels, num_rows = apply_burst_detection(top_words, yearly_term_frequency) tab1, tab2, tab3 = st.tabs(["📈 Generate visualization", "📃 Reference", "📓 Recommended Reading"]) with tab1: if bursts.empty: st.warning('We cannot detect any bursts', icon='⚠️') else: if num_unique_labels == top_n: st.info(f'We detect a burst on {num_unique_labels} word(s)', icon="ℹ️") elif num_unique_labels < top_n: st.info(f'We only detect a burst on {num_unique_labels} word(s), which is {top_n - num_unique_labels} fewer than the top word(s)', icon="ℹ️") if viz_selected == "Line graph": linegraph(bursts, freq_data) elif viz_selected =="Scatter plot": scattervis(bursts, freq_data) csv1, csv2 = download_result(freq_data, bursts) e1, e2, e3 = st.columns(3) with open('result.png', "rb") as file: btn = e1.download_button( label="📊 Download high resolution image", data=file, file_name="burst.png", mime="image/png") e2.download_button( "👉 Press to download list of top words", csv1, "top-keywords.csv", "text/csv") e3.download_button( "👉 Press to download the list of detected bursts", 
csv2, "burst.csv", "text/csv") with tab2: st.markdown('**Kleinberg, J. (2002). Bursty and hierarchical structure in streams. Knowledge Discovery and Data Mining.** https://doi.org/10.1145/775047.775061') with tab3: st.markdown('**Li, M., Zheng, Z., & Yi, Q. (2024). The landscape of hot topics and research frontiers in Kawasaki disease: scientometric analysis. Heliyon, 10(8), e29680–e29680.** https://doi.org/10.1016/j.heliyon.2024.e29680') st.markdown('**Domicián Máté, Ni Made Estiyanti and Novotny, A. (2024) ‘How to support innovative small firms? Bibliometric analysis and visualization of start-up incubation’, Journal of Innovation and Entrepreneurship, 13(1).** https://doi.org/10.1186/s13731-024-00361-z') st.markdown('**Lamba, M., Madhusudhan, M. (2022). Burst Detection. In: Text Mining for Information Professionals. Springer, Cham.** https://doi.org/10.1007/978-3-030-85085-2_6') except: st.error("Please ensure that your file is correct. Please contact us if you find that this is an error.", icon="🚨") st.stop()