Spaces:
Running
Running
Update pages/5 Burst Detection.py
Browse files- pages/5 Burst Detection.py +184 -88
pages/5 Burst Detection.py
CHANGED
@@ -8,10 +8,12 @@ import spacy
|
|
8 |
from burst_detection import burst_detection, enumerate_bursts, burst_weights
|
9 |
import matplotlib.pyplot as plt
|
10 |
import os
|
|
|
11 |
import math
|
12 |
import numpy as np
|
13 |
import plotly.graph_objects as go
|
14 |
from plotly.subplots import make_subplots
|
|
|
15 |
import sys
|
16 |
|
17 |
#===config===
|
@@ -46,7 +48,7 @@ st.subheader('Put your file here...', anchor=False)
|
|
46 |
|
47 |
#===clear cache===
|
48 |
def reset_all():
|
49 |
-
|
50 |
|
51 |
# Initialize NLP model
|
52 |
nlp = spacy.load("en_core_web_md")
|
@@ -144,7 +146,6 @@ def clean_data(df):
|
|
144 |
excluded_words = [word.strip() for word in excluded_words_input.split(',')]
|
145 |
|
146 |
# Identify top words, excluding specified words
|
147 |
-
#top_words = [word for word in yearly_term_frequency.sum().nlargest(top_n).index if word not in excluded_words]
|
148 |
filtered_words = [word for word in yearly_term_frequency.columns if word not in excluded_words]
|
149 |
top_words = yearly_term_frequency[filtered_words].sum().nlargest(top_n).index.tolist()
|
150 |
|
@@ -189,7 +190,7 @@ def apply_burst_detection(top_words, data):
|
|
189 |
|
190 |
num_unique_labels = len(all_bursts['label'].unique())
|
191 |
|
192 |
-
num_rows = math.ceil(top_n /
|
193 |
|
194 |
if running_total == "Running total":
|
195 |
all_freq_data = all_freq_data.cumsum()
|
@@ -199,19 +200,164 @@ def apply_burst_detection(top_words, data):
|
|
199 |
@st.cache_data(ttl=3600)
|
200 |
def convert_df(df):
|
201 |
return df.to_csv().encode("utf-8")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
202 |
|
203 |
-
# Streamlit UI for file upload
|
204 |
uploaded_file = st.file_uploader('', type=['csv', 'txt'], on_change=reset_all)
|
205 |
|
206 |
if uploaded_file is not None:
|
207 |
try:
|
208 |
-
c1, c2, c3 = st.columns([
|
209 |
-
top_n = c1.number_input("Number of top words to analyze", min_value=
|
210 |
-
|
|
|
211 |
running_total = c3.selectbox("Option for counting words",
|
212 |
("Running total", "By occurrences each year"), on_change=reset_all)
|
213 |
|
214 |
-
d1, d2 = st.columns([
|
215 |
df, coldf, MIN, MAX, GAP = load_data(uploaded_file)
|
216 |
col_name = d1.selectbox("Select column to analyze",
|
217 |
(coldf), on_change=reset_all)
|
@@ -220,9 +366,9 @@ if uploaded_file is not None:
|
|
220 |
if (GAP != 0):
|
221 |
YEAR = st.slider('Year', min_value=MIN, max_value=MAX, value=(MIN, MAX), on_change=reset_all)
|
222 |
else:
|
223 |
-
|
224 |
sys.exit(1)
|
225 |
-
|
226 |
yearly_term_frequency, top_words = clean_data(df)
|
227 |
|
228 |
bursts, freq_data, num_unique_labels, num_rows = apply_burst_detection(top_words, yearly_term_frequency)
|
@@ -238,84 +384,34 @@ if uploaded_file is not None:
|
|
238 |
st.info(f'We detect a burst on {num_unique_labels} word(s)', icon="ℹ️")
|
239 |
elif num_unique_labels < top_n:
|
240 |
st.info(f'We only detect a burst on {num_unique_labels} word(s), which is {top_n - num_unique_labels} fewer than the top word(s)', icon="ℹ️")
|
241 |
-
|
242 |
-
fig = make_subplots(rows=num_rows, cols=num_columns, subplot_titles=freq_data.columns[:top_n])
|
243 |
-
|
244 |
-
row, col = 1, 1
|
245 |
-
for i, column in enumerate(freq_data.columns[:top_n]):
|
246 |
-
fig.add_trace(go.Scatter(
|
247 |
-
x=freq_data.index, y=freq_data[column], mode='lines+markers+text', name=column,
|
248 |
-
line_shape='linear',
|
249 |
-
hoverinfo='text',
|
250 |
-
hovertext=[f"Year: {index}<br>Frequency: {freq}" for index, freq in zip(freq_data.index, freq_data[column])],
|
251 |
-
text=freq_data[column],
|
252 |
-
textposition='top center'
|
253 |
-
), row=row, col=col)
|
254 |
-
|
255 |
-
# Add area charts
|
256 |
-
for _, row_data in bursts[bursts['label'] == column].iterrows():
|
257 |
-
x_values = freq_data.index[row_data['begin']:row_data['end']+1]
|
258 |
-
y_values = freq_data[column][row_data['begin']:row_data['end']+1]
|
259 |
-
|
260 |
-
#middle_y = sum(y_values) / len(y_values)
|
261 |
-
y_post = min(freq_data[column]) + 1 if running_total == "Running total" else sum(y_values) / len(y_values)
|
262 |
-
x_offset = 0.1
|
263 |
-
|
264 |
-
# Add area chart
|
265 |
-
fig.add_trace(go.Scatter(
|
266 |
-
x=x_values,
|
267 |
-
y=y_values,
|
268 |
-
fill='tozeroy', mode='lines', fillcolor='rgba(0,100,80,0.2)',
|
269 |
-
), row=row, col=col)
|
270 |
-
|
271 |
-
align_value = "left" if running_total == "Running total" else "center"
|
272 |
-
valign_value = "bottom" if running_total == "Running total" else "middle"
|
273 |
-
|
274 |
-
# Add annotation for weight at the bottom
|
275 |
-
fig.add_annotation(
|
276 |
-
x=x_values[0] + x_offset,
|
277 |
-
y=y_post,
|
278 |
-
text=f"Weight: {row_data['weight']:.2f}",
|
279 |
-
showarrow=False,
|
280 |
-
font=dict(
|
281 |
-
color="black",
|
282 |
-
size=10
|
283 |
-
),
|
284 |
-
align=align_value,
|
285 |
-
valign=valign_value,
|
286 |
-
textangle=270,
|
287 |
-
row=row, col=col
|
288 |
-
)
|
289 |
-
|
290 |
-
col += 1
|
291 |
-
if col > num_columns:
|
292 |
-
col = 1
|
293 |
-
row += 1
|
294 |
-
|
295 |
-
fig.update_layout(
|
296 |
-
title_text="Scattertext",
|
297 |
-
showlegend=False,
|
298 |
-
height=num_rows * 400
|
299 |
-
)
|
300 |
-
|
301 |
-
st.plotly_chart(fig, theme="streamlit", use_container_width=True)
|
302 |
-
|
303 |
-
csv1 = convert_df(freq_data)
|
304 |
-
csv2 = convert_df(bursts)
|
305 |
-
|
306 |
-
e1, e2 = st.columns(2)
|
307 |
-
e1.download_button(
|
308 |
-
"Press to download list of top keywords 👈",
|
309 |
-
csv1,
|
310 |
-
"top-keywords.csv",
|
311 |
-
"text/csv")
|
312 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
313 |
e2.download_button(
|
314 |
-
|
315 |
-
|
316 |
-
|
317 |
-
|
318 |
|
|
|
|
|
|
|
|
|
|
|
|
|
319 |
with tab2:
|
320 |
st.markdown('**Kleinberg, J. (2002). Bursty and hierarchical structure in streams. Knowledge Discovery and Data Mining.** https://doi.org/10.1145/775047.775061')
|
321 |
|
@@ -324,6 +420,6 @@ if uploaded_file is not None:
|
|
324 |
st.markdown('**Domicián Máté, Ni Made Estiyanti and Novotny, A. (2024) ‘How to support innovative small firms? Bibliometric analysis and visualization of start-up incubation’, Journal of Innovation and Entrepreneurship, 13(1).** https://doi.org/10.1186/s13731-024-00361-z')
|
325 |
st.markdown('**Lamba, M., Madhusudhan, M. (2022). Burst Detection. In: Text Mining for Information Professionals. Springer, Cham.** https://doi.org/10.1007/978-3-030-85085-2_6')
|
326 |
|
327 |
-
except
|
328 |
-
st.error("
|
329 |
-
|
|
|
8 |
from burst_detection import burst_detection, enumerate_bursts, burst_weights
|
9 |
import matplotlib.pyplot as plt
|
10 |
import os
|
11 |
+
import io
|
12 |
import math
|
13 |
import numpy as np
|
14 |
import plotly.graph_objects as go
|
15 |
from plotly.subplots import make_subplots
|
16 |
+
import plotly.io as pio
|
17 |
import sys
|
18 |
|
19 |
#===config===
|
|
|
48 |
|
49 |
#===clear cache===
|
50 |
def reset_all():
    """Flush every @st.cache_data entry; wired to each widget's on_change."""
    st.cache_data.clear()
|
52 |
|
53 |
# Initialize NLP model
|
54 |
nlp = spacy.load("en_core_web_md")
|
|
|
146 |
excluded_words = [word.strip() for word in excluded_words_input.split(',')]
|
147 |
|
148 |
# Identify top words, excluding specified words
|
|
|
149 |
filtered_words = [word for word in yearly_term_frequency.columns if word not in excluded_words]
|
150 |
top_words = yearly_term_frequency[filtered_words].sum().nlargest(top_n).index.tolist()
|
151 |
|
|
|
190 |
|
191 |
num_unique_labels = len(all_bursts['label'].unique())
|
192 |
|
193 |
+
num_rows = math.ceil(top_n / 2)
|
194 |
|
195 |
if running_total == "Running total":
|
196 |
all_freq_data = all_freq_data.cumsum()
|
|
|
200 |
@st.cache_data(ttl=3600)
def convert_df(df):
    """Serialise *df* to UTF-8 CSV bytes for a download button (cached 1 h)."""
    csv_text = df.to_csv()
    return csv_text.encode("utf-8")
|
203 |
+
|
204 |
+
@st.cache_data(ttl=3600)
def scattervis(bursts, freq_data):
    """Render the burst "heat grid": one row per top word, one column per year.

    Blue squares encode the yearly frequency (value printed in the cell);
    red squares are overlaid on years inside a detected burst, with the
    burst weight on hover.  The figure is written to ``scatter_plot.png``
    (shown inline via st.image) and ``result.png`` (scale=4, consumed by the
    high-resolution download button elsewhere in the app).

    NOTE(review): mutates ``freq_data`` (reset_index/rename) and ``bursts``
    (begin/end remapped from positions to years) in place, so downstream CSV
    exports see the mutated frames — confirm this is intended.
    """
    # Move the year index into a regular column so the frame can be melted.
    freq_data.reset_index(inplace=True)
    freq_data.rename(columns={"index": "Year"}, inplace=True)

    # Long format: one (Year, Category, Value) row per non-zero frequency.
    freq_data_melted = freq_data.melt(id_vars=["Year"], var_name="Category", value_name="Value")
    freq_data_melted = freq_data_melted[freq_data_melted["Value"] > 0]
    wordlist = freq_data_melted["Category"].unique()

    # burst_detection returns positional indices; map them to calendar years,
    # clamping to the final year.  (The previous guard returned None for
    # out-of-range indices, which then raised TypeError in the range() call
    # below — clamping unconditionally fixes that edge case and is identical
    # for all in-range indices.)
    years = freq_data["Year"].tolist()
    last = len(years) - 1
    bursts["begin"] = bursts["begin"].apply(lambda x: years[min(x, last)])
    bursts["end"] = bursts["end"].apply(lambda x: years[min(x, last)])

    # One (year, word, weight) point per year covered by each burst.
    burst_points = []
    for _, row in bursts.iterrows():
        for year in range(row["begin"], row["end"] + 1):
            burst_points.append((year, row["label"], row["weight"]))

    burst_points_df = pd.DataFrame(burst_points, columns=["Year", "Category", "Weight"])

    fig = go.Figure()

    # Red squares marking burst periods (hover text = burst weight).
    fig.add_trace(go.Scatter(
        x=burst_points_df["Year"],
        y=burst_points_df["Category"],
        mode='markers',
        marker=dict(
            symbol='square',
            size=40,
            color='red',
            opacity=0.5),
        hoverinfo='text',
        text=burst_points_df["Weight"],
        showlegend=False
    ))

    # Blue squares with the raw frequency printed inside each cell.
    fig.add_trace(go.Scatter(
        x=freq_data_melted["Year"],
        y=freq_data_melted["Category"],
        mode='markers+text',
        marker=dict(
            symbol='square',
            size=30,
            color=freq_data_melted["Value"],
            colorscale='Blues',
            showscale=False),
        text=freq_data_melted["Value"],
        textposition="middle center",
        textfont=dict(
            size=16,
            # White text on dark (high-value) cells, black on light ones.
            color=['white' if value > freq_data_melted["Value"].max()/2 else 'black' for value in freq_data_melted["Value"]])
    ))

    min_year = min(years)
    max_year = max(years)

    fig.update_layout(
        xaxis=dict(tickmode='linear', dtick=1, range=[(min_year-1), (max_year+1)], tickfont=dict(size=16), automargin=True, showgrid=False, zeroline=False),
        yaxis=dict(tickvals=wordlist, ticktext=wordlist, tickmode='array', tickfont=dict(size=16), automargin=True, showgrid=False, zeroline=False),
        plot_bgcolor='white',
        paper_bgcolor='white',
        showlegend=False,
        margin=dict(l=1, r=1, t=1, b=1),
        # Cell-proportional canvas: ~50 px per word row, ~52 px per year column.
        height=top_n*50+2,
        width=(max_year-min_year)*52+100,
        autosize=False
    )

    fig.write_image("scatter_plot.png")
    st.image("scatter_plot.png")
    pio.write_image(fig, 'result.png', scale=4)
|
277 |
+
|
278 |
+
@st.cache_data(ttl=3600)
def linegraph(bursts, freq_data):
    """Draw one line chart per top word in a two-column subplot grid.

    Each subplot shows the word's yearly frequency; detected burst spans are
    shaded as area fills with a rotated "Weight: …" annotation.  The figure
    is saved to ``line_graph.png`` (shown inline) and ``result.png``
    (scale=4, for the high-resolution download button).

    Relies on the module-level ``num_rows``, ``top_n`` and ``running_total``
    values set in the main script flow.
    """
    fig = make_subplots(rows=num_rows, cols=2, subplot_titles=freq_data.columns[:top_n])

    cur_row, cur_col = 1, 1
    for i, word in enumerate(freq_data.columns[:top_n]):
        series = freq_data[word]

        # Frequency line with per-point labels and year/frequency hover text.
        fig.add_trace(go.Scatter(
            x=freq_data.index, y=series, mode='lines+markers+text', name=word,
            line_shape='linear',
            hoverinfo='text',
            hovertext=[f"Year: {yr}<br>Frequency: {freq}" for yr, freq in zip(freq_data.index, series)],
            text=series,
            textposition='top center'
        ), row=cur_row, col=cur_col)

        # Shade each burst interval for this word.
        for _, burst in bursts[bursts['label'] == word].iterrows():
            span_x = freq_data.index[burst['begin']:burst['end'] + 1]
            span_y = series[burst['begin']:burst['end'] + 1]

            #middle_y = sum(span_y) / len(span_y)
            y_post = min(series) + 1 if running_total == "Running total" else sum(span_y) / len(span_y)
            x_offset = 0.1

            # Area fill over the burst span.
            fig.add_trace(go.Scatter(
                x=span_x,
                y=span_y,
                fill='tozeroy', mode='lines', fillcolor='rgba(0,100,80,0.2)',
            ), row=cur_row, col=cur_col)

            if running_total == "Running total":
                align_value, valign_value = "left", "bottom"
            else:
                align_value, valign_value = "center", "middle"

            # Vertical weight label near the start of the burst span.
            fig.add_annotation(
                x=span_x[0] + x_offset,
                y=y_post,
                text=f"Weight: {burst['weight']:.2f}",
                showarrow=False,
                font=dict(
                    color="black",
                    size=12),
                align=align_value,
                valign=valign_value,
                textangle=270,
                row=cur_row, col=cur_col
            )

        # Advance left-to-right, top-to-bottom through the 2-column grid.
        cur_col += 1
        if cur_col > 2:
            cur_col = 1
            cur_row += 1

    fig.update_layout(
        showlegend=False,
        margin=dict(l=20, r=20, t=100, b=20),
        height=num_rows * 500,
        width=1500
    )

    fig.write_image("line_graph.png")
    st.image("line_graph.png")
    pio.write_image(fig, 'result.png', scale=4)
|
342 |
+
|
343 |
+
@st.cache_data(ttl=3600)
def download_result(freq_data, bursts):
    """Return (frequency CSV bytes, bursts CSV bytes) for the download buttons."""
    return convert_df(freq_data), convert_df(bursts)
|
348 |
|
|
|
349 |
uploaded_file = st.file_uploader('', type=['csv', 'txt'], on_change=reset_all)
|
350 |
|
351 |
if uploaded_file is not None:
|
352 |
try:
|
353 |
+
c1, c2, c3 = st.columns([3,3.5,3.5])
|
354 |
+
top_n = c1.number_input("Number of top words to analyze", min_value=5, value=10, step=1, on_change=reset_all)
|
355 |
+
viz_selected = c2.selectbox("Option for visualization",
|
356 |
+
("Line graph", "Scatter plot"), on_change=reset_all)
|
357 |
running_total = c3.selectbox("Option for counting words",
|
358 |
("Running total", "By occurrences each year"), on_change=reset_all)
|
359 |
|
360 |
+
d1, d2 = st.columns([3,7])
|
361 |
df, coldf, MIN, MAX, GAP = load_data(uploaded_file)
|
362 |
col_name = d1.selectbox("Select column to analyze",
|
363 |
(coldf), on_change=reset_all)
|
|
|
366 |
if (GAP != 0):
|
367 |
YEAR = st.slider('Year', min_value=MIN, max_value=MAX, value=(MIN, MAX), on_change=reset_all)
|
368 |
else:
|
369 |
+
e1.write('You only have data in ', (MAX))
|
370 |
sys.exit(1)
|
371 |
+
|
372 |
yearly_term_frequency, top_words = clean_data(df)
|
373 |
|
374 |
bursts, freq_data, num_unique_labels, num_rows = apply_burst_detection(top_words, yearly_term_frequency)
|
|
|
384 |
st.info(f'We detect a burst on {num_unique_labels} word(s)', icon="ℹ️")
|
385 |
elif num_unique_labels < top_n:
|
386 |
st.info(f'We only detect a burst on {num_unique_labels} word(s), which is {top_n - num_unique_labels} fewer than the top word(s)', icon="ℹ️")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
387 |
|
388 |
+
if viz_selected == "Line graph":
|
389 |
+
linegraph(bursts, freq_data)
|
390 |
+
|
391 |
+
elif viz_selected =="Scatter plot":
|
392 |
+
scattervis(bursts, freq_data)
|
393 |
+
|
394 |
+
csv1, csv2 = download_result(freq_data, bursts)
|
395 |
+
e1, e2, e3 = st.columns(3)
|
396 |
+
with open('result.png', "rb") as file:
|
397 |
+
btn = e1.download_button(
|
398 |
+
label="📊 Download high resolution image",
|
399 |
+
data=file,
|
400 |
+
file_name="burst.png",
|
401 |
+
mime="image/png")
|
402 |
+
|
403 |
e2.download_button(
|
404 |
+
"👉 Press to download list of top words",
|
405 |
+
csv1,
|
406 |
+
"top-keywords.csv",
|
407 |
+
"text/csv")
|
408 |
|
409 |
+
e3.download_button(
|
410 |
+
"👉 Press to download the list of detected bursts",
|
411 |
+
csv2,
|
412 |
+
"burst.csv",
|
413 |
+
"text/csv")
|
414 |
+
|
415 |
with tab2:
|
416 |
st.markdown('**Kleinberg, J. (2002). Bursty and hierarchical structure in streams. Knowledge Discovery and Data Mining.** https://doi.org/10.1145/775047.775061')
|
417 |
|
|
|
420 |
st.markdown('**Domicián Máté, Ni Made Estiyanti and Novotny, A. (2024) ‘How to support innovative small firms? Bibliometric analysis and visualization of start-up incubation’, Journal of Innovation and Entrepreneurship, 13(1).** https://doi.org/10.1186/s13731-024-00361-z')
|
421 |
st.markdown('**Lamba, M., Madhusudhan, M. (2022). Burst Detection. In: Text Mining for Information Professionals. Springer, Cham.** https://doi.org/10.1007/978-3-030-85085-2_6')
|
422 |
|
423 |
+
except:
|
424 |
+
st.error("Please ensure that your file is correct. Please contact us if you find that this is an error.", icon="🚨")
|
425 |
+
st.stop()
|