DataAIDemo

Sleeping

File size: 5,482 Bytes

import streamlit as st
import openai
import os
openai.api_key = os.getenv("OPENAI_API_KEY")

from streamlit import session_state
import numpy as np
import json
from io import StringIO
import json
import os
import pandas as pd
from sentence_transformers import SentenceTransformer
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.cluster import AgglomerativeClustering,k_means
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
import numpy as np
nltk.download("stopwords")
import nltk
nltk.download('punkt')
#text preprocessing function
def clean_text_1(text):
    stop_words = set(stopwords.words("english"))
    def remove_stopwords(text):
      return " ".join([word for word in str(text).split() if word not in stop_words])
    text = remove_stopwords(text)
    text = str(text).lower()  # Lowercase words
    text = re.sub(r"\[(.*?)\]", " ", text)  # Remove [+XYZ chars] in content
    text = re.sub(r"\s+", " ", text)  # Remove multiple spaces in content
    text = re.sub(r"\w+…|…", " ", text)  # Remove ellipsis (and last word)
    text = re.sub(r"(?<=\w)-(?=\w)", " ", text)  # Replace dash between words
    # text = re.sub(stop_words, " ", text)  # Replace dash between words
    text = re.sub(
        f"[{re.escape(string.punctuation)}]", "", text
    )  # Remove punctuation
    return text
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2') #calling hugging face model for embeddings here

# Load sentence transformer model
def get_embedding(text):
    # Assuming you have a function clean_text_1 to clean the text
    #text = clean_text_1(text)
    return model.encode(text)

# Streamlit UI configuration
st.set_page_config(
    page_title="text_clustering.py",
    page_icon="👋",
)

# Upload file
uploaded_file = st.file_uploader("Choose a file")
if uploaded_file:
    # Read data from file
    df = pd.read_csv(uploaded_file)
    
    # Clean data
    df = df[df['text'].notna()].reset_index(drop=True)
    
    # Get embeddings
    df['embedding'] = df['text'].apply(get_embedding)
    matrix = np.vstack(df['embedding'].values)

    # Distance threshold slider
    distance_threshold = st.slider("Select Distance Threshold", min_value=0.1, max_value=2.0, value=1.1, step=0.1)

    # Perform clustering
    agg_clustering = AgglomerativeClustering(n_clusters=None, distance_threshold=distance_threshold, linkage='ward')
    cluster_labels = agg_clustering.fit_predict(matrix)
    df['Cluster'] = cluster_labels
    
    # Visualize clusters with t-SNE
    tsne = TSNE(n_components=2, perplexity=15, random_state=42, init="random", learning_rate=200)
    vis_dims2 = tsne.fit_transform(matrix)

    x = [x for x, y in vis_dims2]
    y = [y for x, y in vis_dims2]

    unique_clusters, cluster_counts = np.unique(cluster_labels, return_counts=True)

    # Create a colormap based on cluster sizes
    colormap = plt.cm.get_cmap("viridis", len(unique_clusters))

    # Set up Streamlit app

    fig, ax = plt.subplots()
    for category, (color, size) in enumerate(zip(colormap.colors, cluster_counts)):
        xs = np.array(x)[cluster_labels == category]
        ys = np.array(y)[cluster_labels == category]

        ax.scatter(xs, ys, color=color, alpha=0.3, label=f'Cluster {category} (Size: {size})')

        avg_x = xs.mean()
        avg_y = ys.mean()

        ax.scatter(avg_x, avg_y, marker="x", color=color, s=100)

    ax.set_title("Clusters identified visualized in language 2D using t-SNE")
    ax.legend()

    # Display the plot in Streamlit
    st.pyplot(fig)
    st.text_area("Number of Cluster Labels", value=len(np.unique(cluster_labels.tolist())))


    # Reading a review which belong to each group.
    rev_per_cluster = 1
    n_clusters = len(np.unique(cluster_labels.tolist()))

    for i in range(n_clusters):
        print(f"Cluster {i} Theme:", end=" ")

        reviews = "\n".join(
            df[df.Cluster == i]
            .text.str.replace("Title: ", "")
            .str.replace("\n\nContent: ", ":  ")
            .sample(rev_per_cluster, random_state=42)
            .values
        )

        messages = [
            {"role": "user", "content": f'What do the following  have in common?\n\nValues:\n"""\n{reviews}\n"""\n\nTheme:'}
        ]

        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=messages,
            temperature=0,
            max_tokens=64,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0)
        print(response.choices[0].message.content.replace("\n", ""))
        st.text_area(f"Cluster {i} Theme", value=response.choices[0].message.content.replace("\n", ""))

#         sample_cluster_rows = df[df.Cluster == i].sample(rev_per_cluster, random_state=42)
#         for j in range(rev_per_cluster):
#             print(sample_cluster_rows.Score.values[j], end=", ")
#             print(sample_cluster_rows.Summary.values[j], end=":   ")
#             print(sample_cluster_rows.Text.str[:70].values[j])

#         print("-" * 100)
#