import json
import os
import re
import string
from functools import lru_cache
from io import StringIO

import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import nltk
import numpy as np
import openai
import pandas as pd
import streamlit as st
from nltk import word_tokenize
from nltk.corpus import stopwords
from openai import OpenAI
from scipy.cluster.hierarchy import dendrogram, fcluster, linkage
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering, MiniBatchKMeans, k_means
from sklearn.manifold import TSNE
from sklearn.metrics import (
    calinski_harabasz_score,
    davies_bouldin_score,
    silhouette_score,
)
from streamlit import session_state

# NLTK corpora fetched at start-up; "stopwords" is read by clean_text_1 below.
nltk.download("stopwords")
nltk.download("punkt")

# Compiled once: strips every ASCII punctuation character.
_PUNCTUATION_RE = re.compile(f"[{re.escape(string.punctuation)}]")


@lru_cache(maxsize=1)
def _english_stop_words():
    """Return the NLTK English stop-word list as a set, built only once."""
    return frozenset(stopwords.words("english"))


# text preprocessing function
def clean_text_1(text):
    """Normalise a piece of text before it is embedded.

    Steps, in order: lowercase, drop English stop words, remove
    ``[...]`` spans (e.g. ``[+123 chars]``), remove ellipses together with
    the truncated word in front of them, turn intra-word dashes into
    spaces, strip punctuation, and finally collapse runs of whitespace.

    Args:
        text: The value to clean; it is converted with ``str()`` first.

    Returns:
        The cleaned, lowercased string without leading/trailing whitespace.
    """
    stop_words = _english_stop_words()

    # Lowercase *before* filtering: the stop-word list is lowercase, so
    # "The" or "And" would otherwise slip through.
    text = str(text).lower()
    text = " ".join(word for word in text.split() if word not in stop_words)

    text = re.sub(r"\[(.*?)\]", " ", text)  # Remove [+XYZ chars] in content
    text = re.sub(r"\w+…|…", " ", text)  # Remove ellipsis (and last word)
    text = re.sub(r"(?<=\w)-(?=\w)", " ", text)  # Replace dash between words
    text = _PUNCTUATION_RE.sub("", text)  # Remove punctuation

    # Collapse whitespace last, because the substitutions above insert spaces.
    return re.sub(r"\s+", " ", text).strip()


# Sentence-transformer model (Hugging Face) used to produce the embeddings.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# OpenAI client, created with its default configuration.
client = OpenAI()


def get_embedding(text):
    """Return the sentence-transformer embedding of ``text``.

    The input is passed to ``model.encode`` unchanged; ``clean_text_1`` is
    currently not applied here.

    Args:
        text: The text to embed (forwarded as-is to ``model.encode``).

    Returns:
        Whatever ``model.encode`` returns for the given input.
    """
    return model.encode(text)


# Streamlit UI
# configuration: page title and icon for the clustering app.
st.set_page_config(
    page_title="text_clustering.py",
    page_icon="👋",
)

# Upload file
uploaded_file = st.file_uploader("Choose a file")

# NOTE(review): Streamlit re-executes the script on widget interaction, so
# moving the slider below repeats the embedding step and the GPT-4 calls.
# Consider caching those steps if that becomes slow or costly.
if uploaded_file:
    # Read data from file
    df = pd.read_csv(uploaded_file)

    # Everything below reads the "text" column; fail with a clear message
    # instead of a KeyError traceback.
    if "text" not in df.columns:
        st.error("The uploaded CSV must contain a 'text' column.")
        st.stop()

    # Clean data: keep only rows that actually have text.
    df = df[df["text"].notna()].reset_index(drop=True)

    # Clustering and t-SNE both need at least two samples.
    if len(df) < 2:
        st.error("At least two non-empty 'text' rows are required for clustering.")
        st.stop()

    # Get embeddings in a single call rather than one call per row.
    matrix = np.asarray(get_embedding(df["text"].astype(str).tolist()))
    df["embedding"] = list(matrix)

    # Distance threshold slider
    distance_threshold = st.slider(
        "Select Distance Threshold",
        min_value=0.1,
        max_value=2.0,
        value=1.1,
        step=0.1,
    )

    # Perform clustering; the number of clusters follows from the threshold.
    agg_clustering = AgglomerativeClustering(
        n_clusters=None,
        distance_threshold=distance_threshold,
        linkage="ward",
    )
    cluster_labels = agg_clustering.fit_predict(matrix)
    df["Cluster"] = cluster_labels

    # Visualize clusters with t-SNE. Perplexity must stay below the number
    # of samples, so it is capped for small files.
    tsne = TSNE(
        n_components=2,
        perplexity=min(15, len(df) - 1),
        random_state=42,
        init="random",
        learning_rate=200,
    )
    vis_dims2 = tsne.fit_transform(matrix)

    unique_clusters, cluster_counts = np.unique(cluster_labels, return_counts=True)
    n_clusters = len(unique_clusters)

    # One evenly spaced viridis colour per cluster. plt.get_cmap is used
    # because matplotlib.cm.get_cmap is no longer available in recent
    # Matplotlib releases.
    cluster_colors = plt.get_cmap("viridis")(np.linspace(0, 1, n_clusters))

    fig, ax = plt.subplots()
    for label, rgba, size in zip(unique_clusters, cluster_colors, cluster_counts):
        in_cluster = cluster_labels == label
        xs = vis_dims2[in_cluster, 0]
        ys = vis_dims2[in_cluster, 1]
        color = tuple(rgba)
        ax.scatter(xs, ys, color=color, alpha=0.3, label=f"Cluster {label} (Size: {size})")
        # Mark the centre of each cluster with an "x".
        ax.scatter(xs.mean(), ys.mean(), marker="x", color=color, s=100)
    ax.set_title("Clusters identified visualized in language 2D using t-SNE")
    ax.legend()

    # Display the plot in Streamlit
    st.pyplot(fig)
    st.text_area("Number of Cluster Labels", value=str(n_clusters))

    # Read a few texts from each cluster and ask the model for a common theme.
    rev_per_cluster = 3
    for i in unique_clusters:
        cluster_texts = df.loc[df["Cluster"] == i, "text"].astype(str)
        # Small clusters may hold fewer than rev_per_cluster rows; sampling
        # more than the population would raise a ValueError.
        sample_size = min(rev_per_cluster, len(cluster_texts))
        reviews = "\n".join(
            cluster_texts.str.replace("Title: ", "", regex=False)
            .str.replace("\n\nContent: ", ": ", regex=False)
            .sample(sample_size, random_state=42)
            .values
        )

        messages = [
            {
                "role": "user",
                "content": f'What do the following have in common?\n\nValues:\n"""\n{reviews}\n"""\n\nTheme:',
            }
        ]
        response = client.chat.completions.create(
            model="gpt-4",
            messages=messages,
            temperature=0,
            max_tokens=64,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
        )
        # Guard against an empty completion before flattening newlines.
        theme = (response.choices[0].message.content or "").replace("\n", "")

        print(f"Cluster {i} Theme: {theme}")
        st.text_area(f"Cluster {i} Theme", value=theme)