import os
import re
import string

import matplotlib.pyplot as plt
import numpy as np
import openai
import pandas as pd
import streamlit as st
import nltk
from nltk.corpus import stopwords
from scipy.cluster.hierarchy import linkage, dendrogram
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

openai.api_key = os.getenv("OPENAI_API_KEY")

# Download the NLTK stopword list used by the preprocessing function below
nltk.download("stopwords")
# Text preprocessing: strip stopwords, lowercase, and remove noise
def clean_text_1(text):
    stop_words = set(stopwords.words("english"))

    def remove_stopwords(text):
        return " ".join([word for word in str(text).split() if word not in stop_words])

    text = remove_stopwords(text)
    text = str(text).lower()                     # Lowercase words
    text = re.sub(r"\[(.*?)\]", " ", text)       # Remove [+XYZ chars] in content
    text = re.sub(r"\s+", " ", text)             # Collapse multiple spaces
    text = re.sub(r"\w+…|…", " ", text)          # Remove ellipsis (and preceding word)
    text = re.sub(r"(?<=\w)-(?=\w)", " ", text)  # Replace dash between words with a space
    text = re.sub(
        f"[{re.escape(string.punctuation)}]", "", text
    )  # Remove punctuation
    return text
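
# A quick sanity-check example of the function above (input value is assumed,
# not from the original; "are" is dropped because it is an NLTK stopword):
# clean_text_1("Self-driving cars are great!") -> "self driving cars great"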
# Load the Hugging Face sentence-transformer model used for embeddings
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")


def get_embedding(text):
    # Optionally preprocess first, e.g. text = clean_text_1(text)
    return model.encode(text)
# Streamlit UI configuration
st.set_page_config(
    page_title="text_clustering.py",
    page_icon="👋",
)
# Upload file
uploaded_file = st.file_uploader("Choose a file")

if uploaded_file:
    # Read data from file
    df = pd.read_csv(uploaded_file)

    # Drop rows with missing text
    df = df[df["text"].notna()].reset_index(drop=True)

    # Embed every document and stack the vectors into one matrix
    df["embedding"] = df["text"].apply(get_embedding)
    matrix = np.vstack(df["embedding"].values)

    # Distance threshold slider
    distance_threshold = st.slider("Select Distance Threshold", min_value=0.1, max_value=2.0, value=1.1, step=0.1)

    # Ward agglomerative clustering; the threshold, not a fixed k, determines the cluster count
    agg_clustering = AgglomerativeClustering(n_clusters=None, distance_threshold=distance_threshold, linkage="ward")
    cluster_labels = agg_clustering.fit_predict(matrix)
    df["Cluster"] = cluster_labels
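
    # Optional quality check, a sketch using the metrics imported above: these
    # internal scores are only defined for 2 <= n_clusters < n_samples, hence
    # the guard, and higher silhouette / Calinski-Harabasz and lower
    # Davies-Bouldin values indicate better-separated clusters.
    if 1 < len(np.unique(cluster_labels)) < len(matrix):
        st.write("Silhouette score:", silhouette_score(matrix, cluster_labels))
        st.write("Davies-Bouldin index:", davies_bouldin_score(matrix, cluster_labels))
        st.write("Calinski-Harabasz index:", calinski_harabasz_score(matrix, cluster_labels))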
    # Visualize clusters with t-SNE
    tsne = TSNE(n_components=2, perplexity=15, random_state=42, init="random", learning_rate=200)
    vis_dims2 = tsne.fit_transform(matrix)
    x = [x for x, y in vis_dims2]
    y = [y for x, y in vis_dims2]

    unique_clusters, cluster_counts = np.unique(cluster_labels, return_counts=True)

    # One color per cluster, sampled from viridis
    colormap = plt.cm.get_cmap("viridis", len(unique_clusters))

    fig, ax = plt.subplots()
    for category, (color, size) in enumerate(zip(colormap.colors, cluster_counts)):
        xs = np.array(x)[cluster_labels == category]
        ys = np.array(y)[cluster_labels == category]
        ax.scatter(xs, ys, color=color, alpha=0.3, label=f"Cluster {category} (Size: {size})")
        # Mark each cluster's centroid with an x
        ax.scatter(xs.mean(), ys.mean(), marker="x", color=color, s=100)
    ax.set_title("Clusters identified, visualized in 2D using t-SNE")
    ax.legend()

    # Display the plot in Streamlit
    st.pyplot(fig)
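
    # A sketch of a complementary view: since scipy's hierarchy tools are
    # imported above, the same Ward merge tree can be drawn as a dendrogram to
    # help choose the distance threshold (fig2/ax2 are illustrative names;
    # assumes the matrix is small enough to pass to linkage directly).
    Z = linkage(matrix, method="ward")
    fig2, ax2 = plt.subplots()
    dendrogram(Z, ax=ax2, no_labels=True, color_threshold=distance_threshold)
    ax2.axhline(distance_threshold, linestyle="--", color="grey")
    ax2.set_title("Ward linkage dendrogram (cut at the selected threshold)")
    st.pyplot(fig2)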
st.text_area("Number of Cluster Labels", value=len(np.unique(cluster_labels.tolist())))
# Reading a review which belong to each group.
rev_per_cluster = 1
n_clusters = len(np.unique(cluster_labels.tolist()))
    for i in range(n_clusters):
        print(f"Cluster {i} Theme:", end=" ")
        reviews = "\n".join(
            df[df.Cluster == i]
            .text.str.replace("Title: ", "")
            .str.replace("\n\nContent: ", ": ")
            .sample(rev_per_cluster, random_state=42)
            .values
        )
        messages = [
            {"role": "user", "content": f'What do the following have in common?\n\nValues:\n"""\n{reviews}\n"""\n\nTheme:'}
        ]
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=messages,
            temperature=0,
            max_tokens=64,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
        )
        theme = response.choices[0].message.content.replace("\n", "")
        print(theme)
        st.text_area(f"Cluster {i} Theme", value=theme)
        # sample_cluster_rows = df[df.Cluster == i].sample(rev_per_cluster, random_state=42)
        # for j in range(rev_per_cluster):
        #     print(sample_cluster_rows.Score.values[j], end=", ")
        #     print(sample_cluster_rows.Summary.values[j], end=": ")
        #     print(sample_cluster_rows.Text.str[:70].values[j])
        # print("-" * 100)