Spaces:
Runtime error
Runtime error
jharrison27
committed on
Commit
•
66cf393
1
Parent(s):
e2775c7
fix looping
Browse files
app.py
CHANGED
@@ -1,15 +1,10 @@
|
|
1 |
import streamlit as st
|
2 |
-
import logging
|
3 |
from transformers import pipeline
|
4 |
from sklearn.metrics.pairwise import cosine_similarity
|
5 |
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
|
6 |
from sklearn.cluster import KMeans
|
7 |
import numpy as np
|
8 |
|
9 |
-
# Setting up logging
|
10 |
-
logging.basicConfig(level=logging.INFO)
|
11 |
-
logger = logging.getLogger(__name__)
|
12 |
-
|
13 |
# Mock data
|
14 |
mock_words = [
|
15 |
"apple", "banana", "cherry", "date", # Fruits
|
@@ -27,7 +22,6 @@ models = {
|
|
27 |
|
28 |
@st.cache_resource
|
29 |
def load_models():
|
30 |
-
logger.info("Loading models...")
|
31 |
pipelines = {}
|
32 |
for name, model_name in models.items():
|
33 |
pipelines[name] = pipeline('feature-extraction', model=model_name)
|
@@ -39,37 +33,28 @@ def embed_words(words, model_name):
|
|
39 |
"""
|
40 |
Embed the given words using the specified model and return the averaged embeddings.
|
41 |
"""
|
42 |
-
logger.info(f"Embedding words using model {model_name}...")
|
43 |
embedder = pipelines[model_name]
|
44 |
embeddings = embedder(words)
|
45 |
return np.array([np.mean(embedding[0], axis=0) for embedding in embeddings])
|
46 |
|
47 |
-
def
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
for i in range(1, 5):
|
63 |
-
cluster = [word for idx, word in enumerate(remaining_words) if labels[idx] == i]
|
64 |
-
if len(cluster) == 4:
|
65 |
-
grouped_words.append(cluster)
|
66 |
-
remaining_words = [word for word in remaining_words if word not in cluster]
|
67 |
-
break
|
68 |
-
return grouped_words
|
69 |
|
70 |
def display_clusters(clusters):
|
71 |
-
|
72 |
-
for i, words in enumerate(clusters):
|
73 |
st.markdown(f"### Group {i+1}")
|
74 |
st.write(", ".join(words))
|
75 |
|
@@ -83,8 +68,8 @@ def main():
|
|
83 |
|
84 |
if st.button("Generate Clusters"):
|
85 |
with st.spinner("Generating clusters..."):
|
86 |
-
clusters =
|
87 |
-
|
88 |
|
89 |
if __name__ == "__main__":
|
90 |
main()
|
|
|
1 |
import streamlit as st
|
|
|
2 |
from transformers import pipeline
|
3 |
from sklearn.metrics.pairwise import cosine_similarity
|
4 |
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
|
5 |
from sklearn.cluster import KMeans
|
6 |
import numpy as np
|
7 |
|
|
|
|
|
|
|
|
|
8 |
# Mock data
|
9 |
mock_words = [
|
10 |
"apple", "banana", "cherry", "date", # Fruits
|
|
|
22 |
|
23 |
@st.cache_resource
|
24 |
def load_models():
|
|
|
25 |
pipelines = {}
|
26 |
for name, model_name in models.items():
|
27 |
pipelines[name] = pipeline('feature-extraction', model=model_name)
|
|
|
33 |
"""
|
34 |
Embed the given words using the specified model and return the averaged embeddings.
|
35 |
"""
|
|
|
36 |
embedder = pipelines[model_name]
|
37 |
embeddings = embedder(words)
|
38 |
return np.array([np.mean(embedding[0], axis=0) for embedding in embeddings])
|
39 |
|
40 |
+
def cluster_words(words, model_name, method, n_clusters=4):
    """
    Group words into at most `n_clusters` clusters based on their embeddings.

    Args:
        words: Sequence of words to cluster.
        model_name: Name of the embedding model, passed through to embed_words.
        method: Either 'Cosine Similarity' (average-linkage hierarchical
            clustering) or 'K-means'.
        n_clusters: Maximum number of clusters to form (defaults to 4).

    Returns:
        A dict mapping each 1-based cluster label (1..n_clusters) to the list
        of words assigned to it. Every label is present as a key; a label may
        map to an empty list if no word was assigned to it.

    Raises:
        ValueError: If `method` is not one of the supported method names.
    """
    supported_methods = ('Cosine Similarity', 'K-means')
    if method not in supported_methods:
        # Fail before the embedding step; otherwise `labels` would never be
        # bound and the function would die later with an UnboundLocalError.
        raise ValueError(
            f"Unknown clustering method {method!r}; expected one of {supported_methods}"
        )

    embeddings = embed_words(words, model_name)

    if method == 'Cosine Similarity':
        # The rows of the similarity matrix are handed to linkage() as
        # observation vectors, so words are grouped by how alike their
        # similarity profiles are (cosine distance between rows).
        sim_matrix = cosine_similarity(embeddings)
        Z = linkage(sim_matrix, 'average', metric='cosine')
        # 'maxclust' yields 1-based labels with no more than n_clusters groups.
        labels = fcluster(Z, t=n_clusters, criterion='maxclust')
    else:
        kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(embeddings)
        # K-means labels start at 0; shift them to match the 1-based labels
        # produced by the hierarchical branch.
        labels = kmeans.labels_ + 1

    clusters = {i: [] for i in range(1, n_clusters + 1)}
    for word, label in zip(words, labels):
        # Convert NumPy integers to plain ints so the dict keys stay builtin.
        clusters[int(label)].append(word)
    return clusters
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
55 |
|
56 |
def display_clusters(clusters):
    """
    Render each cluster as a "Group N" heading followed by its words.

    Args:
        clusters: Mapping of cluster label to the list of words in that
            cluster, as returned by cluster_words (labels start at 1).
    """
    for label, words in clusters.items():
        # The keys are already 1-based labels, so print them unchanged;
        # adding 1 here would number the groups 2..5 instead of 1..4.
        st.markdown(f"### Group {label}")
        st.write(", ".join(words))
|
60 |
|
|
|
68 |
|
69 |
if st.button("Generate Clusters"):
|
70 |
with st.spinner("Generating clusters..."):
|
71 |
+
clusters = cluster_words(mock_words, model_name, clustering_method)
|
72 |
+
display_clusters(clusters)
|
73 |
|
74 |
if __name__ == "__main__":
|
75 |
main()
|