import streamlit as st
from transformers import pipeline
from sklearn.cluster import KMeans
import numpy as np

# Mock puzzle: 16 words forming four hidden groups of four, like a Connections board
mock_words = [
    "apple", "banana", "cherry", "date",  # Fruits
    "car", "truck", "bus", "bicycle",  # Vehicles
    "red", "blue", "green", "yellow",  # Colors
    "cat", "dog", "rabbit", "hamster"  # Pets
]

# Available embedding models: display name -> Hugging Face checkpoint
models = {
    'DistilBERT': 'distilbert-base-uncased',
    'BERT': 'bert-base-uncased',
    'RoBERTa': 'roberta-base'
}

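# st.cache_resource keeps the loaded pipelines alive across Streamlit reruns,
# so each model is downloaded and initialized only once.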
@st.cache_resource
def load_models():
    pipelines = {}
    for name, model_name in models.items():
        pipelines[name] = pipeline('feature-extraction', model=model_name)
    return pipelines

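# All three pipelines load eagerly at startup; the first run downloads the weights.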
pipelines = load_models()

def embed_words(words, model_name):
    """
    Embed the given words with the selected model, mean-pooling the token
    embeddings so each word maps to a single fixed-size vector.
    """
    embedder = pipelines[model_name]
    # The feature-extraction pipeline returns, per input, a nested list of shape
    # (1, num_tokens, hidden_size); average over the token axis.
    embeddings = embedder(words)
    return np.array([np.mean(embedding[0], axis=0) for embedding in embeddings])

def iterative_clustering(words, model_name):
    """
    Greedily peel off groups of four: cluster the remaining words, keep the
    most cohesive four-word cluster, and repeat until fewer than four remain.
    """
    remaining_words = words[:]
    grouped_words = []

    while len(remaining_words) >= 4:
        embeddings = embed_words(remaining_words, model_name)
        # One cluster per four remaining words, capped at four clusters
        n_clusters = min(4, len(remaining_words) // 4)
        kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=0).fit(embeddings)
        # Collect at most four words per cluster label
        clusters = {i: [] for i in range(n_clusters)}
        for word, label in zip(remaining_words, kmeans.labels_):
            if len(clusters[label]) < 4:
                clusters[label].append(word)

        # Select the most cohesive cluster (smallest mean distance to its centroid)
        best_cluster, _ = select_most_cohesive_cluster(clusters, kmeans, embeddings)
        if best_cluster is None:
            # Defensive guard: by pigeonhole some cluster always has four
            # members here, but never iterate over None below.
            break

        # Store the best cluster and drop its words from the pool
        grouped_words.append(best_cluster)
        remaining_words = [word for word in remaining_words if word not in best_cluster]

    return grouped_words

def select_most_cohesive_cluster(clusters, kmeans_model, embeddings):
    """
    Among the four-word clusters, return the one whose members lie closest to
    their centroid on average, along with its cluster index.
    """
    min_distance = float('inf')
    best_cluster = None
    best_idx = -1
    for idx, cluster in clusters.items():
        if len(cluster) == 4:
            # Cohesion is measured over every embedding assigned to this label,
            # even when the kept word list above was capped at four.
            cluster_embeddings = embeddings[[i for i, label in enumerate(kmeans_model.labels_) if label == idx]]
            centroid = kmeans_model.cluster_centers_[idx]
            distance = np.mean(np.linalg.norm(cluster_embeddings - centroid, axis=1))
            if distance < min_distance:
                min_distance = distance
                best_cluster = cluster
                best_idx = idx
    return best_cluster, best_idx

def display_clusters(clusters):
    for i, words in enumerate(clusters):
        st.markdown(f"### Group {i+1}")
        st.write(", ".join(words))

def main():
    st.title("NYT Connections Solver")
    st.write("This app demonstrates solving the NYT Connections game using word embeddings and clustering.")
    st.write("Select an embedding model from the dropdown menu and click 'Generate Clusters' to see the grouped words.")
    
    # Dropdown menu for selecting the embedding model
    model_name = st.selectbox("Select Embedding Model", list(models.keys()))

    if st.button("Generate Clusters"):
        with st.spinner("Generating clusters..."):
            clusters = iterative_clustering(mock_words, model_name)
        display_clusters(clusters)

if __name__ == "__main__":
    main()
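
# Launch locally with: streamlit run <this_file>.py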