File size: 4,192 Bytes
2ddc64d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
from typing import List
from .config import BaselineConfig, Configuration
from ..utils import __create_model__
import numpy as np
# from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
# from yellowbrick.cluster import KElbowVisualizer
from .clusters import ClusterList
from unsupervised_learning.clustering import GaussianMixture, Silhouette

class ClusterPipeline:
    """
    Document pipeline with four stages: word embeddings -> dimension reduction
    -> clustering -> keywords extraction.

    Every stage model is created by ``__create_model__`` from the matching
    field of the configuration. The dimension-reduction, clustering and
    keywords-extraction stages are skipped when their model is ``None``.
    """

    def __init__(self, config: "Configuration" = None):
        """
        :param config: pipeline configuration; ``BaselineConfig()`` is used when omitted.
        """
        self.__setup__(BaselineConfig() if config is None else config)

    def __setup__(self, config: "Configuration"):
        """
        Build the four stage models from ``config``.

        :param config: object exposing ``plm``, ``dimension_reduction``,
            ``clustering`` and ``keywords_extraction`` fields.
        """
        self.PTM = __create_model__(config.plm)
        self.dimension_reduction = __create_model__(config.dimension_reduction)
        self.clustering = __create_model__(config.clustering)
        self.keywords_extraction = __create_model__(config.keywords_extraction)

    def __1_generate_word_embeddings__(self, documents: List[str]):
        """
        Encode the documents with the configured embedding model.

        :param documents: a list of N strings
        :return: whatever ``self.PTM.encode`` returns (the original author
            notes an Nx384 np.ndarray for sentence-transformers; the actual
            shape depends on the configured model)
        """
        print('>>> start generating word embeddings...')
        embeddings = self.PTM.encode(documents)
        # Report success only after encoding has actually completed, so the
        # message is not printed when ``encode`` raises.
        print('>>> successfully generated word embeddings...')
        return embeddings

    def __2_dimenstion_reduction__(self, embeddings):
        """
        Reduce the embedding dimensionality, if a reducer is configured.

        :param embeddings: NxD
        :return: Nxd, d<<D; the input unchanged when no reducer is configured
        """
        if self.dimension_reduction is None:
            return embeddings

        print('>>> start dimension reduction...')
        reduced = self.dimension_reduction.dimension_reduction(embeddings)
        print('>>> finished dimension reduction...')
        return reduced

    def __3_clustering__(self, embeddings, return_cluster_centers = False, max_k: int =10, standarization = False):
        """
        Pick a cluster count, cluster the embeddings and wrap the labels in a ``ClusterList``.

        :param embeddings: Nxd
        :param return_cluster_centers: when true, also return the centers
            produced by the clustering model.
        :param max_k: upper bound handed to ``Silhouette`` (called as
            ``Silhouette(GaussianMixture, 2, max_k)``; presumably the search
            range for k is 2..max_k -- confirm against that class).
        :param standarization: when true, the embeddings are passed through
            ``StandardScaler().fit_transform`` before k selection and clustering.
        :return: ``clusters`` or ``(clusters, cluster_centers)``. NOTE: when no
            clustering model is configured the embeddings are returned as-is,
            regardless of ``return_cluster_centers``.
        """
        if self.clustering is None:
            return embeddings

        print('>>> start clustering...')

        if standarization:
            print('>>> start standardization...')
            embeddings = StandardScaler().fit_transform(embeddings)
            print('>>> finished standardization...')

        # The number of clusters is chosen on the same (possibly standardized)
        # data that is clustered afterwards.
        best_k = Silhouette(GaussianMixture, 2, max_k).get_best_k(embeddings)
        print(f'>>> The best K is {best_k}.')

        labels, cluster_centers = self.clustering(embeddings, k=best_k)
        clusters = ClusterList(best_k)
        clusters.instantiate(labels)
        print('>>> finished clustering...')

        if return_cluster_centers:
            return clusters, cluster_centers
        return clusters

    def __4_keywords_extraction__(self, clusters: "ClusterList", documents: List[str]):
        """
        Attach keyphrases to every cluster, if an extractor is configured.

        :param clusters: iterable of clusters; each must provide ``elements()``
            (indices into ``documents``) and ``add_keyphrase(...)``.
        :param documents: the N documents that were clustered
        :return: clusters, where each cluster has added keyphrases (the input
            unchanged when no extractor is configured)
        """
        if self.keywords_extraction is None:
            return clusters

        print('>>> start keywords extraction')
        for cluster in clusters:
            # The extractor is called once per cluster with all of that
            # cluster's documents, not once per document.
            cluster_documents = [documents[doc_id] for doc_id in cluster.elements()]
            cluster.add_keyphrase(self.keywords_extraction(cluster_documents))
        print('>>> finished keywords extraction')
        return clusters

    def __call__(self, documents: List[str], max_k: int = 10, standarization = False):
        """
        Run the full pipeline on ``documents``.

        :param documents: a list of N strings
        :param max_k: forwarded to the clustering stage; defaults to 10, the
            same default the clustering stage itself uses.
        :param standarization: forwarded to the clustering stage.
        :return: the output of the keywords-extraction stage
        """
        print('>>> pipeline starts...')
        embeddings = self.__1_generate_word_embeddings__(documents)
        embeddings = self.__2_dimenstion_reduction__(embeddings)
        clusters = self.__3_clustering__(embeddings, max_k=max_k, standarization=standarization)
        outputs = self.__4_keywords_extraction__(clusters, documents)
        print('>>> pipeline finished!\n')
        return outputs