File size: 5,447 Bytes
a9f525a
 
 
0bfcadb
a9f525a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bd9233d
66b46d9
0bfcadb
a9f525a
bd9233d
a9f525a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e9f4096
a9f525a
 
 
 
 
 
 
 
2b7c89f
a9f525a
 
 
 
 
 
bd9233d
0bfcadb
66b46d9
a9f525a
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import pandas as pd
import numpy as np
import spacy
import os
import gradio as gr
import umap
from sklearn.cluster import OPTICS
from transformers import BertTokenizer, TFBertModel
import plotly.io as pio

# configuration params
# Use plotly's built-in dark template as the default for figures.
pio.templates.default = "plotly_dark"

# setting up the text in the page
# The strings below contain raw HTML and are rendered through gr.Markdown
# in the interface definition further down.
# Page heading.
TITLE = "<center><h1>BERTopic - For topics detection on text</h1></center>"
# Short blurb shown under the heading.
# NOTE(review): "end extract" looks like a typo for "and extract"; it is
# user-facing text, so it is left untouched here.
DESCRIPTION = r"""<center>Apply BERTopic to a given dataset end extract the most relevant topics.<br>
                 """
# Example inputs offered by gr.Examples: one row per example, one entry per
# input component (here a single CSV path for the file input).
EXAMPLES = [
    ["data/ecomm500.csv"],
]
# Footer with credits and links, rendered at the bottom of the page.
ARTICLE = r"""<center>
              Done by dr. Gabriel Lopez<br>
              This program follows the BERTopic philosophy, but actually has its own implementation.<br>
              For more please visit: <a href='https://sites.google.com/view/dr-gabriel-lopez/home'>My Page</a><br>
              For info about the BERTopic model can be <a href="https://maartengr.github.io/BERTopic/index.html">found here</a><br>
              </center>"""


def load_data(fileobj):
    """Load the uploaded CSV and keep only its ``text`` column.

    Only the first 500 rows are read to keep the downstream steps cheap, and
    malformed lines are skipped instead of aborting the whole load.

    Args:
        fileobj: Either something ``pandas.read_csv`` accepts directly (a
            path or a readable buffer) or an object that exposes the file
            path through a ``name`` attribute.

    Returns:
        A single-column ``DataFrame`` containing the ``text`` column.

    Raises:
        ValueError: If the CSV has no column named ``text``.
    """
    # Unwrap objects that carry the path in ``.name``; pass anything else
    # (plain path, buffer) straight through to pandas.
    source = getattr(fileobj, "name", fileobj)
    data = pd.read_csv(source, on_bad_lines="skip", nrows=500)
    # Explicit raise instead of ``assert``: assertions are stripped when
    # Python runs with ``-O``, which would silently disable the check.
    if "text" not in data.columns:
        raise ValueError("The data must have a column named 'text'")
    return data[["text"]]


def run_nlp_processing(data):
    """Lemmatize the ``text`` column as a reference "classic" NLP pipeline.

    Every document is run through spaCy's ``en_core_web_sm`` pipeline.
    Punctuation and stop-word tokens are dropped; the remaining tokens are
    replaced by their lower-cased lemma and joined with single spaces.

    Args:
        data: DataFrame with a ``text`` column.

    Returns:
        A new DataFrame (built with ``DataFrame.assign``) whose ``text``
        column holds the processed strings.
    """
    # NOTE(review): the tagger is disabled here; in spaCy v3 the lemmatizer
    # may rely on its output, so lemma quality should be confirmed.
    nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner"])
    # Leave one core free, but never request fewer than one worker:
    # ``os.cpu_count()`` can be 1 (which would yield 0 workers) or None
    # (which would make the subtraction raise TypeError).
    workers = max(1, (os.cpu_count() or 1) - 1)
    docs = [
        " ".join(
            token.lemma_.lower()
            for token in doc
            if not (token.is_punct or token.is_stop)
        )
        for doc in nlp.pipe(data["text"].values, n_process=workers)
    ]
    # Replace the raw text with its processed form.
    return data.assign(text=docs)


def run_bert_tokenization(data):
    """Show the action of the WordPiece algorithm on the ``text`` column.

    Each text is tokenized with the ``bert-base-uncased`` tokenizer
    (truncated to 128 token ids) and the resulting ids are mapped back to
    their token strings, so the sub-word splits are visible per row.

    Args:
        data: DataFrame with a ``text`` column.

    Returns:
        A new DataFrame with an added ``text_tokenized`` column holding, for
        every row, the space-separated tokens of that row's text (including
        whatever special tokens the tokenizer adds).
    """
    # Only the tokenizer is needed here; loading the full BERT model would
    # cost time and memory without contributing to the result.
    checkpoint = "bert-base-uncased"
    tokenizer = BertTokenizer.from_pretrained(checkpoint)
    # Missing values become empty strings so the tokenizer only sees text.
    texts = data["text"].fillna("").astype(str).tolist()
    # No padding / tensor conversion: the per-row id lists are what we want.
    encoded = tokenizer(texts, truncation=True, max_length=128)
    # One string per row. Assigning the whole encoding mapping as a column
    # (as done previously) does not yield one tokenization per row.
    tokens_per_row = [
        " ".join(tokenizer.convert_ids_to_tokens(ids))
        for ids in encoded["input_ids"]
    ]
    return data.assign(text_tokenized=tokens_per_row)


def run_bertopic(data):
    """Run the end-to-end topic pipeline: embed, project, cluster, plot.

    Steps:
        1. Tokenize and encode the ``text`` column with ``bert-base-uncased``.
        2. Mean-pool the token embeddings into one vector per document,
           ignoring padding positions.
        3. Reduce the document vectors to 3 dimensions with UMAP.
        4. Cluster the 3-D points with OPTICS.
        5. Plot the 3-D points coloured by cluster label.

    Args:
        data: DataFrame with a ``text`` column.

    Returns:
        The figure produced by ``plot_bertopic``.
    """
    # load BERT model (for embeddings)
    checkpoint = "bert-base-uncased"
    tokenizer = BertTokenizer.from_pretrained(checkpoint)
    model = TFBertModel.from_pretrained(checkpoint)
    # Run BERT tokenizing + encoding
    encoded = tokenizer(
        list(data["text"]),
        return_tensors="tf",
        truncation=True,
        padding=True,
        max_length=128,
    )
    output_bert = model(encoded)
    # Sentence embeddings = mean of the token embeddings. Padding positions
    # are masked out so short texts are not diluted by [PAD] vectors.
    # assumes last_hidden_state is (batch, seq, hidden) and attention_mask
    # is (batch, seq), per the transformers documentation.
    hidden = np.asarray(output_bert.last_hidden_state)
    mask = np.asarray(encoded["attention_mask"])[..., np.newaxis].astype(
        hidden.dtype
    )
    token_counts = np.clip(mask.sum(axis=1), 1, None)  # avoid division by 0
    sentence_vect = (hidden * mask).sum(axis=1) / token_counts
    data = data.assign(descr_vect=list(sentence_vect))
    # Use UMAP to lower the dimensionality of the embedding to 3D
    descr_vect_3d = umap.UMAP(n_components=3).fit_transform(sentence_vect)
    data["descr_vect_3d"] = list(descr_vect_3d)
    # Cluster the projected points with OPTICS. ``min_samples`` may not
    # exceed the number of samples, so clamp it for small datasets.
    min_samples = max(2, min(50, len(data)))
    clustering = OPTICS(min_samples=min_samples).fit(descr_vect_3d)
    data["cluster_label"] = clustering.labels_
    # Plot the 3D embedding
    return plot_bertopic(descr_vect_3d, data)


def plot_bertopic(descr_vect_3d, data):
    """Draw the documents as a 3-D scatter plot coloured by cluster label.

    Args:
        descr_vect_3d: 2-D array whose first three columns are used as the
            x, y and z coordinates of each point.
        data: Table providing a ``cluster_label`` column used for colouring.

    Returns:
        The plotly figure created by ``plotly.express.scatter_3d``.
    """
    import plotly.express as px

    # Split the coordinate matrix into its three axes.
    xs, ys, zs = (descr_vect_3d[:, axis] for axis in range(3))
    labels = data["cluster_label"]
    return px.scatter_3d(x=xs, y=ys, z=zs, color=labels)


# gradio interface
# Components are created in the order they appear on the page, so the
# statement order inside the ``with`` blocks defines the layout.
blocks = gr.Blocks()
with blocks:
    # physical elements
    # NOTE(review): session_state is not referenced by any listener below.
    session_state = gr.State([])
    gr.Markdown(TITLE)
    gr.Markdown(DESCRIPTION)
    with gr.Row():
        # Left column: data upload and preview.
        with gr.Column():
            gr.Markdown(
                "## Load the data (must be a csv file with a column named 'text')"
            )
            in_file = gr.File()
            gr.Markdown("## Inspect the data")
            in_data = gr.Dataframe(row_count=5)
            submit_button = gr.Button("Run BERTopic!")
            gr.Examples(inputs=in_file, examples=EXAMPLES)
        # Right column: pipeline description and results.
        with gr.Column():
            gr.Markdown("## BERTopic Flow")
            # NOTE(review): the label mentions HDBSCAN while run_bertopic in
            # this file clusters with OPTICS - confirm the intended wording.
            gr.Markdown(
                "Text -> Word-Piece Tokenization -> BERT-embedding -> UMAP -> HDBSCAN -> Topic"
            )
            gr.Markdown("## Processed Text")
            out_dataset = gr.Dataframe(row_count=5)
            gr.Markdown("## Embedding + Projection + Clustering")
            embedding_plot = gr.Plot(label="BERTopic projections")
            gr.Markdown("## Extracted Topics")
            topics_text = gr.Textbox(label="Topics", lines=50)
    gr.Markdown(ARTICLE)
    # event listeners
    # Uploading a file runs load_data and shows the result in the preview
    # table. NOTE(review): this rebinds ``in_file`` to the return value of
    # ``upload`` rather than the File component; nothing below uses it.
    in_file = in_file.upload(inputs=in_file, outputs=in_data, fn=load_data)
    # Clicking the button tokenizes the previewed data and shows it in the
    # "Processed Text" table.
    submit_button.click(inputs=in_data, outputs=out_dataset, fn=run_bert_tokenization)
    # Disabled listener that would feed the processed table into run_bertopic.
    # With it commented out, embedding_plot and topics_text receive no output
    # from any listener in this section.
    # out_dataset.change(inputs=out_dataset, outputs=embedding_plot, fn=run_bertopic)

# Starts the app as a module-level side effect (runs on import as well).
blocks.launch()