DrGabrielLopez committed on
Commit
a9f525a
1 Parent(s): e772cb4
Files changed (3)
  1. app.py +154 -0
  2. data/ecomm500.csv +0 -0
  3. requirements.txt +9 -0
app.py ADDED
@@ -0,0 +1,154 @@
+ import os
+
+ import gradio as gr
+ import numpy as np
+ import pandas as pd
+ import plotly.express as px
+ import plotly.io as pio
+ import spacy
+ import umap
+ from sklearn.cluster import OPTICS
+ from transformers import BertTokenizer, TFBertModel
+
+ # configuration params
+ pio.templates.default = "plotly_dark"
+
+ # set up the text on the page
+ TITLE = "<center><h1>BERTopic - Topic detection on text</h1></center>"
+ DESCRIPTION = r"""<center>Apply BERTopic to a given dataset and extract the most relevant topics.<br>
+ </center>"""
+ EXAMPLES = [
+     ["data/ecomm500.csv"],
+ ]
+ ARTICLE = r"""<center>
+ Done by Dr. Gabriel Lopez<br>
+ This program follows the BERTopic philosophy, but has its own implementation.<br>
+ For more, please visit: <a href='https://sites.google.com/view/dr-gabriel-lopez/home'>My Page</a><br>
+ Info about the BERTopic model can be <a href="https://maartengr.github.io/BERTopic/index.html">found here</a><br>
+ </center>"""
+
+
+ # load data
+ def load_data(path):
+     """Load a CSV dataset with a 'text' column"""
+     # gr.File passes a tempfile wrapper; read from its .name path when present
+     filepath = path.name if hasattr(path, "name") else path
+     data = pd.read_csv(filepath, on_bad_lines="skip")
+     assert "text" in data.columns, "The data must have a column named 'text'"
+     return data
+
+
+
+ def run_nlp_processing(data):
+     """Reference implementation of standard NLP preprocessing (not wired into the app)"""
+     # lemmatize, lowercase, and drop punctuation/stopwords
+     docs = []
+     nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner"])
+     for doc in nlp.pipe(data["text"].values, n_process=os.cpu_count() - 1):
+         lemmas = []
+         for token in doc:
+             if token.is_punct or token.is_stop:
+                 continue
+             lemmas.append(token.lemma_.lower())
+         docs.append(" ".join(lemmas))
+     # replace the text column with the processed text
+     data = data.assign(text=docs)
+     return data
+
+
+
+ def run_bert_tokenization(data):
+     """Show the action of the WordPiece algorithm"""
+     # load the BERT tokenizer (the full model is not needed just for tokenization)
+     checkpoint = "bert-base-uncased"
+     tokenizer = BertTokenizer.from_pretrained(checkpoint)
+     # run BERT tokenizing + encoding
+     descr_processed_tokenized = tokenizer(
+         list(data["text"]),
+         return_tensors="tf",
+         truncation=True,
+         padding=True,
+         max_length=128,
+     )
+     # store the WordPiece tokens of each row, so the subword split is visible in the dataframe
+     tokens = [
+         " ".join(tokenizer.convert_ids_to_tokens(ids))
+         for ids in descr_processed_tokenized["input_ids"].numpy()
+     ]
+     data = data.assign(text_tokenized=tokens)
+     return data
+
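+ # For instance (an illustrative check, not part of the original commit):
+ #   BertTokenizer.from_pretrained("bert-base-uncased").tokenize("embeddings")
+ #   returns ['em', '##bed', '##ding', '##s'] - WordPiece splits rare words into subword pieces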
+
+
+ def run_bertopic(data):
+     """End-to-end BERTopic-style pipeline: BERT embeddings -> UMAP -> OPTICS clustering"""
+     # load BERT model (for embeddings)
+     checkpoint = "bert-base-uncased"
+     tokenizer = BertTokenizer.from_pretrained(checkpoint)
+     model = TFBertModel.from_pretrained(checkpoint)
+     # run BERT tokenizing + encoding
+     descr_processed_tokenized = tokenizer(
+         list(data["text"]),
+         return_tensors="tf",
+         truncation=True,
+         padding=True,
+         max_length=128,
+     )
+     output_bert = model(descr_processed_tokenized)
+     # get sentence embeddings by mean-pooling BERT's word embeddings
+     # (note: this also averages over [PAD] tokens; masking with the attention mask would be more precise)
+     mean_vect = []
+     for vect in output_bert.last_hidden_state:
+         mean_vect.append(np.mean(vect, axis=0))
+     data = data.assign(descr_vect=mean_vect)
+     # use UMAP to project the embeddings down to 3D (np.stack turns the array of arrays into a 2D array)
+     descr_vect_3d = umap.UMAP(n_components=3).fit_transform(
+         np.stack(data["descr_vect"].values)
+     )
+     data["descr_vect_3d"] = list(descr_vect_3d)
+     # cluster the projected embeddings with OPTICS
+     clustering = OPTICS(min_samples=50).fit(np.stack(data["descr_vect_3d"].values))
+     data["cluster_label"] = clustering.labels_
+     # plot the 3D embedding, colored by cluster
+     fig_bertopic = plot_bertopic(descr_vect_3d, data)
+     # TODO: extract topic wordclouds (the "Topics" box in the UI is not yet populated)
+     return fig_bertopic
+
+
+
+ def plot_bertopic(descr_vect_3d, data):
+     """Show the topic clusters in the 3D embedding space"""
+     fig = px.scatter_3d(
+         x=descr_vect_3d[:, 0],
+         y=descr_vect_3d[:, 1],
+         z=descr_vect_3d[:, 2],
+         color=data["cluster_label"],
+     )
+     return fig
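+
+
+ # A minimal sketch of the missing topic-keyword step; not part of the original commit and not
+ # yet wired to the "Topics" box (run_bertopic would also need to return the labeled dataframe).
+ # It ranks words per cluster by simple frequency; BERTopic proper uses c-TF-IDF here.
+ def extract_topics(data, top_n=5):
+     """Return the top_n most frequent words of each cluster as a printable string"""
+     from sklearn.feature_extraction.text import CountVectorizer
+
+     lines = []
+     for label, group in data.groupby("cluster_label"):
+         if label == -1:  # OPTICS labels noise points as -1
+             continue
+         vectorizer = CountVectorizer(stop_words="english")
+         counts = vectorizer.fit_transform(group["text"])
+         totals = np.asarray(counts.sum(axis=0)).ravel()
+         top_ids = totals.argsort()[::-1][:top_n]
+         words = vectorizer.get_feature_names_out()[top_ids]
+         lines.append(f"Topic {label}: {', '.join(words)}")
+     return "\n".join(lines)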
+
+
+ # gradio interface
+ blocks = gr.Blocks()
+ with blocks:
+     # physical elements
+     session_state = gr.State([])
+     gr.Markdown(TITLE)
+     gr.Markdown(DESCRIPTION)
+     with gr.Row():
+         with gr.Column():
+             gr.Markdown(
+                 "## Load the data (must be a csv file with a column named 'text')"
+             )
+             in_file = gr.File()
+             gr.Markdown("## Inspect the data")
+             in_data = gr.Dataframe()
+             submit_button = gr.Button("Run BERTopic!")
+             gr.Examples(inputs=in_file, examples=EXAMPLES)
+         with gr.Column():
+             gr.Markdown("## BERTopic Flow")
+             gr.Markdown(
+                 "Text -> Word-Piece Tokenization -> BERT-embedding -> UMAP -> OPTICS -> Topic"
+             )
+             gr.Markdown("## Processed Text")
+             out_dataset = gr.Dataframe()
+             gr.Markdown("## Embedding + Projection + Clustering")
+             embedding_plot = gr.Plot(label="BERTopic projections")
+             gr.Markdown("## Extracted Topics")
+             topics_text = gr.Textbox(label="Topics", lines=50)
+     gr.Markdown(ARTICLE)
+     # event listeners (do not rebind in_file to the event return value)
+     in_file.change(inputs=in_file, outputs=in_data, fn=load_data)
+     submit_button.click(inputs=in_data, outputs=out_dataset, fn=run_bert_tokenization)
+     out_dataset.change(inputs=out_dataset, outputs=embedding_plot, fn=run_bertopic)
+
+ blocks.launch()
data/ecomm500.csv ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ gradio==3.23.0
+ numpy==1.23.5
+ pandas==1.5.3
+ plotly==5.13.1
+ scikit_learn==1.2.2
+ spacy==3.3.1
+ transformers==4.27.3
+ # umap==0.1.1 dropped: it is an unrelated stale package; umap_learn provides the umap module
+ umap_learn==0.5.3