import pandas as pd
import numpy as np
import spacy
import os
import gradio as gr
import umap
from sklearn.cluster import OPTICS
from transformers import BertTokenizer, TFBertModel
import plotly.io as pio
import plotly.express as px

# configuration params
pio.templates.default = "plotly_dark"

# setting up the text in the page
TITLE = "<center><h1>BERTopic - For topic detection on text</h1></center>"
DESCRIPTION = r"""<center>Apply BERTopic to a given dataset and extract its most relevant topics.<br>
</center>"""
EXAMPLES = [
    ["data/ecomm500.csv"],
]
ARTICLE = r"""<center>
Done by Dr. Gabriel Lopez<br>
This program follows the BERTopic philosophy, but has its own implementation.<br>
For more, please visit: <a href='https://sites.google.com/view/dr-gabriel-lopez/home'>My Page</a><br>
More info about the BERTopic model can be <a href="https://maartengr.github.io/BERTopic/index.html">found here</a><br>
</center>"""


def load_data(fileobj):
    """Load the dataset (keep only the first 500 rows for efficiency)."""
    data = pd.read_csv(fileobj.name, on_bad_lines="skip", nrows=500)
    assert "text" in data.columns, "The data must have a column named 'text'"
    return data[["text"]]


def run_nlp_processing(data):
    """Reference implementation of standard NLP preprocessing:
    lemmatization, with punctuation and stopwords removed."""
    docs = []
    nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner"])
    # leave one core free, but guard against single-core machines
    for doc in nlp.pipe(data["text"].values, n_process=max(1, os.cpu_count() - 1)):
        lemmas = []
        for token in doc:
            if token.is_punct or token.is_stop:
                continue
            lemmas.append(token.lemma_.lower())
        docs.append(" ".join(lemmas))
    # replace the raw text with the processed text
    data = data.assign(text=docs)
    return data


def run_bert_tokenization(data):
    """Show the action of the WordPiece algorithm."""
    # load the BERT tokenizer (the model itself is not needed here)
    checkpoint = "bert-base-uncased"
    tokenizer = BertTokenizer.from_pretrained(checkpoint)
    # run BERT tokenizing + encoding
    descr_processed_tokenized = tokenizer(
        list(data["text"]),
        return_tensors="tf",
        truncation=True,
        padding=True,
        max_length=128,
    )
    # store one list of WordPiece tokens per row (a raw BatchEncoding
    # cannot be assigned directly as a DataFrame column)
    data = data.assign(
        text_tokenized=[
            tokenizer.convert_ids_to_tokens(ids)
            for ids in descr_processed_tokenized["input_ids"].numpy().tolist()
        ]
    )
    return data


def run_bertopic(data):
    """End-to-end BERTopic-style pipeline: BERT embeddings -> UMAP -> OPTICS."""
    # load BERT model (for embeddings)
    checkpoint = "bert-base-uncased"
    tokenizer = BertTokenizer.from_pretrained(checkpoint)
    model = TFBertModel.from_pretrained(checkpoint)
    # run BERT tokenizing + encoding
    descr_processed_tokenized = tokenizer(
        list(data["text"]),
        return_tensors="tf",
        truncation=True,
        padding=True,
        max_length=128,
    )
    output_bert = model(descr_processed_tokenized)
    # get sentence embeddings by mean-pooling BERT's word embeddings
    mean_vect = []
    for vect in output_bert.last_hidden_state:
        mean_vect.append(np.mean(vect, axis=0))
    data = data.assign(descr_vect=mean_vect)
    # use UMAP to reduce the embedding to 3D - [stack turns array(array()) into a 2D array]
    descr_vect_3d = umap.UMAP(n_components=3).fit_transform(
        np.stack(data["descr_vect"].values)
    )
    data["descr_vect_3d"] = list(descr_vect_3d)
    # cluster the BERT + UMAP embeddings with OPTICS
    clustering = OPTICS(min_samples=50).fit(np.stack(data["descr_vect_3d"].values))
    data["cluster_label"] = clustering.labels_
    # plot the 3D embedding
    fig_bertopic = plot_bertopic(descr_vect_3d, data)
    # topic keyword extraction is left as a placeholder here; see the
    # hedged extract_topics sketch below
    return fig_bertopic
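

# --- Hedged sketch: topic keyword extraction ---
# The interface below exposes a "Topics" textbox that is never filled, and
# run_bertopic leaves topic extraction as a placeholder. The function below is
# a minimal, BERTopic-flavoured sketch of that step, using plain TF-IDF over
# per-cluster "class documents" as a stand-in for BERTopic's c-TF-IDF. The
# name extract_topics and the parameter top_n are assumptions, not part of
# the original app.
def extract_topics(data, top_n=10):
    """Sketch: return the top-n TF-IDF keywords for each OPTICS cluster."""
    from sklearn.feature_extraction.text import TfidfVectorizer

    # merge all documents of a cluster into one class document (-1 = noise)
    grouped = (
        data[data["cluster_label"] != -1]
        .groupby("cluster_label")["text"]
        .apply(" ".join)
    )
    vectorizer = TfidfVectorizer(stop_words="english")
    tfidf = vectorizer.fit_transform(grouped)
    terms = np.array(vectorizer.get_feature_names_out())
    lines = []
    for label, row in zip(grouped.index, tfidf.toarray()):
        top_terms = terms[row.argsort()[::-1][:top_n]]
        lines.append(f"Topic {label}: {', '.join(top_terms)}")
    return "\n".join(lines)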


def plot_bertopic(descr_vect_3d, data):
    """Show the topic clusters in a 3D embedding space."""
    fig = px.scatter_3d(
        x=descr_vect_3d[:, 0],
        y=descr_vect_3d[:, 1],
        z=descr_vect_3d[:, 2],
        color=data["cluster_label"],
    )
    return fig


# gradio interface
blocks = gr.Blocks()
with blocks:
    # physical elements
    session_state = gr.State([])
    gr.Markdown(TITLE)
    gr.Markdown(DESCRIPTION)
    with gr.Row():
        with gr.Column():
            gr.Markdown(
                "## Load the data (must be a csv file with a column named 'text')"
            )
            in_file = gr.File()
            gr.Markdown("## Inspect the data")
            in_data = gr.Dataframe(row_count=5)
            submit_button = gr.Button("Run BERTopic!")
            gr.Examples(inputs=in_file, examples=EXAMPLES)
        with gr.Column():
            gr.Markdown("## BERTopic Flow")
            gr.Markdown(
                "Text -> Word-Piece Tokenization -> BERT-embedding -> UMAP -> OPTICS -> Topic"
            )
            gr.Markdown("## Processed Text")
            out_dataset = gr.Dataframe(row_count=5)
            gr.Markdown("## Embedding + Projection + Clustering")
            embedding_plot = gr.Plot(label="BERTopic projections")
            gr.Markdown("## Extracted Topics")
            topics_text = gr.Textbox(label="Topics", lines=50)
    gr.Markdown(ARTICLE)
    # event listeners
    in_file.upload(inputs=in_file, outputs=in_data, fn=load_data)
    submit_button.click(inputs=in_data, outputs=out_dataset, fn=run_bert_tokenization)
    # out_dataset.change(inputs=out_dataset, outputs=embedding_plot, fn=run_bertopic)

blocks.launch()