Spaces:
Sleeping
Sleeping
File size: 5,447 Bytes
a9f525a 0bfcadb a9f525a bd9233d 66b46d9 0bfcadb a9f525a bd9233d a9f525a e9f4096 a9f525a 2b7c89f a9f525a bd9233d 0bfcadb 66b46d9 a9f525a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 |
import pandas as pd
import numpy as np
import spacy
import os
import gradio as gr
import umap
from sklearn.cluster import OPTICS
from transformers import BertTokenizer, TFBertModel
import plotly.io as pio
# configuration params
# Use Plotly's dark theme for every figure produced in this app (matches the Space UI).
pio.templates.default = "plotly_dark"
# setting up the text in the page
# Static HTML/markdown snippets rendered by the Gradio interface below.
TITLE = "<center><h1>BERTopic - For topics detection on text</h1></center>"
# Fixed typo ("end" -> "and") and closed the dangling <center> tag.
DESCRIPTION = r"""<center>Apply BERTopic to a given dataset and extract the most relevant topics.<br>
</center>"""
# Example file offered in the Gradio file-upload widget.
EXAMPLES = [
    ["data/ecomm500.csv"],
]
ARTICLE = r"""<center>
Done by dr. Gabriel Lopez<br>
This program follows the BERTopic philosophy, but actually has its own implementation.<br>
For more please visit: <a href='https://sites.google.com/view/dr-gabriel-lopez/home'>My Page</a><br>
For info about the BERTopic model can be <a href="https://maartengr.github.io/BERTopic/index.html">found here</a><br>
</center>"""
def load_data(fileobj):
    """Load a CSV dataset from an uploaded file object (keep only 500 rows for efficiency).

    Args:
        fileobj: Uploaded-file handle exposing a ``.name`` path attribute
            (the object Gradio's ``gr.File`` passes to callbacks).

    Returns:
        DataFrame with the single required ``text`` column.

    Raises:
        ValueError: If the CSV has no ``text`` column.
    """
    data = pd.read_csv(fileobj.name, on_bad_lines="skip", nrows=500)
    # Explicit raise instead of `assert`: asserts are stripped under `python -O`,
    # which would silently let malformed uploads through.
    if "text" not in data.columns:
        raise ValueError("The data must have a column named 'text'")
    return data[["text"]]
def run_nlp_processing(data):
    """As reference for standard NLP processing.

    Lemmatize + lowercase every document in ``data['text']``, dropping
    punctuation and stop words, and return a copy of the frame with the
    processed text in place of the original.

    Args:
        data: DataFrame with a ``text`` column of raw strings.

    Returns:
        New DataFrame whose ``text`` column holds the cleaned documents.
    """
    nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner"])
    # `os.cpu_count() - 1` is 0 on a single-core box (invalid for spacy's
    # n_process) and TypeErrors when cpu_count() returns None — clamp to >= 1.
    n_proc = max(1, (os.cpu_count() or 1) - 1)
    docs = []
    for doc in nlp.pipe(data["text"].values, n_process=n_proc):
        lemmas = [
            token.lemma_.lower()
            for token in doc
            if not (token.is_punct or token.is_stop)
        ]
        docs.append(" ".join(lemmas))
    # Make new column (assign returns a copy; the caller's frame is untouched)
    return data.assign(text=docs)
def run_bert_tokenization(data):
    """Show the action of the WordPiece algorithm.

    Tokenize every document in ``data['text']`` with BERT's WordPiece
    tokenizer and attach the resulting token-id sequences as a new column.

    Args:
        data: DataFrame with a ``text`` column of raw strings.

    Returns:
        Copy of ``data`` with a ``text_tokenized`` column holding, per row,
        the list of WordPiece token ids (padded/truncated to 128).
    """
    # load BERT tokenizer. NOTE: the original also loaded TFBertModel here,
    # but the model was never used in this function — dropped (it is a very
    # expensive dead load).
    checkpoint = "bert-base-uncased"
    tokenizer = BertTokenizer.from_pretrained(checkpoint)
    # Run BERT tokenizing + encoding
    descr_processed_tokenized = tokenizer(
        list(data["text"]),
        return_tensors="tf",
        truncation=True,
        padding=True,
        max_length=128,
    )
    # Assigning the whole BatchEncoding (a dict-like) as a column misaligns
    # against the row index; store one list of input ids per row instead.
    token_ids = descr_processed_tokenized["input_ids"].numpy().tolist()
    data = data.assign(text_tokenized=token_ids)
    return data
def run_bertopic(data):
    """End-to-end BERTopic-style pipeline.

    Embeds documents with BERT, pools to sentence vectors, projects to 3-D
    with UMAP, clusters with OPTICS, and returns a 3-D scatter figure of the
    clustered embedding.

    Args:
        data: DataFrame with a ``text`` column of raw strings.

    Returns:
        Plotly figure of the 3-D embedding colored by cluster label.
    """
    # load BERT model (for embeddings)
    checkpoint = "bert-base-uncased"
    tokenizer = BertTokenizer.from_pretrained(checkpoint)
    model = TFBertModel.from_pretrained(checkpoint)
    # Run BERT tokenizing + encoding
    descr_processed_tokenized = tokenizer(
        list(data["text"]),
        return_tensors="tf",
        truncation=True,
        padding=True,
        max_length=128,
    )
    output_bert = model(descr_processed_tokenized)
    # Get sentence embeddings by mean-pooling BERT's word embeddings over tokens
    mean_vect = [np.mean(vect, axis=0) for vect in output_bert.last_hidden_state]
    data = data.assign(descr_vect=mean_vect)
    # Use UMAP to lower the dimensionality of the embedding to 3D
    # [np.stack turns array-of-arrays into a proper 2-D array]
    descr_vect_3d = umap.UMAP(n_components=3).fit_transform(
        np.stack(data["descr_vect"].values)
    )
    # (renamed from the misleading 'descr_vect_2d' — these are 3-D vectors;
    # the column is internal to this function's copy of the frame)
    data["descr_vect_3d"] = list(descr_vect_3d)
    # Use BERT's + UMAP vector embeddings for clustering using OPTICS.
    # Clamp min_samples: OPTICS raises when min_samples > number of samples
    # (fewer than 50 rows can reach this point).
    min_samples = min(50, len(data))
    clustering = OPTICS(min_samples=min_samples).fit(
        np.stack(data["descr_vect_3d"].values)
    )
    data["cluster_label"] = clustering.labels_
    # Plot the 3D embedding
    fig_bertopic = plot_bertopic(descr_vect_3d, data)
    # Extract topic wordclouds
    return fig_bertopic
def plot_bertopic(descr_vect_3d, data):
    """Show the topic clusters over a 3-D embedding space.

    Args:
        descr_vect_3d: (n_docs, 3) array of UMAP-projected embeddings.
        data: DataFrame carrying a ``cluster_label`` column for coloring.

    Returns:
        Plotly 3-D scatter figure.
    """
    import plotly.express as px

    xs, ys, zs = (descr_vect_3d[:, axis] for axis in range(3))
    return px.scatter_3d(x=xs, y=ys, z=zs, color=data["cluster_label"])
# gradio interface
# Two-column layout: left = upload/inspect/run controls, right = pipeline
# outputs (processed text, embedding plot, topics).
blocks = gr.Blocks()
with blocks:
    # physical elements
    # Per-session state holder (currently unused by the wired callbacks).
    session_state = gr.State([])
    gr.Markdown(TITLE)
    gr.Markdown(DESCRIPTION)
    with gr.Row():
        with gr.Column():
            gr.Markdown(
                "## Load the data (must be a csv file with a column named 'text')"
            )
            in_file = gr.File()
            gr.Markdown("## Inspect the data")
            in_data = gr.Dataframe(row_count=5)
            submit_button = gr.Button("Run BERTopic!")
            gr.Examples(inputs=in_file, examples=EXAMPLES)
        with gr.Column():
            gr.Markdown("## BERTopic Flow")
            gr.Markdown(
                "Text -> Word-Piece Tokenization -> BERT-embedding -> UMAP -> HDBSCAN -> Topic"
            )
            gr.Markdown("## Processed Text")
            out_dataset = gr.Dataframe(row_count=5)
            gr.Markdown("## Embedding + Projection + Clustering")
            embedding_plot = gr.Plot(label="BERTopic projections")
            gr.Markdown("## Extracted Topics")
            # NOTE(review): no callback currently writes to this textbox.
            topics_text = gr.Textbox(label="Topics", lines=50)
    gr.Markdown(ARTICLE)
    # event listeners
    # Upload -> parse CSV into the inspection table.
    in_file = in_file.upload(inputs=in_file, outputs=in_data, fn=load_data)
    # Button -> tokenize the inspected data into the processed-text table.
    submit_button.click(inputs=in_data, outputs=out_dataset, fn=run_bert_tokenization)
    # Full BERTopic stage is wired off, presumably for resource reasons on the
    # hosting Space — TODO confirm before re-enabling.
    # out_dataset.change(inputs=out_dataset, outputs=embedding_plot, fn=run_bertopic)
blocks.launch()
|