DrGabrielLopez committed on
Commit
a9f525a
1 Parent(s): e772cb4
Files changed (3)
  1. app.py +154 -0
  2. data/ecomm500.csv +0 -0
  3. requirements.txt +9 -0
app.py ADDED
@@ -0,0 +1,154 @@
+ import os
+
+ import gradio as gr
+ import numpy as np
+ import pandas as pd
+ import plotly.express as px
+ import plotly.io as pio
+ import spacy
+ import umap
+ from sklearn.cluster import OPTICS
+ from transformers import BertTokenizer, TFBertModel
+
+ # configuration params
+ pio.templates.default = "plotly_dark"
+
+ # set up the text on the page
+ TITLE = "<center><h1>BERTopic - Topic detection on text</h1></center>"
+ DESCRIPTION = r"""<center>Apply BERTopic to a given dataset and extract the most relevant topics.<br>
+ </center>"""
+ EXAMPLES = [
+     ["data/ecomm500.csv"],
+ ]
+ ARTICLE = r"""<center>
+ Done by Dr. Gabriel Lopez<br>
+ This program follows the BERTopic philosophy, but has its own implementation.<br>
+ For more, please visit: <a href='https://sites.google.com/view/dr-gabriel-lopez/home'>My Page</a><br>
+ Info about the BERTopic model can be <a href="https://maartengr.github.io/BERTopic/index.html">found here</a><br>
+ </center>"""
+
+
+ # load data
+ def load_data(path):
+     """Load a CSV dataset with a 'text' column"""
+     # gr.File passes a tempfile wrapper; read from its .name path when present
+     filepath = path.name if hasattr(path, "name") else path
+     data = pd.read_csv(filepath, on_bad_lines="skip")
+     assert "text" in data.columns, "The data must have a column named 'text'"
+     return data
+
+
+
+ def run_nlp_processing(data):
+     """Reference implementation of standard NLP preprocessing (not wired into the app)"""
+     # lemmatize, lowercase, and drop punctuation/stopwords
+     docs = []
+     nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner"])
+     for doc in nlp.pipe(data["text"].values, n_process=os.cpu_count() - 1):
+         lemmas = []
+         for token in doc:
+             if token.is_punct or token.is_stop:
+                 continue
+             lemmas.append(token.lemma_.lower())
+         docs.append(" ".join(lemmas))
+     # replace the text column with the processed text
+     data = data.assign(text=docs)
+     return data
+
+
+
+ def run_bert_tokenization(data):
+     """Show the action of the WordPiece algorithm"""
+     # load the BERT tokenizer (the full model is not needed just for tokenization)
+     checkpoint = "bert-base-uncased"
+     tokenizer = BertTokenizer.from_pretrained(checkpoint)
+     # run BERT tokenizing + encoding
+     descr_processed_tokenized = tokenizer(
+         list(data["text"]),
+         return_tensors="tf",
+         truncation=True,
+         padding=True,
+         max_length=128,
+     )
+     # store the WordPiece tokens of each row, so the subword split is visible in the dataframe
+     tokens = [
+         " ".join(tokenizer.convert_ids_to_tokens(ids))
+         for ids in descr_processed_tokenized["input_ids"].numpy()
+     ]
+     data = data.assign(text_tokenized=tokens)
+     return data
+
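+ # For instance (an illustrative check, not part of the original commit):
+ #   BertTokenizer.from_pretrained("bert-base-uncased").tokenize("embeddings")
+ #   returns ['em', '##bed', '##ding', '##s'] - WordPiece splits rare words into subword pieces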
+
+
+ def run_bertopic(data):
+     """End-to-end BERTopic-style pipeline: BERT embeddings -> UMAP -> OPTICS clustering"""
+     # load BERT model (for embeddings)
+     checkpoint = "bert-base-uncased"
+     tokenizer = BertTokenizer.from_pretrained(checkpoint)
+     model = TFBertModel.from_pretrained(checkpoint)
+     # run BERT tokenizing + encoding
+     descr_processed_tokenized = tokenizer(
+         list(data["text"]),
+         return_tensors="tf",
+         truncation=True,
+         padding=True,
+         max_length=128,
+     )
+     output_bert = model(descr_processed_tokenized)
+     # get sentence embeddings by mean-pooling BERT's word embeddings
+     # (note: this also averages over [PAD] tokens; masking with the attention mask would be more precise)
+     mean_vect = []
+     for vect in output_bert.last_hidden_state:
+         mean_vect.append(np.mean(vect, axis=0))
+     data = data.assign(descr_vect=mean_vect)
+     # use UMAP to project the embeddings down to 3D (np.stack turns the array of arrays into a 2D array)
+     descr_vect_3d = umap.UMAP(n_components=3).fit_transform(
+         np.stack(data["descr_vect"].values)
+     )
+     data["descr_vect_3d"] = list(descr_vect_3d)
+     # cluster the projected embeddings with OPTICS
+     clustering = OPTICS(min_samples=50).fit(np.stack(data["descr_vect_3d"].values))
+     data["cluster_label"] = clustering.labels_
+     # plot the 3D embedding, colored by cluster
+     fig_bertopic = plot_bertopic(descr_vect_3d, data)
+     # TODO: extract topic wordclouds (the "Topics" box in the UI is not yet populated)
+     return fig_bertopic
+
+
+
+ def plot_bertopic(descr_vect_3d, data):
+     """Show the topic clusters in the 3D embedding space"""
+     fig = px.scatter_3d(
+         x=descr_vect_3d[:, 0],
+         y=descr_vect_3d[:, 1],
+         z=descr_vect_3d[:, 2],
+         color=data["cluster_label"],
+     )
+     return fig
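+
+
+ # A minimal sketch of the missing topic-keyword step; not part of the original commit and not
+ # yet wired to the "Topics" box (run_bertopic would also need to return the labeled dataframe).
+ # It ranks words per cluster by simple frequency; BERTopic proper uses c-TF-IDF here.
+ def extract_topics(data, top_n=5):
+     """Return the top_n most frequent words of each cluster as a printable string"""
+     from sklearn.feature_extraction.text import CountVectorizer
+
+     lines = []
+     for label, group in data.groupby("cluster_label"):
+         if label == -1:  # OPTICS labels noise points as -1
+             continue
+         vectorizer = CountVectorizer(stop_words="english")
+         counts = vectorizer.fit_transform(group["text"])
+         totals = np.asarray(counts.sum(axis=0)).ravel()
+         top_ids = totals.argsort()[::-1][:top_n]
+         words = vectorizer.get_feature_names_out()[top_ids]
+         lines.append(f"Topic {label}: {', '.join(words)}")
+     return "\n".join(lines)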
+
+
+ # gradio interface
+ blocks = gr.Blocks()
+ with blocks:
+     # physical elements
+     session_state = gr.State([])
+     gr.Markdown(TITLE)
+     gr.Markdown(DESCRIPTION)
+     with gr.Row():
+         with gr.Column():
+             gr.Markdown(
+                 "## Load the data (must be a csv file with a column named 'text')"
+             )
+             in_file = gr.File()
+             gr.Markdown("## Inspect the data")
+             in_data = gr.Dataframe()
+             submit_button = gr.Button("Run BERTopic!")
+             gr.Examples(inputs=in_file, examples=EXAMPLES)
+         with gr.Column():
+             gr.Markdown("## BERTopic Flow")
+             gr.Markdown(
+                 "Text -> Word-Piece Tokenization -> BERT-embedding -> UMAP -> OPTICS -> Topic"
+             )
+             gr.Markdown("## Processed Text")
+             out_dataset = gr.Dataframe()
+             gr.Markdown("## Embedding + Projection + Clustering")
+             embedding_plot = gr.Plot(label="BERTopic projections")
+             gr.Markdown("## Extracted Topics")
+             topics_text = gr.Textbox(label="Topics", lines=50)
+     gr.Markdown(ARTICLE)
+     # event listeners (do not rebind in_file to the event return value)
+     in_file.change(inputs=in_file, outputs=in_data, fn=load_data)
+     submit_button.click(inputs=in_data, outputs=out_dataset, fn=run_bert_tokenization)
+     out_dataset.change(inputs=out_dataset, outputs=embedding_plot, fn=run_bertopic)
+
+ blocks.launch()
data/ecomm500.csv ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ gradio==3.23.0
+ numpy==1.23.5
+ pandas==1.5.3
+ plotly==5.13.1
+ scikit_learn==1.2.2
+ spacy==3.3.1
+ transformers==4.27.3
+ # umap==0.1.1 dropped: it is an unrelated stale package; umap_learn provides the umap module
+ umap_learn==0.5.3