DrGabrielLopez committed on
Commit
0bfcadb
1 Parent(s): bd9233d
Files changed (2) hide show
  1. .DS_Store +0 -0
  2. app.py +5 -6
.DS_Store CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
 
app.py CHANGED
@@ -1,6 +1,7 @@
1
  import pandas as pd
2
  import numpy as np
3
  import spacy
 
4
  import gradio as gr
5
  import umap
6
  from sklearn.cluster import OPTICS
@@ -27,15 +28,13 @@ ARTICLE = r"""<center>
27
 
28
  def load_data(fileobj):
29
  """Load dataset (keep only 500 rows for efficiency)"""
30
- data = pd.read_csv(fileobj, on_bad_lines='skip', nrows=500)
31
  assert "text" in data.columns, "The data must have a column named 'text'"
32
  return data[['text']]
33
 
34
 
35
  def run_nlp_processing(data):
36
  """As reference for standard NLP processing"""
37
- import os
38
-
39
  # NLP processing
40
  docs = []
41
  nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner"])
@@ -130,7 +129,7 @@ with blocks:
130
  )
131
  in_file = gr.File()
132
  gr.Markdown("## Inspect the data")
133
- in_data = gr.Dataframe()
134
  submit_button = gr.Button("Run BERTopic!")
135
  gr.Examples(inputs=in_file, examples=EXAMPLES)
136
  with gr.Column():
@@ -139,7 +138,7 @@ with blocks:
139
  "Text -> Word-Piece Tokenization -> BERT-embedding -> UMAP -> HDBSCAN -> Topic"
140
  )
141
  gr.Markdown("## Processed Text")
142
- out_dataset = gr.Dataframe()
143
  gr.Markdown("## Embedding + Projection + Clustering")
144
  embedding_plot = gr.Plot(label="BERTopic projections")
145
  gr.Markdown("## Extracted Topics")
@@ -147,7 +146,7 @@ with blocks:
147
  gr.Markdown(ARTICLE)
148
  # event listeners
149
  in_file = in_file.upload(inputs=in_file, outputs=in_data, fn=load_data)
150
- # submit_button.click(inputs=in_data, outputs=out_dataset, fn=run_bert_tokenization)
151
  # out_dataset.change(inputs=out_dataset, outputs=embedding_plot, fn=run_bertopic)
152
 
153
  blocks.launch()
 
1
  import pandas as pd
2
  import numpy as np
3
  import spacy
4
+ import os
5
  import gradio as gr
6
  import umap
7
  from sklearn.cluster import OPTICS
 
28
 
29
  def load_data(fileobj):
30
  """Load dataset (keep only 500 rows for efficiency)"""
31
+ data = pd.read_csv(fileobj.name, on_bad_lines='skip', nrows=500)
32
  assert "text" in data.columns, "The data must have a column named 'text'"
33
  return data[['text']]
34
 
35
 
36
  def run_nlp_processing(data):
37
  """As reference for standard NLP processing"""
 
 
38
  # NLP processing
39
  docs = []
40
  nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner"])
 
129
  )
130
  in_file = gr.File()
131
  gr.Markdown("## Inspect the data")
132
+ in_data = gr.Dataframe(max_rows=5)
133
  submit_button = gr.Button("Run BERTopic!")
134
  gr.Examples(inputs=in_file, examples=EXAMPLES)
135
  with gr.Column():
 
138
  "Text -> Word-Piece Tokenization -> BERT-embedding -> UMAP -> HDBSCAN -> Topic"
139
  )
140
  gr.Markdown("## Processed Text")
141
+ out_dataset = gr.Dataframe(max_rows=5)
142
  gr.Markdown("## Embedding + Projection + Clustering")
143
  embedding_plot = gr.Plot(label="BERTopic projections")
144
  gr.Markdown("## Extracted Topics")
 
146
  gr.Markdown(ARTICLE)
147
  # event listeners
148
  in_file = in_file.upload(inputs=in_file, outputs=in_data, fn=load_data)
149
+ submit_button.click(inputs=in_data, outputs=out_dataset, fn=run_bert_tokenization)
150
  # out_dataset.change(inputs=out_dataset, outputs=embedding_plot, fn=run_bertopic)
151
 
152
  blocks.launch()