seanpedrickcase committed
Commit 90553eb • 1 Parent(s): 5888649
App now retains the original index following cleaning, to allow referring back to the original data.
Files changed:
- Dockerfile +2 -1
- README.md +1 -1
- app.py +1 -0
- funcs/anonymiser.py +6 -45
- funcs/embeddings.py +2 -1
- funcs/helper_functions.py +2 -1
- funcs/topic_core_funcs.py +5 -0
- requirements.txt +1 -1
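The heart of the commit is an index-retention pattern: before any cleaning step that drops or reorders rows, the DataFrame's index is copied into an ordinary "original_index" column, which then survives every later transformation. A minimal standalone sketch of the idea (toy data, not the app's actual pipeline; requires pandas >= 1.5 for reset_index(names=...)):

    import pandas as pd

    data = pd.DataFrame({"text": ["keep me", "", "keep me too"]})

    # Snapshot the index as a regular column before cleaning
    if "original_index" not in data.columns:
        data = data.reset_index(names="original_index")

    # A cleaning step that drops rows no longer destroys the linkage
    cleaned = data[data["text"].str.len() > 0]

    # Results computed on the cleaned frame can be joined back to the source
    print(cleaned[["original_index", "text"]])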
Dockerfile CHANGED
@@ -18,7 +18,8 @@ COPY requirements_aws.txt .
 RUN pip install torch==2.4.0+cpu --target=/install --index-url https://download.pytorch.org/whl/cpu \
     && pip install --no-cache-dir --target=/install sentence-transformers==3.0.1 --no-deps \
     && pip install --no-cache-dir --target=/install bertopic==0.16.2 --no-deps \
-    && pip install --no-cache-dir --target=/install -r requirements_aws.txt
+    && pip install --no-cache-dir --target=/install -r requirements_aws.txt \
+    && pip install --no-cache-dir --target=/install gradio==4.44.0
 
 # Add /install to the PYTHONPATH
 ENV PYTHONPATH="/install:${PYTHONPATH}"
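Because sentence-transformers and bertopic are installed with --no-deps, their dependencies must all be satisfied by requirements_aws.txt, and the gradio pin has to stay in step with the README's sdk_version. A hypothetical post-build sanity check (not part of the commit) that the pins actually resolved inside the image:

    import importlib.metadata as md

    # Pins taken from the Dockerfile above; adjust if the Dockerfile changes
    pins = {"gradio": "4.44.0", "bertopic": "0.16.2", "sentence-transformers": "3.0.1"}

    for package, expected in pins.items():
        installed = md.version(package)
        assert installed == expected, f"{package}: got {installed}, expected {expected}"
    print("All pinned versions resolved correctly")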
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: π
 colorFrom: red
 colorTo: yellow
 sdk: gradio
-sdk_version: 4.
+sdk_version: 4.44.0
 app_file: app.py
 pinned: true
 license: apache-2.0
app.py CHANGED
@@ -5,6 +5,7 @@ import numpy as np
 
 from funcs.topic_core_funcs import pre_clean, optimise_zero_shot, extract_topics, reduce_outliers, represent_topics, visualise_topics, save_as_pytorch_model, change_default_vis_col
 from funcs.helper_functions import initial_file_load, custom_regex_load, ensure_output_folder_exists, output_folder, get_connection_params, get_or_create_env_var
+from funcs.embeddings import make_or_load_embeddings
 from sklearn.feature_extraction.text import CountVectorizer
 from funcs.auth import authenticate_user, download_file_from_s3
 
funcs/anonymiser.py CHANGED
@@ -42,61 +42,22 @@ from presidio_anonymizer.entities import OperatorConfig
 from typing import List
 
 # Function to Split Text and Create DataFrame using SpaCy
-def expand_sentences_spacy(df, colname, custom_delimiters:List[str]=[], nlp=nlp):
+def expand_sentences_spacy(df:pd.DataFrame, colname:str, custom_delimiters:List[str]=[], nlp=nlp):
+    '''
+    Expand passages into sentences using Spacy's built in NLP capabilities
+    '''
     expanded_data = []
 
-    # if not custom_delimiters:
-    #     custom_delimiters = []
-
     df = df.drop('index', axis = 1, errors="ignore").reset_index(names='index')
 
-    # sentencizer = Sentencizer()
-
-    # new_punct_chars = sentencizer.default_punct_chars
-    # new_punct_chars.extend(custom_delimiters)
-
-    # config = {"punct_chars": new_punct_chars}
-    # nlp.add_pipe("sentencizer", config=config)
-
     for index, row in df.iterrows():
         doc = nlp(row[colname])
         for sent in doc.sents:
-            expanded_data.append({'document_index': row['index'], colname: sent.text})
+            expanded_data.append({'original_index':row['original_index'],'document_index': row['index'], colname: sent.text})
     return pd.DataFrame(expanded_data)
 
-# def expand_sentences_spacy(df, colname, custom_delimiters:List[str]=[], nlp=nlp):
-
-#     #print("Custom delimiters:", custom_delimiters)
-
-#     expanded_data = []
-#     df = df.drop('index', axis = 1, errors="ignore").reset_index(names='index')
-
-#     sentencizer = Sentencizer()
-
-#     new_punct_chars = sentencizer.default_punct_chars
-#     if custom_delimiters:
-#         new_punct_chars.extend(custom_delimiters)
-
-#     pattern = "(" + "|".join(re.escape(punct) for punct in new_punct_chars) + ")"
-#     #print("Patterns:", pattern)
-#     split_list = []
-
-#     for idx, string in enumerate(df[colname]):
-#         new_split = re.split(pattern, string)
-#         for n, sentence in enumerate(new_split):
-#             if sentence:
-#                 # If there is a split delimiter in the 'sentence' after, add it to the previous sentence as it will be removed at a later step
-#                 if n + 1 < len(new_split):
-#                     if new_split[n + 1]:
-#                         # If the next split is in the list of split characters, then add it to this current sentence
-#                         if new_split[n + 1] in new_punct_chars:
-#                             split_list.append({'document_index': idx, colname: sentence + new_split[n + 1]})
-#                 else:
-#                     split_list.append({'document_index': idx, colname: sentence})
-
-#     return pd.DataFrame(split_list)
 
-def anon_consistent_names(df):
+def anon_consistent_names(df:pd.DataFrame):
     # ## Pick out common names and replace them with the same person value
     df_dict = df.to_dict(orient="list")
 
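The practical effect of the new original_index field is that every sentence row can be traced back to an untouched input row. A sketch of how a caller might use this, assuming the app's dependencies are installed and expand_sentences_spacy is imported from funcs.anonymiser:

    import pandas as pd
    import spacy

    from funcs.anonymiser import expand_sentences_spacy

    nlp = spacy.load("en_core_web_sm")

    df = pd.DataFrame({"text": ["First sentence. Second sentence.", "One sentence."]})
    # The caller must add original_index first (pre_clean in topic_core_funcs does this)
    df = df.reset_index(names="original_index")

    sentences = expand_sentences_spacy(df, "text", nlp=nlp)

    # Each sentence carries original_index, so per-sentence results can be
    # merged back onto the raw input rows
    merged = sentences.merge(df, on="original_index", suffixes=("_sentence", "_source"))
    print(merged.head())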
funcs/embeddings.py CHANGED
@@ -1,5 +1,6 @@
 import time
 import numpy as np
+import os
 from torch import cuda, backends, version
 
 # Check for torch cuda
@@ -12,7 +13,7 @@ if cuda.is_available():
     torch_device = "gpu"
     print("Cuda version installed is: ", version.cuda)
     high_quality_mode = "Yes"
-
+    os.system("nvidia-smi")
 else:
     torch_device = "cpu"
     high_quality_mode = "No"
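os.system("nvidia-smi") prints the GPU details straight to stdout and silently returns a non-zero code if the binary is missing. An alternative sketch (not what the commit does) that captures the output and tolerates the tool's absence:

    import shutil
    import subprocess

    # Only attempt the call if nvidia-smi is actually on PATH
    if shutil.which("nvidia-smi"):
        result = subprocess.run(["nvidia-smi"], capture_output=True, text=True)
        print(result.stdout)
    else:
        print("nvidia-smi not found; continuing without GPU diagnostics")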
funcs/helper_functions.py CHANGED
@@ -5,6 +5,7 @@ import pandas as pd
 import gradio as gr
 import gzip
 import pickle
+import csv
 import numpy as np
 from bertopic import BERTopic
 from datetime import datetime
@@ -129,7 +130,7 @@ def read_file(filename):
     print("Loading in file")
 
     if file_type == 'csv':
-        file = pd.read_csv(filename)
+        file = pd.read_csv(filename)#.reset_index().drop(["index", "Unnamed: 0"], axis=1, errors="ignore")
     elif file_type == 'xlsx':
         file = pd.read_excel(filename)#.reset_index().drop(["index", "Unnamed: 0"], axis=1, errors="ignore")
     elif file_type == 'parquet':
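For context, read_file dispatches on the file extension. A simplified sketch of that pattern (load_table is an illustrative name, not the app's function; the real read_file also handles parquet and gzipped pickles, per the imports above):

    import pandas as pd

    def load_table(filename: str) -> pd.DataFrame:
        # Simplified dispatch on extension, mirroring the structure of read_file
        if filename.endswith(".csv"):
            return pd.read_csv(filename)
        if filename.endswith(".xlsx"):
            return pd.read_excel(filename)
        if filename.endswith(".parquet"):
            return pd.read_parquet(filename)
        raise ValueError(f"Unsupported file type: {filename}")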
funcs/topic_core_funcs.py CHANGED
@@ -93,6 +93,10 @@ def pre_clean(data: pd.DataFrame, in_colnames: list, data_file_name_no_ext: str,
 
     in_colnames_list_first = in_colnames[0]
 
+    # Reset original index to a new column so you can link it to data outputted from cleaning
+    if "original_index" not in data.columns:
+        data = data.reset_index(names="original_index")
+
     if clean_text == "Yes":
         clean_tic = time.perf_counter()
         print("Starting data clean.")
@@ -343,6 +347,7 @@ def extract_topics(
     if not candidate_topics:
 
         try:
+            # print("vectoriser_model:", vectoriser_model)
 
             topic_model = BERTopic( embedding_model=embedding_model,
                                     vectorizer_model=vectoriser_model,
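A toy demonstration of what the guard in pre_clean does, with assumed column names: the existing index values become an ordinary column, and the membership check makes the operation idempotent across repeated cleaning runs:

    import pandas as pd

    data = pd.DataFrame({"text": ["a", "b"]}, index=[5, 9])

    for _ in range(2):  # calling pre_clean twice must not add a second column
        if "original_index" not in data.columns:
            data = data.reset_index(names="original_index")

    print(list(data.columns))  # ['original_index', 'text'] -- guard keeps it idempotent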
requirements.txt CHANGED
@@ -4,7 +4,7 @@ transformers==4.41.2
 accelerate==0.26.1
 torch==2.4.0
 bertopic==0.16.2
-spacy==3.7.
+spacy==3.7.5
 en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
 pyarrow==14.0.2
 openpyxl==3.1.2