# Copyright 2022 Christopher K. Schmitt
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from sentence_transformers import SentenceTransformer
from sklearn.manifold import TSNE
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score, calinski_harabasz_score
from pathlib import Path
from bs4 import BeautifulSoup
from argparse import ArgumentParser
import matplotlib.pyplot as plt
import numpy as np
import nltk

# The list of Hugging Face transformer checkpoints with TensorFlow
# support and compatible tokenizers.
available_models = {
    "bert": "sentence-transformers/multi-qa-distilbert-cos-v1",
    "albert": "sentence-transformers/paraphrase-albert-small-v2",
    "roberta": "sentence-transformers/all-distilroberta-v1",
}

display_titles = {
    "bert": "BERT",
    "albert": "ALBERT",
    "roberta": "RoBERTa",
}

# Define the CLI interface for modeling our data with different
# transformer models. We want to control the type of the tokenizer
# and the transformer we use, as well as the input and output
# directories.
parser = ArgumentParser()
parser.add_argument("-m", "--model", choices=available_models.keys(), required=True)
parser.add_argument("-i", "--input", required=True)
parser.add_argument("-o", "--output", required=True)
args = parser.parse_args()

input_dir = args.input
output_dir = args.output
model_name = available_models[args.model]
display_name = display_titles[args.model]

# To remove random glyphs and other noise, we only keep tokens
# that appear in the nltk words corpus.
nltk.download("words")
words = set(nltk.corpus.words.words())


def extract_words(document):
    # Tokenize the document and keep only the lowercased tokens that
    # appear in the corpus, joined back together with spaces.
    cleaned = ""
    for word in nltk.wordpunct_tokenize(document):
        if word.lower() in words:
            cleaned += word.lower() + " "
    return cleaned
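# For example, extract_words("Hello, world! \x9f") should return "hello world "
# (assuming both tokens are in the corpus): punctuation and stray glyphs are
# dropped, and the final concatenation leaves a trailing space.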

# Iterate over all of the files in the provided data directory.
# Parse each file with Beautiful Soup to pull the relevant text
# out of the markup, then clean it with extract_words.
data = Path(input_dir).iterdir()
data = map(lambda doc: doc.read_bytes(), data)
data = map(lambda doc: BeautifulSoup(doc, "html.parser"), data)
data = map(lambda doc: doc.get_text(), data)
data = filter(lambda doc: len(doc) > 0, data)
data = map(extract_words, data)
data = filter(lambda doc: len(doc) > 10, data)
data = list(data)
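# Note that map and filter are lazy; nothing is read or parsed until list()
# drains the pipeline here, after which every document can be encoded in a
# single batch below.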

# Initialize the transformer model and compute all of the document
# embeddings with BERT and friends.
model = SentenceTransformer(model_name)
embeddings = model.encode(data, show_progress_bar=True)
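# embeddings is a 2-D numpy array of shape (n_documents, embedding_dim);
# the checkpoints listed above should all produce 768-dimensional vectors.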

# Fit a t-SNE model on the embedding space. Squash down to 2
# dimensions for visualization purposes.
tsne = TSNE(n_components=2, random_state=2, init="pca", learning_rate="auto", perplexity=40)
tsne = tsne.fit_transform(embeddings)
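# Note: the perplexity sweep further down re-fits t-SNE and reassigns this
# variable, so this initial projection (perplexity = 40) is only a first look.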

# Hyperparameter optimization: sweep DBSCAN's eps over a fine grid and
# record the silhouette score, the Calinski-Harabasz score, and the
# number of outliers (points labeled -1) for each value.
eps_grid = np.arange(0.001, 1, 0.001)
silhouettes = []
outliers = []
ch = []

for eps in eps_grid:
    dbscan = DBSCAN(eps, metric="cosine", n_jobs=-1)
    dbscan = dbscan.fit_predict(embeddings)

    # Both scores need at least two distinct labels to be defined.
    if len(np.unique(dbscan)) > 1:
        silhouettes.append(silhouette_score(embeddings, dbscan, metric="cosine"))
        ch.append(calinski_harabasz_score(embeddings, dbscan))
    else:
        silhouettes.append(0)
        ch.append(0)

    outliers.append(len(dbscan[dbscan == -1]))
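# Note that the scores are computed on the original embeddings rather than
# the 2-D t-SNE projection, so they measure cluster quality in the space
# DBSCAN actually operates on.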

# Pick the eps with the best silhouette score, cluster once with it, and
# then visualize the clustering under a range of t-SNE perplexities.
best_eps = eps_grid[np.argmax(silhouettes)]
dbscan = DBSCAN(best_eps, metric="cosine", n_jobs=-1)
dbscan = dbscan.fit_predict(embeddings)

for p in range(15, 51):
    tsne = TSNE(n_components=2, perplexity=p, learning_rate="auto", init="pca", metric="cosine")
    tsne = tsne.fit_transform(embeddings)

    # Plot clustered points colored by label, and outliers (label -1) in grey.
    plt.figure()
    plt.scatter(tsne[dbscan != -1][:, 0], tsne[dbscan != -1][:, 1], s=0.5, c=dbscan[dbscan != -1], cmap="hsv")
    plt.scatter(tsne[dbscan == -1][:, 0], tsne[dbscan == -1][:, 1], s=0.5, c="#abb8c3")
    plt.title(f"{display_name} Embeddings Visualized with T-SNE (p = {p})")
    plt.savefig(f"{output_dir}/tsne_{p:02}.png", format="png", dpi=600)
    plt.close()
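# This writes one frame per perplexity (tsne_15.png through tsne_50.png)
# into output_dir, which makes it easy to flip through the projections.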

# Summary plots for the eps sweep. Note that these land in the current
# working directory rather than output_dir.
plt.figure()
plt.plot(eps_grid, silhouettes, lw=0.5, color="#dc322f", label="silhouette score")
plt.legend()
plt.xlabel("Epsilon")
plt.ylabel("silhouette score")
plt.title("Optimizing Epsilon by Silhouette Score")
plt.savefig("silhouettes.png", format="png", dpi=600)
plt.close()
plt.figure()
plt.plot(eps_grid, outliers, lw=0.5, color="#dc322f", label="outliers")
plt.legend()
plt.xlabel("Epsilon")
plt.ylabel("outliers")
plt.title("Optimizing Epsilon by Number of Outliers")
plt.savefig("outliers.png", format="png", dpi=600)
plt.close()
plt.figure()
plt.plot(eps_grid, ch, lw=0.5, color="#dc322f", label="Calinski-Harabasz score")
plt.legend()
plt.xlabel("Epsilon")
plt.ylabel("Calinski-Harabasz score")
plt.title("Optimizing Epsilon by Calinski-Harabasz Score")
plt.savefig("calinski-harabasz.png", format="png", dpi=600)
plt.close()