from torch import nn
import torch  # needed for torch.max in MLP.predict
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


def get_eval_metric(y_pred, y_test):
    return {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred, average='weighted'),
        'recall': recall_score(y_test, y_pred, average='weighted'),
        'f1': f1_score(y_test, y_pred, average='weighted'),
        'confusion_mat': confusion_matrix(y_test, y_pred, normalize='true'),
    }
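
# Illustrative usage of get_eval_metric (the label values below are made up;
# the function only assumes array-like predicted and true labels):
#   metrics = get_eval_metric(y_pred=[0, 1, 2, 2], y_test=[0, 1, 2, 1])
#   metrics['accuracy']       # 0.75
#   metrics['confusion_mat']  # 3x3 confusion matrix, rows normalized over true labels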


class MLP(nn.Module):
    def __init__(self, input_size=768, hidden_size=256, output_size=3, dropout_rate=.2, class_weights=None):
        super().__init__()
        self.class_weights = class_weights
        self.activation = nn.ReLU()
        self.bn1 = nn.BatchNorm1d(hidden_size)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        # SetFit passes features as a dict with a 'sentence_embedding' key;
        # plain tensors are also accepted.
        input_is_dict = False
        if isinstance(x, dict):
            assert "sentence_embedding" in x
            input_is_dict = True
            x = x['sentence_embedding']
        x = self.fc1(x)
        x = self.bn1(x)
        x = self.activation(x)
        x = self.dropout(x)
        x = self.fc2(x)
        if input_is_dict:
            return {'logits': x}
        return x

    def predict(self, x):
        _, predicted = torch.max(self.forward(x), 1)
        return predicted

    def predict_proba(self, x):
        # Note: returns raw logits rather than softmax probabilities.
        return self.forward(x)

    def get_loss_fn(self):
        return nn.CrossEntropyLoss(weight=self.class_weights, reduction='mean')
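
# Sketch, not executed by this script: the MLP exposes the predict /
# predict_proba / get_loss_fn interface SetFit expects from a differentiable
# classification head, so it could presumably be paired with a
# SentenceTransformer body along these lines (the class weights are
# placeholders; 'financial-roberta' is the local model used later in this
# script; passing a custom torch head to SetFitModel is an assumption here):
#
#   from setfit import SetFitModel
#   from sentence_transformers import SentenceTransformer
#
#   body = SentenceTransformer('financial-roberta')
#   head = MLP(input_size=body.get_sentence_embedding_dimension(),
#              output_size=3,
#              class_weights=torch.tensor([1.0, 1.0, 1.0]))
#   setfit_model = SetFitModel(model_body=body, model_head=head)
#   logits = setfit_model.model_head({'sentence_embedding': torch.randn(4, head.fc1.in_features)})['logits']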


if __name__ == '__main__':
    from setfit import SetFitModel, Trainer, TrainingArguments
    from datasets import Dataset, load_dataset, DatasetDict
    from sentence_transformers import SentenceTransformer, models, util
    from sentence_transformers.losses import BatchAllTripletLoss, BatchHardSoftMarginTripletLoss, BatchHardTripletLoss, BatchSemiHardTripletLoss
    from sklearn.linear_model import LogisticRegression
    import sys
    import os
    import warnings
    import torch
    import torch.nn as nn
    import torch.nn.functional as F
    from datetime import datetime
    import torch.optim as optim
    from statistics import mean
    from pprint import pprint
    from torch.utils.data import DataLoader, TensorDataset
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    from safetensors.torch import load_model, save_model
    from itertools import chain
    from time import perf_counter
    from tqdm import trange
    from collections import Counter
    from sklearn.utils.class_weight import compute_class_weight
    import numpy as np
    import matplotlib.pyplot as plt

    warnings.filterwarnings("ignore")

    SEED = 1003200212 + 1
    DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print(DEVICE)

    start = perf_counter()

    # Make the project root and the financial_dataset helpers importable.
    sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
    dataset_dir = os.path.abspath(os.path.join(os.getcwd(), '..', '..', 'financial_dataset'))
    sys.path.append(dataset_dir)
    from load_test_data import get_labels_df, get_texts
    from train_classificator import plot_labels_distribution

    def split_text(text, chunk_size=1200, chunk_overlap=200):
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap,
            length_function=len, separators=[" ", ",", "\n"]
        )
        text_chunks = text_splitter.create_documents([text])
        return text_chunks
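
    # Illustrative call (not executed): split_text wraps langchain's
    # RecursiveCharacterTextSplitter and returns Document objects whose
    # .page_content holds each overlapping chunk, e.g.
    #   chunks = split_text("some long annual report text ...", chunk_size=1200, chunk_overlap=200)
    #   [c.page_content for c in chunks]  # strings of roughly <= 1200 characters each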
    labels_dir = dataset_dir + '/csvs/'
    df = get_labels_df(labels_dir)
    texts_dir = dataset_dir + '/txts/'
    texts = get_texts(texts_dir)
    # df = df.iloc[[0, 13, 113], :]
    # print(df.loc[:, 'Label'])
    # texts = [texts[0]] + [texts[13]] + [texts[113]]
    print(len(df), len(texts))
    print(mean(list(map(len, texts))))

    # Split each document into overlapping chunks and keep the raw chunk texts.
    documents = [split_text(text, chunk_size=3_200, chunk_overlap=200) for text in texts]
    docs_chunks = [[doc.page_content for doc in document] for document in documents]
    # print([len(text_chunks) for text_chunks in docs_chunks])

    model = SentenceTransformer('financial-roberta')
    model = model.to('cuda:0')

    # Get sentence embeddings for each chunk; every chunk inherits the label
    # of its source document.
    doc_embeddings = [model.encode(doc_chunks, show_progress_bar=True).tolist() for doc_chunks in docs_chunks]
    embeddings = [embedding for doc_embedding in doc_embeddings for embedding in doc_embedding]
    texts = [text for doc_chunks in docs_chunks for text in doc_chunks]
    labels = np.repeat(df['Label'], [len(document) for document in documents]).tolist()
    # print(df.loc[:, 'Label'])
    # print([len(text) for text in texts])
    # print([len(emb) for emb in embeddings])
    # print(labels)

    dataset = Dataset.from_dict({
        'texts': texts,
        'labels': labels,
        'embeddings': embeddings,
    })
    print(len(dataset['texts']))
    print(dataset['labels'])
    dataset = dataset.class_encode_column('labels')
    print(len(dataset))

    # Stratified 80/10/10 train/val/test split over the chunk-level labels.
    train_test_dataset = dataset.train_test_split(test_size=.2, stratify_by_column='labels')
    val_test_dataset = train_test_dataset['test'].train_test_split(test_size=.5, stratify_by_column='labels')
    dataset = DatasetDict({
        'train': train_test_dataset['train'],
        'val': val_test_dataset['train'],
        'test': val_test_dataset['test'],
    })

    plot_labels_distribution(dataset, save_as_filename='plots/finetuned_st_label_distr.png')
    dataset.push_to_hub("CabraVC/vector_dataset_roberta-fine-tuned", private=True)