import io |
import shutil |
import requests |
import json |
import time |
import torch |
import orjson |
import zipfile |
import torch.nn.functional as F |
from torch.utils.data import Dataset, DataLoader |
from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments, BertConfig |
from transformers import AutoTokenizer, AutoModelForTokenClassification |
# Base URL that every HTTP request in this script is sent to.
API_URL = "http://dockerbase.duo:8000"

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def load_data(project_id=1, timeout=30):
    """Download the approved annotations and the span types of one project.

    Logs in at ``API_URL``, starts a JSONL export of the project, polls the
    task-status endpoint once per second until the export reports ready,
    downloads the resulting zip archive, and finally fetches the project's
    span types.

    Args:
        project_id: Id of the project to export. Defaults to ``1``, the id
            the export request was previously hard-coded to.
        timeout: Seconds to wait on each individual HTTP request.

    Returns:
        A ``(labels, records)`` tuple. ``labels`` is the decoded JSON body of
        the span-types endpoint; ``records`` is a list holding one decoded
        JSON object per non-blank line of ``admin.jsonl`` in the archive.

    Raises:
        requests.HTTPError: If any request answers with an error status.
    """
    project_url = f"{API_URL}/v1/projects/{project_id}"

    # NOTE(review): credentials are hard-coded here; consider moving them to
    # configuration or environment variables.
    login = requests.post(
        API_URL + "/v1/auth/login/",
        json={"username": "admin", "password": "123"},
        timeout=timeout,
    )
    login.raise_for_status()
    headers = {"Authorization": "Token " + login.json()["key"]}

    export = requests.post(
        project_url + "/download",
        json={"format": "JSONL", "exportApproved": True},
        headers=headers,
        timeout=timeout,
    )
    export.raise_for_status()
    task_id = export.json()["task_id"]

    # Poll until the server reports that the export task has finished.
    print("Waiting for export task to be ready.", end="", flush=True)
    while True:
        status = requests.get(
            f"{API_URL}/v1/tasks/status/{task_id}",
            headers=headers,
            timeout=timeout,
        )
        status.raise_for_status()
        if status.json()["ready"]:
            break
        time.sleep(1)
        print(".", end="", flush=True)
    print("")

    archive = requests.get(
        project_url + "/download",
        params={"taskId": task_id},
        headers=headers,
        timeout=timeout,
    )
    archive.raise_for_status()
    with zipfile.ZipFile(io.BytesIO(archive.content), "r") as zip_ref:
        data = zip_ref.read("admin.jsonl").decode("utf-8")

    span_types = requests.get(
        project_url + "/span-types",
        headers=headers,
        timeout=timeout,
    )
    span_types.raise_for_status()
    labels = span_types.json()

    # Split on "\n" only: str.splitlines() would also break on characters
    # such as U+2028 that may legally appear inside a JSON string.
    records = [orjson.loads(line) for line in data.split("\n") if line.strip()]
    return labels, records
# Fetch the span types and the annotated records.
labels, data = load_data()

# Tag vocabulary: "O" is id 0 and the i-th span type owns the consecutive
# pair B-<text> = 2*i + 1 and I-<text> = 2*i + 2.
label_to_id = {}
for index, span_type in enumerate(labels):
    name = span_type["text"]
    label_to_id["B-" + name] = 2 * index + 1
    label_to_id["I-" + name] = 2 * index + 2
label_to_id["O"] = 0
id_to_label = {tag_id: tag for tag, tag_id in label_to_id.items()}

# Load the pretrained checkpoint with a classification head sized to the
# tag vocabulary, and store both tag mappings in the model config.
checkpoint = "dbmdz/bert-base-turkish-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForTokenClassification.from_pretrained(
    checkpoint,
    num_labels=len(label_to_id),
)
model = model.to(device)
model.config.id2label = id_to_label
model.config.label2id = label_to_id
from datasets import DatasetDict, Dataset |
def preprocess_data(item, tokenizer, label_to_id, max_length=128):
    """Tokenize one annotated record and attach a BIO tag id to every token.

    A token receives a span's label when its character offsets lie entirely
    inside that span (the first matching span wins). The first token of each
    annotated span is tagged ``B-<label>`` and the following tokens of that
    same span ``I-<label>``; every other token is tagged ``O``.

    Args:
        item: Record with a ``"text"`` string and a ``"label"`` sequence of
            ``(start, end, label)`` character spans.
        tokenizer: Callable invoked with the text plus the keyword arguments
            below; its result must provide ``"input_ids"``,
            ``"attention_mask"`` and ``"offset_mapping"`` tensors with a
            leading batch dimension of one.
        label_to_id: Mapping from tag string (``"O"``, ``"B-…"``, ``"I-…"``)
            to integer id.
        max_length: Length the tokenizer is asked to pad/truncate to.

    Returns:
        Dict with flattened ``"input_ids"`` and ``"attention_mask"`` tensors
        and ``"labels"``, a list holding one tag id per token.
    """
    encoding = tokenizer(
        item["text"],
        return_offsets_mapping=True,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )
    # Plain ints are cheaper to compare than 0-d tensors in the loops below.
    offsets = encoding["offset_mapping"][0].tolist()
    spans = item["label"]

    # NOTE(review): zero-width tokens (special/padding) keep the "O" tag, so
    # they are not excluded from the loss — confirm this is intended.
    tags = ["O"] * len(offsets)
    previous_span = None  # index of the span the preceding token belonged to
    for token_idx, (tok_start, tok_end) in enumerate(offsets):
        if tok_start == tok_end:
            previous_span = None
            continue
        current_span = None
        for span_idx, (start, end, label) in enumerate(spans):
            if start <= tok_start and tok_end <= end:
                current_span = span_idx
                # Continue an entity only while staying inside the very same
                # span; a different span of the same type starts a new entity.
                prefix = "I-" if span_idx == previous_span else "B-"
                tags[token_idx] = prefix + label
                break
        previous_span = current_span

    return {
        "input_ids": encoding["input_ids"].flatten(),
        "attention_mask": encoding["attention_mask"].flatten(),
        "labels": [label_to_id[tag] for tag in tags],
    }
class AddressDataset(torch.utils.data.Dataset):
    """Map-style PyTorch dataset that serves the rows of ``dataset`` as tensors.

    The base class is spelled out in full because the bare name ``Dataset``
    is rebound by a later ``from datasets import ... Dataset`` import in this
    file, which would otherwise make this class subclass the wrong type.
    """

    def __init__(self, dataset):
        """Wrap ``dataset``, an indexable collection of dict rows."""
        self.dataset = dataset

    def __len__(self):
        """Return the number of rows in the wrapped collection."""
        return len(self.dataset)

    def __getitem__(self, index):
        """Return row ``index`` with every value converted to a tensor."""
        item = self.dataset[index]
        return {key: torch.tensor(val) for key, val in item.items()}
def _generate_examples():
    """Yield one preprocessed example per annotated record in ``data``."""
    for record in data:
        yield preprocess_data(record, tokenizer, label_to_id)


# Build the dataset from the generator, then hold out 20% of it for evaluation.
dataset = Dataset.from_generator(_generate_examples)
dataset = dataset.train_test_split(test_size=0.2)
dataset = DatasetDict({split: dataset[split] for split in ("train", "test")})

# Training configuration: 35 epochs, batches of 32, logging once per epoch.
training_args = TrainingArguments(
    output_dir="./results",
    logging_strategy="epoch",
    num_train_epochs=35,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
)
from sklearn.preprocessing import MultiLabelBinarizer |
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score |
def compute_metrics(pred, id_to_label):
    """Compute token-level accuracy, precision, recall and F1 for entity tags.

    Gold and predicted tags are compared position by position. Positions
    where both the gold and the predicted tag are ``"O"`` are left out, so
    agreement on non-entity tokens does not inflate the scores. Precision,
    recall and F1 are micro-averaged over every tag except ``"O"``.

    Args:
        pred: Object exposing ``label_ids`` (gold tag ids per sequence) and
            ``predictions`` (per-token scores; the arg-max over the last
            axis is taken as the predicted tag id).
        id_to_label: Mapping from integer tag id to tag string.

    Returns:
        Dict with ``"accuracy"``, ``"precision"``, ``"recall"`` and ``"f1"``.
        All four are ``0.0`` when no position carries an entity tag in
        either the gold labels or the predictions.
    """
    predicted_ids = pred.predictions.argmax(-1)

    gold_tags = []
    pred_tags = []
    for gold_row, pred_row in zip(pred.label_ids, predicted_ids):
        for gold_id, pred_id in zip(gold_row, pred_row):
            gold_tag = id_to_label.get(int(gold_id))
            if gold_tag is None:
                # Gold ids outside the vocabulary are skipped rather than
                # raising KeyError.
                continue
            pred_tag = id_to_label[int(pred_id)]
            if gold_tag == "O" and pred_tag == "O":
                continue
            gold_tags.append(gold_tag)
            pred_tags.append(pred_tag)

    if not gold_tags:
        return {"accuracy": 0.0, "precision": 0.0, "recall": 0.0, "f1": 0.0}

    # Restricting `labels` to the entity tags excludes "O" from the average.
    entity_tags = sorted(tag for tag in id_to_label.values() if tag != "O")
    scoring = {"labels": entity_tags, "average": "micro", "zero_division": 0}
    return {
        "accuracy": accuracy_score(gold_tags, pred_tags),
        "precision": precision_score(gold_tags, pred_tags, **scoring),
        "recall": recall_score(gold_tags, pred_tags, **scoring),
        "f1": f1_score(gold_tags, pred_tags, **scoring),
    }
# Wire the model, the data splits and the metric callback into the trainer.
# The callback binds the module-level id_to_label mapping.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    compute_metrics=lambda eval_pred: compute_metrics(eval_pred, id_to_label),
)

# Train, evaluate on the held-out split, then save the model.
trainer.train()
trainer.evaluate()
trainer.save_model("./model")

# Copy the saved model's config file into the working directory.
shutil.copy("./model/config.json", "./config.json")