"""Train a spaCy text classifier on labelled text and save it to disk.

Reads JSON Lines records (each with a ``text`` and a ``label`` field) from
``data/firstStep_file.jsonl``, trains a blank English pipeline containing a
single ``textcat_multilabel`` component for a fixed number of epochs, and
writes the trained pipeline to ``./my_trained_model``.
"""

import random

import jsonlines
import spacy
from spacy.training import Example

# Single source of truth for the categories: used both to register the labels
# on the component and to build the per-example ``cats`` dict.
LABELS = (
    "CapitalRequirements",
    "ConsumerProtection",
    "RiskManagement",
    "ReportingAndCompliance",
    "CorporateGovernance",
)

# Seed once, before any model weights are created, so that weight
# initialisation and shuffling are reproducible across runs. Seeding inside
# the epoch loop would leave initialisation unseeded and reset the RNG to the
# same state at the start of every epoch.
spacy.util.fix_random_seed(1)

nlp = spacy.blank("en")

# NOTE(review): every record carries exactly one label, so the mutually
# exclusive ``textcat`` component may be a better fit than
# ``textcat_multilabel`` - confirm before changing.
textcat = nlp.add_pipe('textcat_multilabel', last=True)
for label_name in LABELS:
    textcat.add_label(label_name)

processed_data_file = "data/firstStep_file.jsonl"

with jsonlines.open(processed_data_file) as reader:
    processed_data = list(reader)

# One Example per record. ``cats`` maps every known label to True/False; a
# record whose label is not in LABELS ends up with all categories False.
spacy_train_data = []
for obj in processed_data:
    text = obj["text"]
    label = {label_name: obj["label"] == label_name for label_name in LABELS}
    spacy_train_data.append(
        Example.from_dict(nlp.make_doc(text), {"cats": label})
    )

if not spacy_train_data:
    # Fail early with a clear message instead of saving an untrained model.
    raise ValueError(f"No training records found in {processed_data_file!r}")

# Hand the real examples to initialize() so the model is set up from actual
# data rather than from spaCy's placeholder sample.
optimizer = nlp.initialize(lambda: spacy_train_data)

n_iter = 10
for i in range(n_iter):
    random.shuffle(spacy_train_data)
    losses = {}
    for batch in spacy.util.minibatch(spacy_train_data, size=8):
        nlp.update(batch, losses=losses, sgd=optimizer)
    # Losses are accumulated over all batches of this epoch.
    print("Iteration:", i, "Losses:", losses)

output_dir = "./my_trained_model"
nlp.to_disk(output_dir)