
jzju/sbert-sv-lim2

This model is trained from KBLab/bert-base-swedish-cased-new with data from sbx/superlim-2.

This is a sentence-transformers model: it maps sentences & paragraphs to a 256-dimensional dense vector space and can be used for tasks like clustering or semantic search.

Usage (Sentence-Transformers)

Using this model is straightforward once you have sentence-transformers installed:

pip install -U sentence-transformers

Then you can use the model like this:

from sentence_transformers import SentenceTransformer
sentences = ["This is an example sentence", "Each sentence is converted"]

model = SentenceTransformer('jzju/sbert-sv-lim2')
embeddings = model.encode(sentences)
print(embeddings)
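
The embeddings can be compared with cosine similarity, for example for semantic search (a minimal sketch using sentence_transformers.util; the Swedish sentences are only illustrative):

from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('jzju/sbert-sv-lim2')

# Illustrative Swedish sentences
corpus = ["Katten sover på soffan", "Börsen steg kraftigt idag"]
query = "En katt som vilar"

corpus_emb = model.encode(corpus, convert_to_tensor=True)
query_emb = model.encode(query, convert_to_tensor=True)

# Cosine similarity between the query and each corpus sentence
scores = util.cos_sim(query_emb, corpus_emb)
print(scores)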

Training Code

The model is trained in two stages: first on graded similarity pairs from the swepar, swesim_relatedness and swesim_similarity subsets of superlim-2 with CosineSimilarityLoss, then on swenli with MultipleNegativesRankingLoss.

from datasets import load_dataset, concatenate_datasets
from sentence_transformers import (
    SentenceTransformer,
    InputExample,
    losses,
    models,
    util,
    datasets,
)
from torch.utils.data import DataLoader
from torch import nn
import random

# Base transformer, mean pooling over token embeddings, and a dense layer
# projecting the sentence embedding down to 256 dimensions
word_embedding_model = models.Transformer(
    "KBLab/bert-base-swedish-cased-new", max_seq_length=256
)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
dense_model = models.Dense(
    in_features=pooling_model.get_sentence_embedding_dimension(),
    out_features=256,
    activation_function=nn.Tanh(),
)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model, dense_model])


def pair():
    # Train on graded similarity pairs (sentences or words) with labels
    # normalised to [0, 1], using CosineSimilarityLoss.
    def norm(x):
        # m is the maximum label of the current subset (set in the loop below)
        x["label"] = x["label"] / m
        return x

    dd = []
    for sub in ["swepar", "swesim_relatedness", "swesim_similarity"]:
        ds = concatenate_datasets(
            [d for d in load_dataset("sbx/superlim-2", sub).values()]
        )
        if "sentence_1" in ds.features:
            ds = ds.rename_column("sentence_1", "d1")
            ds = ds.rename_column("sentence_2", "d2")
        else:
            ds = ds.rename_column("word_1", "d1")
            ds = ds.rename_column("word_2", "d2")
        m = max([d["label"] for d in ds])  # largest label in this subset, used by norm()
        dd.append(ds.map(norm))
    ds = concatenate_datasets(dd)

    train_examples = []
    for d in ds:
        train_examples.append(InputExample(texts=[d["d1"], d["d2"]], label=d["label"]))
    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=64)
    train_loss = losses.CosineSimilarityLoss(model)
    model.fit(
        train_objectives=[(train_dataloader, train_loss)], epochs=10, warmup_steps=100
    )


def nli():
    # Train on SweNLI with MultipleNegativesRankingLoss, treating the label-0
    # counterpart of each sentence as the positive and the label-1 counterpart
    # as the hard negative.
    ds = concatenate_datasets(
        [d for d in load_dataset("sbx/superlim-2", "swenli").values()]
    )

    def add_to_samples(sent1, sent2, label):
        # Group counterpart sentences by NLI label for each anchor sentence
        if sent1 not in train_data:
            train_data[sent1] = {0: set(), 1: set(), 2: set()}
        train_data[sent1][label].add(sent2)

    train_data = {}
    for d in ds:
        add_to_samples(d["premise"], d["hypothesis"], d["label"])
        add_to_samples(d["hypothesis"], d["premise"], d["label"])

    # Build (anchor, positive, hard negative) triplets in both directions
    train_samples = []
    for sent1, others in train_data.items():
        if len(others[0]) > 0 and len(others[1]) > 0:
            train_samples.append(
                InputExample(
                    texts=[
                        sent1,
                        random.choice(list(others[0])),
                        random.choice(list(others[1])),
                    ]
                )
            )
            train_samples.append(
                InputExample(
                    texts=[
                        random.choice(list(others[0])),
                        sent1,
                        random.choice(list(others[1])),
                    ]
                )
            )
    # NoDuplicatesDataLoader avoids repeating a sentence within a batch, which
    # would create false negatives for MultipleNegativesRankingLoss
    train_dataloader = datasets.NoDuplicatesDataLoader(train_samples, batch_size=64)
    train_loss = losses.MultipleNegativesRankingLoss(model)
    model.fit(
        train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=100
    )


pair()
nli()
model.save("sbert-sv-lim2")  # save to a local output directory (path is illustrative)
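
After training, the saved model can be loaded from the output directory in the same way as the published checkpoint (a minimal sketch; the path matches the illustrative save() path above):

from sentence_transformers import SentenceTransformer

# Load the locally saved model (output directory from model.save() above)
model = SentenceTransformer("sbert-sv-lim2")
emb = model.encode(["Det här är en exempelmening"])
print(emb.shape)  # (1, 256)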
