metadata

language:
  - en
library_name: sentence-transformers
tags:
  - sentence-transformers
  - sentence-similarity
  - feature-extraction
  - dataset_size:1K<n<10K
  - loss:MatryoshkaLoss
  - loss:CoSENTLoss
base_model: distilbert/distilbert-base-uncased
metrics:
  - pearson_cosine
  - spearman_cosine
  - pearson_manhattan
  - spearman_manhattan
  - pearson_euclidean
  - spearman_euclidean
  - pearson_dot
  - spearman_dot
  - pearson_max
  - spearman_max
widget:
  - source_sentence: A plane in the sky.
    sentences:
      - Two airplanes in the sky.
      - Two women are sitting in a cafe.
      - Turkey's PM Warns Against Protests
  - source_sentence: A man jumping rope
    sentences:
      - A man climbs a rope.
      - Blast on Indian train kills one
      - Israel expands subsidies to settlements
  - source_sentence: A baby is laughing.
    sentences:
      - The baby laughed in his car seat.
      - The girl is playing the guitar.
      - Bangladesh Islamist leader executed
  - source_sentence: A plane is landing.
    sentences:
      - A animated airplane is landing.
      - A man plays an acoustic guitar.
      - Obama urges no new sanctions on Iran
  - source_sentence: A boy is vacuuming.
    sentences:
      - A little boy is vacuuming the floor.
      - Suicide bomber strikes in Syria
      - 32 die in Bangladesh protest
pipeline_tag: sentence-similarity
model-index:
  - name: SentenceTransformer based on distilbert/distilbert-base-uncased
    results:
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts dev 768
          type: sts-dev-768
        metrics:
          - type: pearson_cosine
            value: 0.8580007118837358
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.871820299536176
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.8579597824452743
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.8611676230134329
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.8584693242993966
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.8617539394714434
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.6259192943899555
            name: Pearson Dot
          - type: spearman_dot
            value: 0.6245849846631494
            name: Spearman Dot
          - type: pearson_max
            value: 0.8584693242993966
            name: Pearson Max
          - type: spearman_max
            value: 0.871820299536176
            name: Spearman Max
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts dev 512
          type: sts-dev-512
        metrics:
          - type: pearson_cosine
            value: 0.855328467168775
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.8708546925464771
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.8571701704416792
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.8609603329646862
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.8577665956034857
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.8611867637483455
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.6301839390729895
            name: Pearson Dot
          - type: spearman_dot
            value: 0.6312551259723912
            name: Spearman Dot
          - type: pearson_max
            value: 0.8577665956034857
            name: Pearson Max
          - type: spearman_max
            value: 0.8708546925464771
            name: Spearman Max
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts dev 256
          type: sts-dev-256
        metrics:
          - type: pearson_cosine
            value: 0.8534192140857989
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.8684742287834586
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.8550376893582918
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.8595873940460774
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.855243500036296
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.8595389790366662
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.5692600956239565
            name: Pearson Dot
          - type: spearman_dot
            value: 0.5631798664802073
            name: Spearman Dot
          - type: pearson_max
            value: 0.855243500036296
            name: Pearson Max
          - type: spearman_max
            value: 0.8684742287834586
            name: Spearman Max
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts dev 128
          type: sts-dev-128
        metrics:
          - type: pearson_cosine
            value: 0.8437376978373121
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.8634082420330794
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.8454596574177755
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.85188111210432
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.8479887421152008
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.8537259447832961
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.5513203019384504
            name: Pearson Dot
          - type: spearman_dot
            value: 0.5500687993669725
            name: Spearman Dot
          - type: pearson_max
            value: 0.8479887421152008
            name: Pearson Max
          - type: spearman_max
            value: 0.8634082420330794
            name: Spearman Max
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts dev 64
          type: sts-dev-64
        metrics:
          - type: pearson_cosine
            value: 0.8272184719216283
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.8541030591238341
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.8307462071466211
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.8406982840852595
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.8342382781891662
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.8427338906559259
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.494520518114596
            name: Pearson Dot
          - type: spearman_dot
            value: 0.49218360841938574
            name: Spearman Dot
          - type: pearson_max
            value: 0.8342382781891662
            name: Pearson Max
          - type: spearman_max
            value: 0.8541030591238341
            name: Spearman Max
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts dev 32
          type: sts-dev-32
        metrics:
          - type: pearson_cosine
            value: 0.795037446434113
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.8337679875014413
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.8120635303724889
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.8249212312847407
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.8157607542813738
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.8262833782950811
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.44442829473227297
            name: Pearson Dot
          - type: spearman_dot
            value: 0.4333209339301445
            name: Spearman Dot
          - type: pearson_max
            value: 0.8157607542813738
            name: Pearson Max
          - type: spearman_max
            value: 0.8337679875014413
            name: Spearman Max
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts dev 16
          type: sts-dev-16
        metrics:
          - type: pearson_cosine
            value: 0.7402920507586056
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.7953398971914366
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.7661819958789702
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.7806209887724272
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.7753319460863385
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.788448392758016
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.2914268467178465
            name: Pearson Dot
          - type: spearman_dot
            value: 0.2731801701260987
            name: Spearman Dot
          - type: pearson_max
            value: 0.7753319460863385
            name: Pearson Max
          - type: spearman_max
            value: 0.7953398971914366
            name: Spearman Max
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts test 768
          type: sts-test-768
        metrics:
          - type: pearson_cosine
            value: 0.8355126555886146
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.8474343771835785
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.8477769261693708
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.8440487632905719
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.8482353907773731
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.8443357402859023
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.575155372226532
            name: Pearson Dot
          - type: spearman_dot
            value: 0.5645826036063977
            name: Spearman Dot
          - type: pearson_max
            value: 0.8482353907773731
            name: Pearson Max
          - type: spearman_max
            value: 0.8474343771835785
            name: Spearman Max
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts test 512
          type: sts-test-512
        metrics:
          - type: pearson_cosine
            value: 0.8345636179092932
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.847969741682177
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.8471375569231226
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.8432315278152519
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.8475673449165414
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.8438566473590643
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.5890647647307824
            name: Pearson Dot
          - type: spearman_dot
            value: 0.579599198660516
            name: Spearman Dot
          - type: pearson_max
            value: 0.8475673449165414
            name: Pearson Max
          - type: spearman_max
            value: 0.847969741682177
            name: Spearman Max
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts test 256
          type: sts-test-256
        metrics:
          - type: pearson_cosine
            value: 0.8264268046184008
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.8414784020776254
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.8414377075419083
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.8388634084489552
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.8423455168447094
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.8400797815114284
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.5229860109488433
            name: Pearson Dot
          - type: spearman_dot
            value: 0.5099269577284724
            name: Spearman Dot
          - type: pearson_max
            value: 0.8423455168447094
            name: Pearson Max
          - type: spearman_max
            value: 0.8414784020776254
            name: Spearman Max
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts test 128
          type: sts-test-128
        metrics:
          - type: pearson_cosine
            value: 0.8189773000477083
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.837625236881656
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.8349887918183595
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.8336489133404312
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.8365085956274743
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.8347627903646608
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.49799738412782535
            name: Pearson Dot
          - type: spearman_dot
            value: 0.48970409354637134
            name: Spearman Dot
          - type: pearson_max
            value: 0.8365085956274743
            name: Pearson Max
          - type: spearman_max
            value: 0.837625236881656
            name: Spearman Max
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts test 64
          type: sts-test-64
        metrics:
          - type: pearson_cosine
            value: 0.8062259318483077
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.8292433269349447
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.8236527010227455
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.8243846152203906
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.8273451113428331
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.8269777736926925
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.4318247709105578
            name: Pearson Dot
          - type: spearman_dot
            value: 0.4325030690630689
            name: Spearman Dot
          - type: pearson_max
            value: 0.8273451113428331
            name: Pearson Max
          - type: spearman_max
            value: 0.8292433269349447
            name: Spearman Max
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts test 32
          type: sts-test-32
        metrics:
          - type: pearson_cosine
            value: 0.7769698706658718
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.813231133965274
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.8040659399939705
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.8083901845044422
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.8089540323890078
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.8126434700070444
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.3721968691924307
            name: Pearson Dot
          - type: spearman_dot
            value: 0.36359211044547146
            name: Spearman Dot
          - type: pearson_max
            value: 0.8089540323890078
            name: Pearson Max
          - type: spearman_max
            value: 0.813231133965274
            name: Spearman Max
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts test 16
          type: sts-test-16
        metrics:
          - type: pearson_cosine
            value: 0.7350580362911046
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.7811480253828886
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.7686995805327835
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.7767016091591996
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.7732639293607727
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.7798783495241994
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.25479413300114095
            name: Pearson Dot
          - type: spearman_dot
            value: 0.24117846955339683
            name: Spearman Dot
          - type: pearson_max
            value: 0.7732639293607727
            name: Pearson Max
          - type: spearman_max
            value: 0.7811480253828886
            name: Spearman Max

SentenceTransformer based on distilbert/distilbert-base-uncased

This is a sentence-transformers model fine-tuned from distilbert/distilbert-base-uncased on the sentence-transformers/stsb dataset. It maps sentences and paragraphs to a 768-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more. Because it was trained with MatryoshkaLoss, it was also evaluated with embeddings truncated to 512, 256, 128, 64, 32, and 16 dimensions (see Evaluation below).

Model Details

Model Description

Model Type: Sentence Transformer
Base model: distilbert/distilbert-base-uncased
Maximum Sequence Length: 512 tokens
Output Dimensionality: 768 dimensions
Similarity Function: Cosine Similarity
Training Dataset:
- sentence-transformers/stsb
Language: en

Model Sources

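Documentation: [Sentence Transformers Documentation](https://sbert.net)
Repository: [Sentence Transformers on GitHub](https://github.com/UKPLab/sentence-transformers)
Hugging Face: [Sentence Transformers on Hugging Face](https://huggingface.co/models?library=sentence-transformers)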

Full Model Architecture

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: DistilBertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

Usage

Direct Usage (Sentence Transformers)

First install the Sentence Transformers library:

pip install -U sentence-transformers

Then you can load this model and run inference.

from sentence_transformers import SentenceTransformer

# Download from the 🤗 Hub
model = SentenceTransformer("mrm8488/distilbert-base-matryoshka-sts-v2")
# Run inference
sentences = [
    'A boy is vacuuming.',
    'A little boy is vacuuming the floor.',
    'Suicide bomber strikes in Syria',
]
embeddings = model.encode(sentences)
print(embeddings.shape)
# [3, 768]

# Get the similarity scores for the embeddings
similarities = model.similarity(embeddings, embeddings)
print(similarities.shape)
# [3, 3]

Evaluation

Metrics

Semantic Similarity

Dataset: sts-dev-768
Evaluated with EmbeddingSimilarityEvaluator

Metric	Value
pearson_cosine	0.858
spearman_cosine	0.8718
pearson_manhattan	0.858
spearman_manhattan	0.8612
pearson_euclidean	0.8585
spearman_euclidean	0.8618
pearson_dot	0.6259
spearman_dot	0.6246
pearson_max	0.8585
spearman_max	0.8718

Semantic Similarity

Dataset: sts-dev-512
Evaluated with EmbeddingSimilarityEvaluator

Metric	Value
pearson_cosine	0.8553
spearman_cosine	0.8709
pearson_manhattan	0.8572
spearman_manhattan	0.861
pearson_euclidean	0.8578
spearman_euclidean	0.8612
pearson_dot	0.6302
spearman_dot	0.6313
pearson_max	0.8578
spearman_max	0.8709

Semantic Similarity

Dataset: sts-dev-256
Evaluated with EmbeddingSimilarityEvaluator

Metric	Value
pearson_cosine	0.8534
spearman_cosine	0.8685
pearson_manhattan	0.855
spearman_manhattan	0.8596
pearson_euclidean	0.8552
spearman_euclidean	0.8595
pearson_dot	0.5693
spearman_dot	0.5632
pearson_max	0.8552
spearman_max	0.8685

Semantic Similarity

Dataset: sts-dev-128
Evaluated with EmbeddingSimilarityEvaluator

Metric	Value
pearson_cosine	0.8437
spearman_cosine	0.8634
pearson_manhattan	0.8455
spearman_manhattan	0.8519
pearson_euclidean	0.848
spearman_euclidean	0.8537
pearson_dot	0.5513
spearman_dot	0.5501
pearson_max	0.848
spearman_max	0.8634

Semantic Similarity

Dataset: sts-dev-64
Evaluated with EmbeddingSimilarityEvaluator

Metric	Value
pearson_cosine	0.8272
spearman_cosine	0.8541
pearson_manhattan	0.8307
spearman_manhattan	0.8407
pearson_euclidean	0.8342
spearman_euclidean	0.8427
pearson_dot	0.4945
spearman_dot	0.4922
pearson_max	0.8342
spearman_max	0.8541

Semantic Similarity

Dataset: sts-dev-32
Evaluated with EmbeddingSimilarityEvaluator

Metric	Value
pearson_cosine	0.795
spearman_cosine	0.8338
pearson_manhattan	0.8121
spearman_manhattan	0.8249
pearson_euclidean	0.8158
spearman_euclidean	0.8263
pearson_dot	0.4444
spearman_dot	0.4333
pearson_max	0.8158
spearman_max	0.8338

Semantic Similarity

Dataset: sts-dev-16
Evaluated with EmbeddingSimilarityEvaluator

Metric	Value
pearson_cosine	0.7403
spearman_cosine	0.7953
pearson_manhattan	0.7662
spearman_manhattan	0.7806
pearson_euclidean	0.7753
spearman_euclidean	0.7884
pearson_dot	0.2914
spearman_dot	0.2732
pearson_max	0.7753
spearman_max	0.7953

Semantic Similarity

Dataset: sts-test-768
Evaluated with EmbeddingSimilarityEvaluator

Metric	Value
pearson_cosine	0.8355
spearman_cosine	0.8474
pearson_manhattan	0.8478
spearman_manhattan	0.844
pearson_euclidean	0.8482
spearman_euclidean	0.8443
pearson_dot	0.5752
spearman_dot	0.5646
pearson_max	0.8482
spearman_max	0.8474

Semantic Similarity

Dataset: sts-test-512
Evaluated with EmbeddingSimilarityEvaluator

Metric	Value
pearson_cosine	0.8346
spearman_cosine	0.848
pearson_manhattan	0.8471
spearman_manhattan	0.8432
pearson_euclidean	0.8476
spearman_euclidean	0.8439
pearson_dot	0.5891
spearman_dot	0.5796
pearson_max	0.8476
spearman_max	0.848

Semantic Similarity

Dataset: sts-test-256
Evaluated with EmbeddingSimilarityEvaluator

Metric	Value
pearson_cosine	0.8264
spearman_cosine	0.8415
pearson_manhattan	0.8414
spearman_manhattan	0.8389
pearson_euclidean	0.8423
spearman_euclidean	0.8401
pearson_dot	0.523
spearman_dot	0.5099
pearson_max	0.8423
spearman_max	0.8415

Semantic Similarity

Dataset: sts-test-128
Evaluated with EmbeddingSimilarityEvaluator

Metric	Value
pearson_cosine	0.819
spearman_cosine	0.8376
pearson_manhattan	0.835
spearman_manhattan	0.8336
pearson_euclidean	0.8365
spearman_euclidean	0.8348
pearson_dot	0.498
spearman_dot	0.4897
pearson_max	0.8365
spearman_max	0.8376

Semantic Similarity

Dataset: sts-test-64
Evaluated with EmbeddingSimilarityEvaluator

Metric	Value
pearson_cosine	0.8062
spearman_cosine	0.8292
pearson_manhattan	0.8237
spearman_manhattan	0.8244
pearson_euclidean	0.8273
spearman_euclidean	0.827
pearson_dot	0.4318
spearman_dot	0.4325
pearson_max	0.8273
spearman_max	0.8292

Semantic Similarity

Dataset: sts-test-32
Evaluated with EmbeddingSimilarityEvaluator

Metric	Value
pearson_cosine	0.777
spearman_cosine	0.8132
pearson_manhattan	0.8041
spearman_manhattan	0.8084
pearson_euclidean	0.809
spearman_euclidean	0.8126
pearson_dot	0.3722
spearman_dot	0.3636
pearson_max	0.809
spearman_max	0.8132

Semantic Similarity

Dataset: sts-test-16
Evaluated with EmbeddingSimilarityEvaluator

Metric	Value
pearson_cosine	0.7351
spearman_cosine	0.7811
pearson_manhattan	0.7687
spearman_manhattan	0.7767
pearson_euclidean	0.7733
spearman_euclidean	0.7799
pearson_dot	0.2548
spearman_dot	0.2412
pearson_max	0.7733
spearman_max	0.7811

Training Details

Training Dataset

sentence-transformers/stsb

Dataset: sentence-transformers/stsb at ab7a5ac
Size: 5,749 training samples
Columns: sentence1, sentence2, and score
Approximate statistics based on the first 1000 samples:
|         | sentence1 | sentence2 | score |
|---------|-----------|-----------|-------|
| type    | string    | string    | float |
| details | min: 6 tokens, mean: 10.0 tokens, max: 28 tokens | min: 5 tokens, mean: 9.95 tokens, max: 25 tokens | min: 0.0, mean: 0.54, max: 1.0 |

Samples:

sentence1	sentence2	score
`A plane is taking off.`	`An air plane is taking off.`	`1.0`
`A man is playing a large flute.`	`A man is playing a flute.`	`0.76`
`A man is spreading shreded cheese on a pizza.`	`A man is spreading shredded cheese on an uncooked pizza.`	`0.76`

Loss: MatryoshkaLoss with these parameters:

{
    "loss": "CoSENTLoss",
    "matryoshka_dims": [
        768,
        512,
        256,
        128,
        64,
        32,
        16
    ],
    "matryoshka_weights": [
        1,
        1,
        1,
        1,
        1,
        1,
        1
    ],
    "n_dims_per_step": -1
}

Evaluation Dataset

sentence-transformers/stsb

Dataset: sentence-transformers/stsb at ab7a5ac
Size: 1,500 evaluation samples
Columns: sentence1, sentence2, and score
Approximate statistics based on the first 1000 samples:
|         | sentence1 | sentence2 | score |
|---------|-----------|-----------|-------|
| type    | string    | string    | float |
| details | min: 5 tokens, mean: 15.1 tokens, max: 45 tokens | min: 6 tokens, mean: 15.11 tokens, max: 53 tokens | min: 0.0, mean: 0.47, max: 1.0 |

Samples:

sentence1	sentence2	score
`A man with a hard hat is dancing.`	`A man wearing a hard hat is dancing.`	`1.0`
`A young child is riding a horse.`	`A child is riding a horse.`	`0.95`
`A man is feeding a mouse to a snake.`	`The man is feeding a mouse to the snake.`	`1.0`

Loss: MatryoshkaLoss with these parameters:

{
    "loss": "CoSENTLoss",
    "matryoshka_dims": [
        768,
        512,
        256,
        128,
        64,
        32,
        16
    ],
    "matryoshka_weights": [
        1,
        1,
        1,
        1,
        1,
        1,
        1
    ],
    "n_dims_per_step": -1
}

Training Hyperparameters

Non-Default Hyperparameters

eval_strategy: steps
per_device_train_batch_size: 128
per_device_eval_batch_size: 128
num_train_epochs: 4
warmup_ratio: 0.1
bf16: True

All Hyperparameters

Click to expand

overwrite_output_dir: False
do_predict: False
eval_strategy: steps
prediction_loss_only: True
per_device_train_batch_size: 128
per_device_eval_batch_size: 128
per_gpu_train_batch_size: None
per_gpu_eval_batch_size: None
gradient_accumulation_steps: 1
eval_accumulation_steps: None
learning_rate: 5e-05
weight_decay: 0.0
adam_beta1: 0.9
adam_beta2: 0.999
adam_epsilon: 1e-08
max_grad_norm: 1.0
num_train_epochs: 4
max_steps: -1
lr_scheduler_type: linear
lr_scheduler_kwargs: {}
warmup_ratio: 0.1
warmup_steps: 0
log_level: passive
log_level_replica: warning
log_on_each_node: True
logging_nan_inf_filter: True
save_safetensors: True
save_on_each_node: False
save_only_model: False
restore_callback_states_from_checkpoint: False
no_cuda: False
use_cpu: False
use_mps_device: False
seed: 42
data_seed: None
jit_mode_eval: False
use_ipex: False
bf16: True
fp16: False
fp16_opt_level: O1
half_precision_backend: auto
bf16_full_eval: False
fp16_full_eval: False
tf32: None
local_rank: 0
ddp_backend: None
tpu_num_cores: None
tpu_metrics_debug: False
debug: []
dataloader_drop_last: False
dataloader_num_workers: 0
dataloader_prefetch_factor: None
past_index: -1
disable_tqdm: False
remove_unused_columns: True
label_names: None
load_best_model_at_end: False
ignore_data_skip: False
fsdp: []
fsdp_min_num_params: 0
fsdp_config: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
fsdp_transformer_layer_cls_to_wrap: None
accelerator_config: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
deepspeed: None
label_smoothing_factor: 0.0
optim: adamw_torch
optim_args: None
adafactor: False
group_by_length: False
length_column_name: length
ddp_find_unused_parameters: None
ddp_bucket_cap_mb: None
ddp_broadcast_buffers: False
dataloader_pin_memory: True
dataloader_persistent_workers: False
skip_memory_metrics: True
use_legacy_prediction_loop: False
push_to_hub: False
resume_from_checkpoint: None
hub_model_id: None
hub_strategy: every_save
hub_private_repo: False
hub_always_push: False
gradient_checkpointing: False
gradient_checkpointing_kwargs: None
include_inputs_for_metrics: False
eval_do_concat_batches: True
fp16_backend: auto
push_to_hub_model_id: None
push_to_hub_organization: None
mp_parameters:
auto_find_batch_size: False
full_determinism: False
torchdynamo: None
ray_scope: last
ddp_timeout: 1800
torch_compile: False
torch_compile_backend: None
torch_compile_mode: None
dispatch_batches: None
split_batches: None
include_tokens_per_second: False
include_num_input_tokens_seen: False
neftune_noise_alpha: None
optim_target_modules: None
batch_eval_metrics: False
batch_sampler: batch_sampler
multi_dataset_batch_sampler: proportional

Training Logs

Epoch	Step	Training Loss	loss	sts-dev-128_spearman_cosine	sts-dev-16_spearman_cosine	sts-dev-256_spearman_cosine	sts-dev-32_spearman_cosine	sts-dev-512_spearman_cosine	sts-dev-64_spearman_cosine	sts-dev-768_spearman_cosine	sts-test-128_spearman_cosine	sts-test-16_spearman_cosine	sts-test-256_spearman_cosine	sts-test-32_spearman_cosine	sts-test-512_spearman_cosine	sts-test-64_spearman_cosine	sts-test-768_spearman_cosine
2.2222	100	60.4066	60.8718	0.8634	0.7953	0.8685	0.8338	0.8709	0.8541	0.8718	-	-	-	-	-	-	-
4.0	180	-	-	-	-	-	-	-	-	-	0.8376	0.7811	0.8415	0.8132	0.8480	0.8292	0.8474

Framework Versions

Python: 3.10.12
Sentence Transformers: 3.0.0
Transformers: 4.41.1
PyTorch: 2.3.0+cu121
Accelerate: 0.30.1
Datasets: 2.19.1
Tokenizers: 0.19.1

Citation

BibTeX

Sentence Transformers

@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "https://arxiv.org/abs/1908.10084",
}

MatryoshkaLoss

@misc{kusupati2024matryoshka,
    title={Matryoshka Representation Learning}, 
    author={Aditya Kusupati and Gantavya Bhatt and Aniket Rege and Matthew Wallingford and Aditya Sinha and Vivek Ramanujan and William Howard-Snyder and Kaifeng Chen and Sham Kakade and Prateek Jain and Ali Farhadi},
    year={2024},
    eprint={2205.13147},
    archivePrefix={arXiv},
    primaryClass={cs.LG}
}

CoSENTLoss

@online{kexuefm-8847,
    title={CoSENT: A more efficient sentence vector scheme than Sentence-BERT},
    author={Su Jianlin},
    year={2022},
    month={Jan},
    url={https://kexue.fm/archives/8847},
}