CoreML Conversion of the mxbai-embed-large-v1 sentence embedding model

After extensive testing (and a lot of debugging with ChatGPT), I was able to convert the mxbai-embed-large-v1 model to CoreML and run it mostly on the GPU.

import coremltools as ct
import numpy as np
import torch
from transformers import AutoModel, AutoTokenizer

# Define a wrapper class for the AutoModel to return only the last_hidden_state
class ModelWrapper(torch.nn.Module):
    """Adapter that exposes only the ``last_hidden_state`` of a wrapped model.

    The wrapped model is called with ``input_ids`` and ``attention_mask`` as
    keyword arguments, and the ``last_hidden_state`` attribute of whatever it
    returns is handed back, so the module's output is that single value.
    """

    def __init__(self, model):
        """Store ``model`` as ``self.model``; it is invoked in ``forward``."""
        super().__init__()
        self.model = model

    def forward(self, input_ids, attention_mask):
        """Run the wrapped model and return its ``last_hidden_state``."""
        # Swap in `.pooler_output` here if the pooled output is wanted instead.
        return self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).last_hidden_state

# Load the Hugging Face encoder and its tokenizer. Note that this is the plain
# `transformers` AutoModel, not a `sentence_transformers.SentenceTransformer`.
model_name = "mixedbread-ai/mxbai-embed-large-v1"  # Replace with your model
model = AutoModel.from_pretrained(model_name)
model.eval()
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Wrap the model to return only the tensor output
wrapped_model = ModelWrapper(model)
wrapped_model.eval()

# Sample input to export the model
dummy_input = tokenizer("This is a sample input", return_tensors="pt")

# Trace the model using tensor inputs (input_ids, attention_mask)
traced_model = torch.jit.trace(wrapped_model, (dummy_input['input_ids'], dummy_input['attention_mask']))

# Convert the traced PyTorch model to CoreML using the ML Program format.
# Both inputs are declared as float32 with a flexible sequence length of
# 1..512 tokens, so callers must cast the tokenizer output to float32.
model_from_torch = ct.convert(
    traced_model,
    inputs=[
        ct.TensorType(name="input_ids", shape=(1, ct.RangeDim(1, 512)), dtype=np.float32),
        ct.TensorType(name="attention_mask", shape=(1, ct.RangeDim(1, 512)), dtype=np.float32),
    ],
    # Name the output explicitly: without this the name is derived from the
    # traced graph, and code reading predictions['hidden_states'] would only
    # work by coincidence.
    outputs=[ct.TensorType(name="hidden_states")],
    minimum_deployment_target=ct.target.iOS17,
    convert_to="mlprogram",
    compute_precision=ct.precision.FLOAT16,
)

# Save the CoreML model as an mlpackage
model_from_torch.save("mxbai-embed-large-v1.mlpackage")

The converted model can then be loaded and run like this:

import coremltools as ct
from transformers import AutoTokenizer
import numpy as np

# Open the converted Core ML package from disk
model = ct.models.MLModel("mxbai-embed-large-v1.mlpackage")

# Tokenizer belonging to the original checkpoint
tokenizer = AutoTokenizer.from_pretrained("mixedbread-ai/mxbai-embed-large-v1")

# Tokenize a single sentence into NumPy arrays (at most 512 tokens)
input_text = "This is a test sentence for the CoreML model"
inputs = tokenizer(
    input_text,
    return_tensors="np",
    padding=True,
    truncation=True,
    max_length=512,
)

# The model inputs were declared as float32 at conversion time, so cast both arrays
input_ids, attention_mask = (
    inputs[key].astype(np.float32) for key in ("input_ids", "attention_mask")
)

# Run the prediction with inputs keyed by the names used at conversion
coreml_input = dict(input_ids=input_ids, attention_mask=attention_mask)
predictions = model.predict(coreml_input)

# Index [0, 0] picks the first token (CLS) of the first sequence
hidden_states = predictions["hidden_states"]
cls_embedding = hidden_states[0, 0]

# Turn off NumPy's output truncation so the whole vector is printed
np.set_printoptions(threshold=np.inf)

# Print the CLS token embedding, which is a 1024-dimensional vector
print("CLS Token Embedding:", cls_embedding, len(cls_embedding))

I verified the output with ollama:

# Request an embedding for the same test sentence from an ollama server on
# localhost, to compare against the CLS embedding printed by the CoreML script.
curl http://localhost:11434/api/embeddings -d '{
    "model": "mxbai-embed-large",
        "prompt": "This is a test sentence for the CoreML model"
    }'

Environment: Python 3.11, coremltools 8.0, sentence-transformers 3.1.0, transformers 4.44.2