How to export to ONNX format?

#12
by nampham1106 - opened

I'm trying to export to ONNX format, but Optimum does not support the new architecture. Please help me.

Alibaba-NLP org

Yes, the new GTE model architecture is not yet supported in Optimum. We currently do not have the bandwidth to push for the integration of the new GTE framework into Optimum, but we will address this issue later.

@thenlper Can you at least ensure that the model can be seamlessly exported using torch.onnx.export? Compatibility with TorchScript (torch.jit.script) would also be super helpful. The GTE family are great models, but they are very difficult to deploy at scale for practitioners.

Just to support @lulmer on this one, @thenlper. I'm loving the GTE models, but there's no point in using them if I can't deploy them into production.

Convert to ONNX:

import os

import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer

# AutoModelForTokenClassification is used so the exported graph also yields per-token
# scores (used below as sparse token weights) in addition to the hidden states.
model_original = AutoModelForTokenClassification.from_pretrained('Alibaba-NLP/gte-multilingual-base', trust_remote_code=True)
tokenizer_original = AutoTokenizer.from_pretrained('Alibaba-NLP/gte-multilingual-base')
model_original.eval()

model_path = "gte-multilingual-base-onnx"
os.makedirs(model_path, exist_ok=True)  # torch.onnx.export does not create the output directory

dummy_model_input = tokenizer_original("This is a test for ONNX Runtime!", return_tensors="pt")
torch.onnx.export(
    model_original,
    tuple(dummy_model_input.values()),
    f"{model_path}/model.onnx",
    input_names=['input_ids', 'attention_mask'],
    output_names=['last_hidden_state'],
    dynamic_axes={'input_ids': {0: 'batch_size', 1: 'sequence'},
                  'attention_mask': {0: 'batch_size', 1: 'sequence'},
                  'last_hidden_state': {0: 'batch_size', 1: 'sequence'}
                  },
    do_constant_folding=True,
    opset_version=14,
)

# let's also save the tokenizer
tokenizer_original.save_pretrained(f"{model_path}/tokenizer")
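
As a quick sanity check before wiring up inference (a minimal sketch, assuming the export above succeeded and the onnx package is installed), you can validate the graph and inspect the exported input/output names:

import onnx
import onnxruntime as ort

onnx_model = onnx.load(f"{model_path}/model.onnx")
onnx.checker.check_model(onnx_model)  # raises if the graph is structurally invalid

sess = ort.InferenceSession(f"{model_path}/model.onnx", providers=["CPUExecutionProvider"])
print([i.name for i in sess.get_inputs()])   # expected: ['input_ids', 'attention_mask']
print([o.name for o in sess.get_outputs()])  # first output is named 'last_hidden_state'; any extra outputs keep auto-generated names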

Now for inference:


import onnxruntime as ort
import numpy as np
from tokenizers import Tokenizer
import os
from typing import Dict, Tuple

class GTEOnnxModel:
    def __init__(self, model_path: str):
        """
        ONNX Runtime wrapper for the exported GTE model.

        Parameters:
        model_path: str - path to the folder containing model.onnx and tokenizer/
        """
        opt = ort.SessionOptions()
        opt.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED
        opt.log_severity_level = 3
        opt.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL

        self.sess = ort.InferenceSession(os.path.join(model_path, "model.onnx"), opt, providers=["CPUExecutionProvider"])
        self.tokenizer = Tokenizer.from_file(os.path.join(model_path, "tokenizer/tokenizer.json"))

    def encode(self, text: str) -> Tuple[np.ndarray, Dict[int, float]]:
        encoded = self.tokenizer.encode(text, add_special_tokens=True)
        encoded_inputs = {'input_ids': encoded.ids, 'attention_mask': encoded.attention_mask}
        # ONNX Runtime expects int64 tensors with a batch dimension
        model_input = {name: np.atleast_2d(np.asarray(value, dtype=np.int64)) for name, value in encoded_inputs.items()}
        outputs = self.sess.run(None, model_input)

        # dense embedding: the CLS token of the hidden states, L2-normalized
        dense = outputs[1][0, 0]
        dense = dense / np.linalg.norm(dense)

        # sparse token weights: per-token scores, skipping the special tokens at both ends
        tk_weights = outputs[0][0, :, 0][1:-1]
        token_weights = {tk_id: tk_weights[i] for i, tk_id in enumerate(encoded_inputs['input_ids'][1:-1])}

        return dense, token_weights



model_onnx = GTEOnnxModel("gte-multilingual-base-onnx")
emb, weights = model_onnx.encode("This is a test for ONNX Runtime!")
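
To make the outputs easier to inspect (a small usage sketch; id_to_token comes from the tokenizers library, everything else is the code above, and the second sentence is just an arbitrary example), you can map the sparse weights back to token strings and compare two normalized dense embeddings with a dot product:

# sparse weights keyed by token string instead of token id
readable_weights = {model_onnx.tokenizer.id_to_token(tk_id): float(w) for tk_id, w in weights.items()}
print(readable_weights)

# cosine similarity of the dense embeddings (both are already L2-normalized)
emb2, _ = model_onnx.encode("Dies ist ein Test für ONNX Runtime!")
print(float(np.dot(emb, emb2)))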
