metadata
license: apache-2.0
library_name: transformers.js
base_model: Qwen/Qwen2-VL-2B-Instruct
https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct with ONNX weights to be compatible with Transformers.js.
Usage (Transformers.js)
If you haven't already, you can install the Transformers.js JavaScript library from NPM using:
npm i @huggingface/transformers
Example: Image+text to text
import { AutoProcessor, Qwen2VLForConditionalGeneration, RawImage } from "@huggingface/transformers";
// Load processor and model
const model_id = "onnx-community/Qwen2-VL-2B-Instruct";
const processor = await AutoProcessor.from_pretrained(model_id);
const model = await Qwen2VLForConditionalGeneration.from_pretrained(model_id);
// Prepare inputs
const url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg";
const image = await (await RawImage.read(url)).resize(448, 448);
const conversation = [
{
role: "user",
content: [
{ type: "image" },
{ type: "text", text: "Describe this image." },
],
},
];
const text = processor.apply_chat_template(conversation, { add_generation_prompt: true });
const inputs = await processor(text, image);
// Perform inference
const outputs = await model.generate({
...inputs,
max_new_tokens: 128,
});
// Decode output
const decoded = processor.batch_decode(
outputs.slice(null, [inputs.input_ids.dims.at(-1), null]),
{ skip_special_tokens: true },
);
console.log(decoded[0]);
// The image depicts a serene beach scene with a woman and a dog. The woman is sitting on the sand, wearing a plaid shirt, and appears to be engaged in a playful interaction with the dog. The dog, which is a large breed, is sitting on its hind legs and appears to be reaching out to the woman, possibly to give her a high-five or a paw. The background shows the ocean with gentle waves, and the sky is clear, suggesting it might be either sunrise or sunset. The overall atmosphere is calm and relaxed, capturing a moment of connection between the woman and the dog.
ONNX conversion script:
First, install the following dependencies:
pip install --upgrade git+https://github.com/huggingface/transformers.git onnx==1.17.0 onnxruntime==1.20.1 optimum==1.23.3 onnxslim==0.1.42
import os
import torch
from transformers import (
AutoProcessor,
Qwen2VLForConditionalGeneration,
DynamicCache,
)
class PatchedQwen2VLForConditionalGeneration(Qwen2VLForConditionalGeneration):
def forward(self, *args):
inputs_embeds, attention_mask, position_ids, *past_key_values_args = args
# Convert past_key_values list to DynamicCache
if len(past_key_values_args) == 0:
past_key_values = None
else:
past_key_values = DynamicCache(self.config.num_hidden_layers)
for i in range(self.config.num_hidden_layers):
key = past_key_values_args.pop(0)
value = past_key_values_args.pop(0)
past_key_values.update(key_states=key, value_states=value, layer_idx=i)
o = super().forward(
inputs_embeds=inputs_embeds,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_values=past_key_values,
)
flattened_past_key_values_outputs = {
"logits": o.logits,
}
output_past_key_values: DynamicCache = o.past_key_values
for i, (key, value) in enumerate(
zip(output_past_key_values.key_cache, output_past_key_values.value_cache)
):
flattened_past_key_values_outputs[f"present.{i}.key"] = key
flattened_past_key_values_outputs[f"present.{i}.value"] = value
return flattened_past_key_values_outputs
# Constants
OUTPUT_FOLDER = "output"
EMBEDDING_MODEL_NAME = "embed_tokens.onnx"
TEXT_MODEL_NAME = "decoder_model_merged.onnx"
VISION_MODEL_NAME = "vision_encoder.onnx"
TEMP_MODEL_OUTPUT_FOLDER = os.path.join(OUTPUT_FOLDER, "temp")
FINAL_MODEL_OUTPUT_FOLDER = os.path.join(OUTPUT_FOLDER, "onnx")
# Load model and processor
model_id = "Qwen/Qwen2-VL-2B-Instruct"
model = PatchedQwen2VLForConditionalGeneration.from_pretrained(model_id).eval()
processor = AutoProcessor.from_pretrained(model_id)
# Save model configs and processor
model.config.save_pretrained(OUTPUT_FOLDER)
model.generation_config.save_pretrained(OUTPUT_FOLDER)
processor.save_pretrained(OUTPUT_FOLDER)
os.makedirs(TEMP_MODEL_OUTPUT_FOLDER, exist_ok=True)
# Configuration values
## Text model
text_config = model.config
num_heads = text_config.num_attention_heads
num_key_value_heads = text_config.num_key_value_heads
head_dim = text_config.hidden_size // num_heads
num_layers = text_config.num_hidden_layers
hidden_size = text_config.hidden_size
## Vision model
vision_config = model.config.vision_config
channel = vision_config.in_chans
temporal_patch_size = vision_config.temporal_patch_size
patch_size = vision_config.spatial_patch_size
# Dummy input sizes
grid_t, grid_h, grid_w = [1, 16, 16]
batch_size = 1
sequence_length = 16
num_channels = 3
past_sequence_length = 0
image_batch_size = 1 # TODO: Add support for > 1 images
assert image_batch_size == 1
# Dummy inputs
## Embedding inputs
input_ids = torch.randint(
0, model.config.vocab_size, (batch_size, sequence_length), dtype=torch.int64
)
## Text inputs
dummy_past_key_values_kwargs = {
f"past_key_values.{i}.{key}": torch.zeros(
batch_size,
num_key_value_heads,
past_sequence_length,
head_dim,
dtype=torch.float32,
)
for i in range(num_layers)
for key in ["key", "value"]
}
inputs_embeds = torch.ones(
batch_size, sequence_length, hidden_size, dtype=torch.float32
)
attention_mask = torch.ones(batch_size, sequence_length, dtype=torch.int64)
position_ids = torch.ones(3, batch_size, sequence_length, dtype=torch.int64)
## Vision inputs
grid_thw = torch.tensor(
[[grid_t, grid_h, grid_w]] * image_batch_size, dtype=torch.int64
)
pixel_values = torch.randn(
image_batch_size * grid_t * grid_h * grid_w,
channel * temporal_patch_size * patch_size * patch_size,
dtype=torch.float32,
)
# ONNX Exports
## Embedding model
embedding_inputs = dict(input_ids=input_ids)
embedding_inputs_positional = tuple(embedding_inputs.values())
model.model.embed_tokens(*embedding_inputs_positional) # Test forward pass
EMBED_TOKENS_OUTPUT_PATH = os.path.join(TEMP_MODEL_OUTPUT_FOLDER, EMBEDDING_MODEL_NAME)
torch.onnx.export(
model.model.embed_tokens,
args=embedding_inputs_positional,
f=EMBED_TOKENS_OUTPUT_PATH,
export_params=True,
opset_version=14,
do_constant_folding=True,
input_names=list(embedding_inputs.keys()),
output_names=["inputs_embeds"],
dynamic_axes={
"input_ids": {0: "batch_size", 1: "sequence_length"},
"inputs_embeds": {0: "batch_size", 1: "sequence_length"},
},
)
## Text model
text_inputs = dict(
inputs_embeds=inputs_embeds,
attention_mask=attention_mask,
position_ids=position_ids,
**dummy_past_key_values_kwargs,
)
text_inputs_positional = tuple(text_inputs.values())
text_outputs = model.forward(*text_inputs_positional) # Test forward pass
TEXT_MODEL_OUTPUT_PATH=os.path.join(TEMP_MODEL_OUTPUT_FOLDER, TEXT_MODEL_NAME)
torch.onnx.export(
model,
args=text_inputs_positional,
f=TEXT_MODEL_OUTPUT_PATH,
export_params=True,
opset_version=14,
do_constant_folding=True,
input_names=list(text_inputs.keys()),
output_names=["logits"]
+ [f"present.{i}.{key}" for i in range(num_layers) for key in ["key", "value"]],
dynamic_axes={
"inputs_embeds": {0: "batch_size", 1: "sequence_length"},
"attention_mask": {0: "batch_size", 1: "sequence_length"},
"position_ids": {1: "batch_size", 2: "sequence_length"},
**{
f"past_key_values.{i}.{key}": {0: "batch_size", 2: "past_sequence_length"}
for i in range(num_layers)
for key in ["key", "value"]
},
"logits": {0: "batch_size", 1: "sequence_length"},
**{
f"present.{i}.{key}": {0: "batch_size", 2: "past_sequence_length + 1"}
for i in range(num_layers)
for key in ["key", "value"]
},
},
)
## Vision model
vision_inputs = dict(
pixel_values=pixel_values,
grid_thw=grid_thw,
)
vision_inputs_positional = tuple(vision_inputs.values())
vision_outputs = model.visual.forward(*vision_inputs_positional) # Test forward pass
VISION_ENCODER_OUTPUT_PATH = os.path.join(TEMP_MODEL_OUTPUT_FOLDER, VISION_MODEL_NAME)
torch.onnx.export(
model.visual,
args=vision_inputs_positional,
f=VISION_ENCODER_OUTPUT_PATH,
export_params=True,
opset_version=14,
do_constant_folding=True,
input_names=list(vision_inputs.keys()),
output_names=["image_features"],
dynamic_axes={
"pixel_values": {
0: "batch_size * grid_t * grid_h * grid_w",
1: "channel * temporal_patch_size * patch_size * patch_size",
},
"grid_thw": {0: "batch_size"},
"image_features": {0: "batch_size * grid_t * grid_h * grid_w"},
},
)
# Post-processing
import onnx
import onnxslim
from optimum.onnx.graph_transformations import check_and_save_model
os.makedirs(FINAL_MODEL_OUTPUT_FOLDER, exist_ok=True)
for name in (EMBEDDING_MODEL_NAME, TEXT_MODEL_NAME, VISION_MODEL_NAME):
temp_model_path = os.path.join(TEMP_MODEL_OUTPUT_FOLDER, name)
## Shape inference (especially needed by the vision encoder)
onnx.shape_inference.infer_shapes_path(temp_model_path, check_type=True, strict_mode=True)
## Attempt to optimize the model with onnxslim
try:
model = onnxslim.slim(temp_model_path)
except Exception as e:
print(f"Failed to slim {model}: {e}")
model = onnx.load(temp_model_path)
## Save model
final_model_path = os.path.join(FINAL_MODEL_OUTPUT_FOLDER, name)
check_and_save_model(model, final_model_path)
## Cleanup
import shutil
shutil.rmtree(TEMP_MODEL_OUTPUT_FOLDER)