Upload ONNX weights (+ quantizations) + Transformers.js support
#1
by
Xenova
HF staff
- opened
- README.md +53 -0
- config.json +3 -0
- onnx/text_model.onnx +3 -0
- onnx/text_model_bnb4.onnx +3 -0
- onnx/text_model_fp16.onnx +3 -0
- onnx/text_model_int8.onnx +3 -0
- onnx/text_model_q4.onnx +3 -0
- onnx/text_model_q4f16.onnx +3 -0
- onnx/text_model_quantized.onnx +3 -0
- onnx/text_model_uint8.onnx +3 -0
- onnx/vision_model.onnx +3 -0
- onnx/vision_model_bnb4.onnx +3 -0
- onnx/vision_model_fp16.onnx +3 -0
- onnx/vision_model_int8.onnx +3 -0
- onnx/vision_model_q4.onnx +3 -0
- onnx/vision_model_q4f16.onnx +3 -0
- onnx/vision_model_quantized.onnx +3 -0
- onnx/vision_model_uint8.onnx +3 -0
- preprocessor_config.json +23 -0
- tokenizer_config.json +1 -1
README.md
CHANGED
@@ -5,6 +5,7 @@ tags:
|
|
5 |
- fashion
|
6 |
- multimodal retrieval
|
7 |
- siglip
|
|
|
8 |
library_name: open_clip
|
9 |
pipeline_tag: zero-shot-image-classification
|
10 |
license: apache-2.0
|
@@ -25,6 +26,9 @@ The model was fine-tuned from ViT-B-16-SigLIP (webli).
|
|
25 |
|
26 |
|
27 |
## Usage
|
|
|
|
|
|
|
28 |
The model can be seamlessly used with [OpenCLIP](https://github.com/mlfoundations/open_clip) by
|
29 |
|
30 |
```python
|
@@ -49,6 +53,55 @@ with torch.no_grad(), torch.cuda.amp.autocast():
|
|
49 |
print("Label probs:", text_probs)
|
50 |
```
|
51 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
52 |
## Benchmark Results
|
53 |
Average evaluation results on 6 public multimodal fashion datasets ([Atlas](https://huggingface.co/datasets/Marqo/atlas), [DeepFashion (In-shop)](https://huggingface.co/datasets/Marqo/deepfashion-inshop), [DeepFashion (Multimodal)](https://huggingface.co/datasets/Marqo/deepfashion-multimodal), [Fashion200k](https://huggingface.co/datasets/Marqo/fashion200k), [KAGL](https://huggingface.co/datasets/Marqo/KAGL), and [Polyvore](https://huggingface.co/datasets/Marqo/polyvore)) are reported below:
|
54 |
|
|
|
5 |
- fashion
|
6 |
- multimodal retrieval
|
7 |
- siglip
|
8 |
+
- transformers.js
|
9 |
library_name: open_clip
|
10 |
pipeline_tag: zero-shot-image-classification
|
11 |
license: apache-2.0
|
|
|
26 |
|
27 |
|
28 |
## Usage
|
29 |
+
|
30 |
+
### OpenCLIP
|
31 |
+
|
32 |
The model can be seamlessly used with [OpenCLIP](https://github.com/mlfoundations/open_clip) by
|
33 |
|
34 |
```python
|
|
|
53 |
print("Label probs:", text_probs)
|
54 |
```
|
55 |
|
56 |
+
### Transformers.js
|
57 |
+
|
58 |
+
You can also run the model in JavaScript with the [Transformers.js](https://huggingface.co/docs/transformers.js) library.
|
59 |
+
|
60 |
+
First, install it from [NPM](https://www.npmjs.com/package/@huggingface/transformers) using:
|
61 |
+
|
62 |
+
```bash
|
63 |
+
npm i @huggingface/transformers
|
64 |
+
```
|
65 |
+
|
66 |
+
Then, compute embeddings as follows:
|
67 |
+
```js
|
68 |
+
import { SiglipTextModel, SiglipVisionModel, AutoTokenizer, AutoProcessor, RawImage, softmax, dot } from '@huggingface/transformers';
|
69 |
+
|
70 |
+
const model_id = 'Marqo/marqo-fashionSigLIP';
|
71 |
+
|
72 |
+
// Load tokenizer and text model
|
73 |
+
const tokenizer = await AutoTokenizer.from_pretrained(model_id);
|
74 |
+
const text_model = await SiglipTextModel.from_pretrained(model_id);
|
75 |
+
|
76 |
+
// Load processor and vision model
|
77 |
+
const processor = await AutoProcessor.from_pretrained(model_id);
|
78 |
+
const vision_model = await SiglipVisionModel.from_pretrained(model_id);
|
79 |
+
|
80 |
+
// Run tokenization
|
81 |
+
const texts = ['a hat', 'a t-shirt', 'shoes'];
|
82 |
+
const text_inputs = tokenizer(texts, { padding: 'max_length', truncation: true });
|
83 |
+
|
84 |
+
// Compute text embeddings
|
85 |
+
const { text_embeds } = await text_model(text_inputs);
|
86 |
+
|
87 |
+
// Read image and run processor
|
88 |
+
const image = await RawImage.read('https://raw.githubusercontent.com/marqo-ai/marqo-FashionCLIP/main/docs/fashion-hippo.png');
|
89 |
+
const image_inputs = await processor(image);
|
90 |
+
|
91 |
+
// Compute vision embeddings
|
92 |
+
const { image_embeds } = await vision_model(image_inputs);
|
93 |
+
|
94 |
+
// Compute similarity scores
|
95 |
+
const normalized_text_embeds = text_embeds.normalize().tolist();
|
96 |
+
const normalized_image_embeds = image_embeds.normalize().tolist()[0];
|
97 |
+
|
98 |
+
const text_probs = softmax(normalized_text_embeds.map((text_embed) =>
|
99 |
+
100.0 * dot(normalized_image_embeds, text_embed)
|
100 |
+
));
|
101 |
+
console.log(text_probs);
|
102 |
+
// [0.9860219105287394, 0.00777916527489097, 0.006198924196369721]
|
103 |
+
```
|
104 |
+
|
105 |
## Benchmark Results
|
106 |
Average evaluation results on 6 public multimodal fashion datasets ([Atlas](https://huggingface.co/datasets/Marqo/atlas), [DeepFashion (In-shop)](https://huggingface.co/datasets/Marqo/deepfashion-inshop), [DeepFashion (Multimodal)](https://huggingface.co/datasets/Marqo/deepfashion-multimodal), [Fashion200k](https://huggingface.co/datasets/Marqo/fashion200k), [KAGL](https://huggingface.co/datasets/Marqo/KAGL), and [Polyvore](https://huggingface.co/datasets/Marqo/polyvore)) are reported below:
|
107 |
|
config.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"model_type": "siglip"
|
3 |
+
}
|
onnx/text_model.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c1d501b23bddf27ba828c037b0780e44fdf47ca4c0b925ef190ab5bcf7aaf6e6
|
3 |
+
size 441361402
|
onnx/text_model_bnb4.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4be58194517cdafe1695c2675259ffdba8871b77f8c5828cd45598177453f5d5
|
3 |
+
size 173734396
|
onnx/text_model_fp16.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0f5fcccd805e8910663dcbd6821c3cbe040bbe508c656964de08736272228806
|
3 |
+
size 220817780
|
onnx/text_model_int8.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f5a7995e029c6ee9346fa46857661c4a171d28b164fa7703b85a527d73adf170
|
3 |
+
size 111125229
|
onnx/text_model_q4.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1f62fd2b046d82ec1ed91881fe9f4f0b34d4e11bae539a33092712031ffee129
|
3 |
+
size 178600156
|
onnx/text_model_q4f16.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e48e36d0acdc1579aac8edb3c593a3f2ef9d55f7646b9e3acd7baf8f00d7d0ce
|
3 |
+
size 108904023
|
onnx/text_model_quantized.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f5a7995e029c6ee9346fa46857661c4a171d28b164fa7703b85a527d73adf170
|
3 |
+
size 111125229
|
onnx/text_model_uint8.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9e45a29f61825b0fdc4ab2648c84791d6af1b63fe86ce7bd7c7fee43fc3b1c4d
|
3 |
+
size 111125261
|
onnx/vision_model.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a7e773846b27a699c45ba7e3978514b7fca420662d7e69e3b9226982f09f4a13
|
3 |
+
size 371715502
|
onnx/vision_model_bnb4.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a7724073a1af61260bc4d250492ff1d2bfbf2e4d4e171e90c5044186e2198948
|
3 |
+
size 55430656
|
onnx/vision_model_fp16.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a6d3e644416e543c62344c06f7dc8be5687633901a2931faccbe28688e30737e
|
3 |
+
size 185947013
|
onnx/vision_model_int8.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:dae6e934667af50590c3d0ca69c7b14edc71f4df1b4f670e5ba6bc623495b691
|
3 |
+
size 93973410
|
onnx/vision_model_q4.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a864016eff2d8829ec6088b1593c3c9e75a70e0413e0f02d4a4bbfddc3ef89d3
|
3 |
+
size 61181030
|
onnx/vision_model_q4f16.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:988c55937d35228f860bcc90742940ec3efe8d00054518249f6c82b66e1b4a7c
|
3 |
+
size 53686874
|
onnx/vision_model_quantized.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f1009cfce0eedd409f601e8351eedab72e8529641105eee6517821a9a634a2f4
|
3 |
+
size 93973443
|
onnx/vision_model_uint8.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f1009cfce0eedd409f601e8351eedab72e8529641105eee6517821a9a634a2f4
|
3 |
+
size 93973443
|
preprocessor_config.json
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"do_normalize": true,
|
3 |
+
"do_rescale": true,
|
4 |
+
"do_resize": true,
|
5 |
+
"image_processor_type": "SiglipImageProcessor",
|
6 |
+
"image_mean": [
|
7 |
+
0.5,
|
8 |
+
0.5,
|
9 |
+
0.5
|
10 |
+
],
|
11 |
+
"processor_class": "SiglipProcessor",
|
12 |
+
"resample": 3,
|
13 |
+
"rescale_factor": 0.00392156862745098,
|
14 |
+
"size": {
|
15 |
+
"height": 224,
|
16 |
+
"width": 224
|
17 |
+
},
|
18 |
+
"image_std": [
|
19 |
+
0.5,
|
20 |
+
0.5,
|
21 |
+
0.5
|
22 |
+
]
|
23 |
+
}
|
tokenizer_config.json
CHANGED
@@ -931,7 +931,7 @@
|
|
931 |
"eos_token": "</s>",
|
932 |
"extra_ids": 100,
|
933 |
"legacy": false,
|
934 |
-
"model_max_length": 1000000000000000019884624838656,
|
935 |
"pad_token": "</s>",
|
936 |
"sp_model_kwargs": {},
|
937 |
"tokenizer_class": "T5Tokenizer",
|
|
|
931 |
"eos_token": "</s>",
|
932 |
"extra_ids": 100,
|
933 |
"legacy": false,
|
934 |
+
"model_max_length": 64,
|
935 |
"pad_token": "</s>",
|
936 |
"sp_model_kwargs": {},
|
937 |
"tokenizer_class": "T5Tokenizer",
|