Xenova (HF staff) committed on
Commit 6afabdc
1 Parent(s): c322a78

Add transformers.js tag + sample code

Files changed (1)
  1. README.md +39 -0
README.md CHANGED
@@ -5,6 +5,7 @@ tags:
 - mteb
 - clip
 - vision
+ - transformers.js
 language: en
 inference: false
 license: apache-2.0
@@ -77,6 +78,44 @@ print(cos_sim(text_embeddings[1], image_embeddings[0])) # text-image cross-modal
 print(cos_sim(text_embeddings[1], image_embeddings[1])) # text-image cross-modal similarity
 ```

+ 3. JavaScript developers can use Jina CLIP via the [Transformers.js](https://huggingface.co/docs/transformers.js) library. Note that to use this model, you need to install Transformers.js [v3](https://github.com/xenova/transformers.js/tree/v3) from source using `npm install xenova/transformers.js#v3`.
+
+ ```js
+ import { AutoTokenizer, CLIPTextModelWithProjection, AutoProcessor, CLIPVisionModelWithProjection, RawImage, cos_sim } from '@xenova/transformers';
+
+ // Load tokenizer and text model
+ const tokenizer = await AutoTokenizer.from_pretrained('jinaai/jina-clip-v1');
+ const text_model = await CLIPTextModelWithProjection.from_pretrained('jinaai/jina-clip-v1');
+
+ // Load processor and vision model
+ const processor = await AutoProcessor.from_pretrained('Xenova/clip-vit-base-patch32');
+ const vision_model = await CLIPVisionModelWithProjection.from_pretrained('jinaai/jina-clip-v1');
+
+ // Run tokenization
+ const texts = ['Bridge close-shot', 'Bridge in far away'];
+ const text_inputs = tokenizer(texts, { padding: true, truncation: true });
+
+ // Compute text embeddings
+ const { text_embeds } = await text_model(text_inputs);
+
+ // Read images and run processor
+ const urls = [
+     'https://fastly.picsum.photos/id/74/4288/2848.jpg?hmac=q02MzzHG23nkhJYRXR-_RgKTr6fpfwRgcXgE0EKvNB8',
+     'https://fastly.picsum.photos/id/84/1280/848.jpg?hmac=YFRYDI4UsfbeTzI8ZakNOR98wVU7a-9a2tGF542539s',
+ ];
+ const image = await Promise.all(urls.map(url => RawImage.read(url)));
+ const image_inputs = await processor(image);
+
+ // Compute vision embeddings
+ const { image_embeds } = await vision_model(image_inputs);
+
+ // Compute similarities
+ console.log(cos_sim(text_embeds[0].data, text_embeds[1].data)) // text embedding similarity
+ console.log(cos_sim(text_embeds[0].data, image_embeds[0].data)) // text-image cross-modal similarity
+ console.log(cos_sim(text_embeds[0].data, image_embeds[1].data)) // text-image cross-modal similarity
+ console.log(cos_sim(text_embeds[1].data, image_embeds[0].data)) // text-image cross-modal similarity
+ console.log(cos_sim(text_embeds[1].data, image_embeds[1].data)) // text-image cross-modal similarity
+ ```

 ## Performance
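For readers trying out the snippet added in this commit, here is a small follow-on sketch. It is not part of the diff above; it assumes the committed example runs as an ES-module script and simply reuses the `texts`, `urls`, `text_embeds`, `image_embeds`, and `cos_sim` values defined there to rank the two images for each query text by cosine similarity.

```js
// Follow-on sketch (append to the committed example above): reuses `texts`,
// `urls`, `text_embeds`, `image_embeds`, and `cos_sim` from that script.
for (let t = 0; t < texts.length; ++t) {
    // Score every image against the current query text.
    const ranked = urls
        .map((url, i) => ({ url, score: cos_sim(text_embeds[t].data, image_embeds[i].data) }))
        .sort((a, b) => b.score - a.score); // highest similarity first
    console.log(`Query: "${texts[t]}"`);
    for (const { url, score } of ranked) {
        console.log(`  ${score.toFixed(4)}  ${url}`);
    }
}
```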