first update
- README.md +65 -0
- added_tokens.json +7 -0
- open_clip_config.json +41 -0
- open_clip_pytorch_model.bin +3 -0
- special_tokens_map.json +7 -0
- tokenizer.json +0 -0
- tokenizer_config.json +56 -0
- vocab.txt +0 -0
README.md
ADDED
@@ -0,0 +1,65 @@
+---
+tags:
+- clip
+library_name: open_clip
+pipeline_tag: zero-shot-image-classification
+license: cc-by-4.0
+datasets:
+- UCSC-VLAA/Recap-DataComp-1B
+---
+# Model card for ViT-L-16-HTxt-Recap-CLIP
+
+A CLIP model trained on Recap-DataComp-1B, a recaptioned version of DataComp-1B with captions generated by LLaVA-1.5-LLaMA3-8B.
+
+## Model Details
+- **Model Type:** Contrastive Image-Text, Zero-Shot Image Classification.
+- **Original:** https://github.com/UCSC-VLAA/Recap-DataComp-1B
+- **Dataset:** https://huggingface.co/datasets/UCSC-VLAA/Recap-DataComp-1B
+- **Papers:**
+  - What If We Recaption Billions of Web Images with LLaMA-3?: https://arxiv.org/abs/2406.08478
+
+## Model Usage
+### With OpenCLIP
+```python
+import torch
+import torch.nn.functional as F
+from urllib.request import urlopen
+from PIL import Image
+from open_clip import create_model_from_pretrained, get_tokenizer
+
+model, preprocess = create_model_from_pretrained('hf-hub:UCSC-VLAA/ViT-L-16-HTxt-Recap-CLIP')
+tokenizer = get_tokenizer('hf-hub:UCSC-VLAA/ViT-L-16-HTxt-Recap-CLIP')
+
+image = Image.open(urlopen(
+    'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/beignets-task-guide.png'
+))
+image = preprocess(image).unsqueeze(0)
+
+text = tokenizer(["a diagram", "a dog", "a cat", "a beignet"], context_length=model.context_length)
+
+with torch.no_grad(), torch.cuda.amp.autocast():
+    image_features = model.encode_image(image)
+    text_features = model.encode_text(text)
+    image_features = F.normalize(image_features, dim=-1)
+    text_features = F.normalize(text_features, dim=-1)
+
+text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
+
+print("Label probs:", text_probs)  # prints: [[0., 0., 0., 1.0]]
+```
+
+## Bias, Risks, and Limitations
+This model was trained on an image-text dataset with captions generated by LLaVA-1.5-LLaMA3-8B, which may still contain biases and inaccuracies inherent in the original web-crawled data.
+Users should be aware of these biases, risks, and limitations when using this model. See the [dataset card](https://huggingface.co/datasets/UCSC-VLAA/Recap-DataComp-1B) for more details.
+
+## Citation
+```bibtex
+@article{li2024recaption,
+  title={What If We Recaption Billions of Web Images with LLaMA-3?},
+  author={Xianhang Li and Haoqin Tu and Mude Hui and Zeyu Wang and Bingchen Zhao and Junfei Xiao and Sucheng Ren and Jieru Mei and Qing Liu and Huangjie Zheng and Yuyin Zhou and Cihang Xie},
+  journal={arXiv preprint arXiv:2406.08478},
+  year={2024}
+}
+
+```
+
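The README snippet prints only the raw probability tensor. As a small follow-on, here is a hedged sketch that continues directly from that snippet (it reuses `model`, `tokenizer`, `image_features`, `F`, and `torch` defined there); the label strings are illustrative placeholders, not part of the original card:

```python
# Continuation of the README usage snippet above: zero-shot scoring with custom labels.
# The labels below are illustrative, not part of the model card.
labels = ["a photo of a beignet", "a photo of a bagel", "a photo of a donut"]

with torch.no_grad(), torch.cuda.amp.autocast():
    text = tokenizer(labels, context_length=model.context_length)
    label_features = F.normalize(model.encode_text(text), dim=-1)

scores = (100.0 * image_features @ label_features.T).softmax(dim=-1)
for label, p in zip(labels, scores.squeeze(0).tolist()):
    print(f"{label}: {p:.3f}")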
added_tokens.json
ADDED
@@ -0,0 +1,7 @@
+{
+  "[CLS]": 101,
+  "[MASK]": 103,
+  "[PAD]": 0,
+  "[SEP]": 102,
+  "[UNK]": 100
+}
open_clip_config.json
ADDED
@@ -0,0 +1,41 @@
+{
+  "model_cfg": {
+    "embed_dim": 768,
+    "vision_cfg": {
+      "image_size": 224,
+      "layers": 24,
+      "width": 1024,
+      "patch_size": 16,
+      "no_ln_pre": true,
+      "pool_type": "avg",
+      "final_ln_after_pool": true
+    },
+    "text_cfg": {
+      "context_length": 128,
+      "vocab_size": 32000,
+      "hf_tokenizer_name": "bert-base-uncased",
+      "tokenizer_kwargs": {
+        "strip_sep_token": true
+      },
+      "width": 1024,
+      "heads": 16,
+      "layers": 24,
+      "pool_type": "last",
+      "no_causal_mask": true
+    }
+  },
+  "preprocess_cfg": {
+    "mean": [
+      0.485,
+      0.456,
+      0.406
+    ],
+    "std": [
+      0.229,
+      0.224,
+      0.225
+    ],
+    "interpolation": "bilinear",
+    "resize_mode": "squash"
+  }
+}
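The `preprocess_cfg` block above describes the image preprocessing the model expects: a squash (non-aspect-preserving) resize to 224×224 with bilinear interpolation, followed by normalization with ImageNet mean/std. As a rough illustration, here is a minimal torchvision sketch of an equivalent pipeline, assuming torchvision is installed; the `preprocess` transform returned by `create_model_from_pretrained` is the authoritative one.

```python
# Minimal sketch of the preprocessing implied by preprocess_cfg above.
# Assumption: open_clip's actual transform is equivalent; prefer the one it returns.
from PIL import Image
from torchvision import transforms

preprocess_sketch = transforms.Compose([
    # "resize_mode": "squash" -> resize both sides to image_size, ignoring aspect ratio
    transforms.Resize((224, 224), interpolation=transforms.InterpolationMode.BILINEAR),
    transforms.ToTensor(),
    # mean/std from preprocess_cfg (ImageNet statistics)
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

img = Image.new("RGB", (640, 480))       # stand-in image for illustration
x = preprocess_sketch(img).unsqueeze(0)  # shape: [1, 3, 224, 224]
```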
open_clip_pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a464d3487e29707d9240f30e1ac1dda3f3fdff2ac52dbf69ed3a85f2ca108f79
+size 2560522281
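This file is a Git LFS pointer rather than the weights themselves; the actual ~2.56 GB checkpoint is fetched on download. One way to sanity-check a downloaded copy is to compare its size and SHA-256 against the pointer, as in this sketch; the repo id is taken from the README's usage snippet and is an assumption here.

```python
# Sketch: verify a downloaded checkpoint against the LFS pointer above.
# Assumption: the files live in the repo referenced by the README snippet.
import hashlib
import os
from huggingface_hub import hf_hub_download

path = hf_hub_download("UCSC-VLAA/ViT-L-16-HTxt-Recap-CLIP", "open_clip_pytorch_model.bin")

expected_size = 2560522281
expected_sha256 = "a464d3487e29707d9240f30e1ac1dda3f3fdff2ac52dbf69ed3a85f2ca108f79"

assert os.path.getsize(path) == expected_size, "size mismatch"

h = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        h.update(chunk)

assert h.hexdigest() == expected_sha256, "hash mismatch"
print("checkpoint matches the LFS pointer")
```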
special_tokens_map.json
ADDED
@@ -0,0 +1,7 @@
+{
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}
tokenizer.json
ADDED
The diff for this file is too large to render.
tokenizer_config.json
ADDED
@@ -0,0 +1,56 @@
+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "101": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "102": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "103": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [],
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "[CLS]",
+  "do_lower_case": true,
+  "mask_token": "[MASK]",
+  "model_max_length": 512,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "unk_token": "[UNK]"
+}
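Together with `added_tokens.json`, `special_tokens_map.json`, and `vocab.txt` below, this file configures a standard uncased BERT WordPiece tokenizer for the text tower; the README's `get_tokenizer` call wraps it and applies the `strip_sep_token` option from `open_clip_config.json`. As a minimal sketch, assuming the transformers library is installed and that these files are served from the UCSC-VLAA/ViT-L-16-HTxt-Recap-CLIP repo referenced in the README snippet, the tokenizer can also be loaded directly:

```python
# Sketch: load the tokenizer files from this commit with Hugging Face transformers.
# Assumption: the repo id matches the one used in the README usage snippet.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("UCSC-VLAA/ViT-L-16-HTxt-Recap-CLIP")

enc = tok("a photo of a cat")
print(enc["input_ids"])  # begins with [CLS] (101) and ends with [SEP] (102)
print(tok.pad_token_id, tok.unk_token_id, tok.cls_token_id, tok.sep_token_id, tok.mask_token_id)
# -> 0 100 101 102 103, matching added_tokens.json
```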
vocab.txt
ADDED
The diff for this file is too large to render.