chs20 committed
Commit 737e332
1 Parent(s): 7716053
README.md ADDED
@@ -0,0 +1,42 @@
+
+ ---
+ license: mit
+ library_name: open_clip
+ pipeline_tag: zero-shot-image-classification
+ ---
+ [[Paper]](https://openreview.net/forum?id=e3scLKNiNg&noteId=e3scLKNiNg) [[GitHub]](https://github.com/fra31/perceptual-metrics)
+
+ A robust perceptual metric based on the CLIP model `laion/CLIP-convnext_base_w-laion2B-s13B-b82K-augreg`.
+
+ Adversarially fine-tuned with FARE ([Schlarmann et al., 2024](https://arxiv.org/abs/2402.12336)) on ImageNet in the L-infinity norm with radius 4/255.
+
+ Performance on the perceptual similarity task [NIGHTS](https://dreamsim-nights.github.io):
+
+ | Clean | L-inf, eps=4/255 | L2, eps=3 |
+ |-------|------------------|-----------|
+ | 90.6  | 74.3             | 66.1      |
+
+ ## Usage
+ ```python
+ import open_clip
+
+ model, _, image_processor = open_clip.create_model_and_transforms('hf-hub:chs20/FARE4-convnext_base_w-laion2B-s13B-b82K-augreg')
+ ```
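A minimal end-to-end sketch of using the model as a perceptual metric: embed two images and compare them by cosine similarity, the usual choice for CLIP-embedding metrics. The image paths are placeholders; only the `create_model_and_transforms` call above is taken from this repo.

```python
# Sketch: perceptual similarity via cosine similarity of image embeddings.
# 'image_a.png' and 'image_b.png' are placeholder paths.
import open_clip
import torch
from PIL import Image

model, _, image_processor = open_clip.create_model_and_transforms(
    'hf-hub:chs20/FARE4-convnext_base_w-laion2B-s13B-b82K-augreg'
)
model.eval()

img_a = image_processor(Image.open('image_a.png').convert('RGB')).unsqueeze(0)
img_b = image_processor(Image.open('image_b.png').convert('RGB')).unsqueeze(0)

with torch.no_grad():
    emb_a = model.encode_image(img_a)
    emb_b = model.encode_image(img_b)

# Higher similarity = perceptually closer; 1 - similarity can serve as a distance.
similarity = torch.nn.functional.cosine_similarity(emb_a, emb_b).item()
print(f'similarity: {similarity:.4f}')
```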
+
+ ## Citation
+ If you find this model useful, please consider citing our papers:
+ ```bibtex
+ @inproceedings{croce2024adversarially,
+   title={Adversarially Robust CLIP Models Induce Better (Robust) Perceptual Metrics},
+   author={Croce, Francesco and Schlarmann, Christian and Singh, Naman Deep and Hein, Matthias},
+   year={2024},
+   booktitle={{ICML Workshop on Foundation Models in the Wild}}
+ }
+ ```
+
+ ```bibtex
+ @inproceedings{schlarmann2024robustclip,
+   title={Robust CLIP: Unsupervised Adversarial Fine-Tuning of Vision Embeddings for Robust Large Vision-Language Models},
+   author={Schlarmann, Christian and Singh, Naman Deep and Croce, Francesco and Hein, Matthias},
+   year={2024},
+   booktitle={{ICML}}
+ }
+ ```
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
open_clip_config.json ADDED
@@ -0,0 +1,35 @@
+ {
+   "model_cfg": {
+     "embed_dim": 640,
+     "vision_cfg": {
+       "timm_model_name": "convnext_base",
+       "timm_model_pretrained": false,
+       "timm_pool": "",
+       "timm_proj": "linear",
+       "timm_drop": 0.0,
+       "timm_drop_path": 0.1,
+       "image_size": 256
+     },
+     "text_cfg": {
+       "context_length": 77,
+       "vocab_size": 49408,
+       "width": 640,
+       "heads": 10,
+       "layers": 12
+     }
+   },
+   "preprocess_cfg": {
+     "mean": [
+       0.48145466,
+       0.4578275,
+       0.40821073
+     ],
+     "std": [
+       0.26862954,
+       0.26130258,
+       0.27577711
+     ],
+     "interpolation": "bicubic",
+     "resize_mode": "shortest"
+   }
+ }
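The `preprocess_cfg` block corresponds to a standard CLIP-style evaluation transform. A rough torchvision equivalent, as an illustration only (open_clip builds the actual pipeline internally from this config):

```python
# Approximate torchvision reading of preprocess_cfg (an assumption, not
# open_clip's actual implementation): resize the shortest side to 256 with
# bicubic interpolation, center-crop to 256x256, then normalize.
from torchvision import transforms

preprocess = transforms.Compose([
    transforms.Resize(256, interpolation=transforms.InterpolationMode.BICUBIC),
    transforms.CenterCrop(256),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.48145466, 0.4578275, 0.40821073),
                         std=(0.26862954, 0.26130258, 0.27577711)),
])
```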
open_clip_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:26360ef11b916ec985996655773a271f6d4a28e417aed4aa212cdf8b671194ba
+ size 717597012
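The pointer records the blob's SHA-256 and byte size, so a downloaded copy can be integrity-checked. A small sketch, assuming the file has been fetched to the working directory:

```python
# Verify a downloaded weight file against the LFS pointer's oid and size.
import hashlib
import os

path = 'open_clip_model.safetensors'  # placeholder path to the downloaded file
h = hashlib.sha256()
with open(path, 'rb') as f:
    for chunk in iter(lambda: f.read(1 << 20), b''):  # hash in 1 MiB chunks
        h.update(chunk)
assert os.path.getsize(path) == 717597012
assert h.hexdigest() == '26360ef11b916ec985996655773a271f6d4a28e417aed4aa212cdf8b671194ba'
```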
open_clip_pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0eb837189a0d74c3c484a7b6eda776cd6734cc0b64f6bd3cd3c7f5fedb62ccec
+ size 717742056
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "bos_token": {
+     "content": "<|startoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "49406": {
+       "content": "<|startoftext|>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "49407": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<|startoftext|>",
+   "clean_up_tokenization_spaces": true,
+   "do_lower_case": true,
+   "eos_token": "<|endoftext|>",
+   "errors": "replace",
+   "model_max_length": 77,
+   "pad_token": "<|endoftext|>",
+   "tokenizer_class": "CLIPTokenizer",
+   "unk_token": "<|endoftext|>"
+ }
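These files configure the text tower's `CLIPTokenizer` (BPE vocabulary of 49408 tokens, 77-token context). A brief sketch using open_clip's tokenizer API with the hub id from the README:

```python
# Sketch: tokenize text with the tokenizer files above via open_clip.
import open_clip

tokenizer = open_clip.get_tokenizer('hf-hub:chs20/FARE4-convnext_base_w-laion2B-s13B-b82K-augreg')
tokens = tokenizer(['a photo of a cat'])
print(tokens.shape)  # (1, 77): padded to the context_length in text_cfg
```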
vocab.json ADDED
The diff for this file is too large to render. See raw diff