Update README.md
README.md
@@ -32,15 +32,23 @@ Truncate to 77 tokens
 tensor([[0.16484, 0.0749, 0.1618, 0.0774]], device='cuda:0')
 ```
 #
-# Option 2
+# Option 2, proper integration: RECOMMENDED
+
+- ### Solution for implementation of 248 tokens / thanks [@kk3dmax](https://huggingface.co/zer0int/LongCLIP-GmP-ViT-L-14/discussions/3)
+- Obtain a full example script using this solution for Flux.1 inference on [my GitHub](https://github.com/zer0int/CLIP-txt2img-diffusers-scripts)
+
+```
+model_id = "zer0int/LongCLIP-GmP-ViT-L-14"
+config = CLIPConfig.from_pretrained(model_id)
+config.text_config.max_position_embeddings = 248
+clip_model = CLIPModel.from_pretrained(model_id, torch_dtype=dtype, config=config)
+clip_processor = CLIPProcessor.from_pretrained(model_id, padding="max_length", max_length=248)
+
+pipe.tokenizer = clip_processor.tokenizer  # Replace with the CLIP tokenizer
+pipe.text_encoder = clip_model.text_model  # Replace with the CLIP text encoder
+pipe.tokenizer_max_length = 248
+pipe.text_encoder.dtype = torch.bfloat16
+```
 
 ```
 # Resulting Cosine Similarities for 248 tokens padded:
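
The snippet in the diff assumes `dtype` and `pipe` already exist. Below is a minimal self-contained sketch of the same integration, assuming a Flux.1 pipeline from diffusers, `torch.bfloat16`, a CUDA device, and placeholder prompt/filename; none of these specifics are fixed by the diff itself.

```
import torch
from diffusers import FluxPipeline
from transformers import CLIPConfig, CLIPModel, CLIPProcessor

dtype = torch.bfloat16
model_id = "zer0int/LongCLIP-GmP-ViT-L-14"

# Load LongCLIP with its native 248-token position embeddings.
config = CLIPConfig.from_pretrained(model_id)
config.text_config.max_position_embeddings = 248
clip_model = CLIPModel.from_pretrained(model_id, torch_dtype=dtype, config=config)
clip_processor = CLIPProcessor.from_pretrained(model_id, padding="max_length", max_length=248)

# Assumed target pipeline; any diffusers pipeline with a CLIP text branch works similarly.
pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=dtype)

pipe.tokenizer = clip_processor.tokenizer  # swap in the LongCLIP tokenizer
pipe.text_encoder = clip_model.text_model  # swap in the LongCLIP text encoder
pipe.tokenizer_max_length = 248            # let the pipeline encode up to 248 tokens
pipe.text_encoder.dtype = dtype            # pipelines read .dtype off the encoder module
pipe.to("cuda")

image = pipe("a long, highly detailed prompt of up to 248 tokens",
             num_inference_steps=28).images[0]
image.save("longclip-flux.png")
```

Note that only the CLIP branch is swapped here; in Flux, the T5 encoder (`text_encoder_2`) and its tokenizer are left untouched.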
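Cosine-similarity readouts like the ones shown above can be reproduced along these lines; a sketch assuming a local `example.jpg` and four placeholder captions.

```
import torch
from PIL import Image
from transformers import CLIPConfig, CLIPModel, CLIPProcessor

model_id = "zer0int/LongCLIP-GmP-ViT-L-14"
config = CLIPConfig.from_pretrained(model_id)
config.text_config.max_position_embeddings = 248
model = CLIPModel.from_pretrained(model_id, config=config).to("cuda").eval()
processor = CLIPProcessor.from_pretrained(model_id)

image = Image.open("example.jpg")  # placeholder image
texts = ["a photo of a cat", "a photo of a dog",      # placeholder captions
         "a painting of a cat", "a painting of a dog"]

# Pad the text inputs to the full 248-token context.
inputs = processor(text=texts, images=image, return_tensors="pt",
                   padding="max_length", max_length=248).to("cuda")
with torch.no_grad():
    out = model(**inputs)

# Cosine similarity between the image embedding and each text embedding.
img = out.image_embeds / out.image_embeds.norm(dim=-1, keepdim=True)
txt = out.text_embeds / out.text_embeds.norm(dim=-1, keepdim=True)
print(img @ txt.T)  # shape [1, 4], cf. the tensor shown above
```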