<img src="https://raw.githubusercontent.com/AIRI-Institute/OmniFusion/main/content/examples.png" width="100%">
</p>

### How to Use
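
The snippet below runs single-image visual question answering: the image is encoded with CLIP, projected into the language model's embedding space by the adapter `projection`, wrapped between special image tokens, concatenated with the system prompt and the user question, and passed to the model for generation. It assumes the `OmniMistral-tokenizer` and `OmniMistral-model` checkpoints, the repository's `clip_encoder` module, and the released `projection` adapter and `special_embs` special-token embeddings are available locally (see the loading note after the snippet).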

```python
import torch
from urllib.request import urlopen

from PIL import Image
from clip_encoder import CLIPVisionTower
from transformers import AutoTokenizer, AutoModelForCausalLM


DEVICE = "cuda:0"
PROMPT = "This is a dialog with AI assistant.\n"

# Language model and tokenizer.
tokenizer = AutoTokenizer.from_pretrained("OmniMistral-tokenizer", use_fast=False)
model = AutoModelForCausalLM.from_pretrained("OmniMistral-model", torch_dtype=torch.bfloat16, device_map=DEVICE)

# CLIP vision tower used as the image encoder.
clip = CLIPVisionTower("openai/clip-vit-large-patch14-336")
clip.load_model()
clip = clip.to(device=DEVICE, dtype=torch.bfloat16)

# NOTE: `projection` (the image-to-text adapter) and `special_embs` (embeddings of the
# SOI/EOI/USER/BOT service tokens) are used below and must be loaded beforehand
# (see the note after this snippet).


def gen_answer(model, tokenizer, clip, projection, query, special_embs, image=None):
    # Token sequences the model is not allowed to generate (newline, </s>, colon).
    bad_words_ids = tokenizer(["\n", "</s>", ":"], add_special_tokens=False).input_ids + [[13]]
    gen_params = {
        "do_sample": False,
        "max_new_tokens": 50,
        "early_stopping": True,
        "num_beams": 3,
        "repetition_penalty": 1.0,
        "remove_invalid_values": True,
        "eos_token_id": 2,
        "pad_token_id": 2,
        "forced_eos_token_id": 2,
        "use_cache": True,
        "no_repeat_ngram_size": 4,
        "bad_words_ids": bad_words_ids,
        "num_return_sequences": 1,
    }
    with torch.no_grad():
        # Encode the image with CLIP and project it into the LLM embedding space.
        image_features = clip.image_processor(image, return_tensors='pt')
        image_embedding = clip(image_features['pixel_values']).to(device=DEVICE, dtype=torch.bfloat16)
        projected_vision_embeddings = projection(image_embedding).to(device=DEVICE, dtype=torch.bfloat16)

        # Embed the system prompt and the user question.
        prompt_ids = tokenizer.encode(PROMPT, add_special_tokens=False, return_tensors="pt").to(device=DEVICE)
        question_ids = tokenizer.encode(query, add_special_tokens=False, return_tensors="pt").to(device=DEVICE)
        prompt_embeddings = model.model.embed_tokens(prompt_ids).to(torch.bfloat16)
        question_embeddings = model.model.embed_tokens(question_ids).to(torch.bfloat16)

        # Input sequence: prompt, <SOI> image <EOI>, USER question, BOT.
        embeddings = torch.cat(
            [
                prompt_embeddings,
                special_embs['SOI'][None, None, ...],
                projected_vision_embeddings,
                special_embs['EOI'][None, None, ...],
                special_embs['USER'][None, None, ...],
                question_embeddings,
                special_embs['BOT'][None, None, ...],
            ],
            dim=1,
        ).to(dtype=torch.bfloat16, device=DEVICE)
        out = model.generate(inputs_embeds=embeddings, **gen_params)
    out = out[:, 1:]
    generated_texts = tokenizer.batch_decode(out)[0]
    return generated_texts


answer = gen_answer(
    model,
    tokenizer,
    clip,
    projection,
    query="who is the author?",
    special_embs=special_embs,
    image=Image.open(urlopen("https://i.pinimg.com/originals/32/c7/81/32c78115cb47fd4825e6907a83b7afff.jpg")),
)

print(answer)
```
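
`projection` and `special_embs` are used above but their loading is not shown. A minimal sketch, assuming they are distributed as ordinary `torch.save` artifacts alongside the model weights (the file names below are placeholders, not necessarily the actual release layout):

```python
import torch

DEVICE = "cuda:0"

# Placeholder file names -- replace them with the files shipped with the OmniFusion checkpoints.
projection = torch.load("projection.pt", map_location=DEVICE)            # image-to-text adapter (an nn.Module)
special_embs = torch.load("special_embeddings.pt", map_location=DEVICE)  # dict with 'SOI', 'EOI', 'USER', 'BOT' embeddings
```
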
### Future Plans
Work is underway on a version that understands Russian, uses ImageBind encoders, and accepts more modalities (sound, 3D, video). Stay tuned for updates on GitHub!