<img src="https://raw.githubusercontent.com/AIRI-Institute/OmniFusion/main/content/examples.png" width="100%">
</p>

### How to Use
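
The snippet below runs single-image visual question answering: the image is encoded with CLIP, projected into the language model's embedding space by the adapter `projection`, wrapped between special image tokens, concatenated with the system prompt and the user question, and passed to the model for generation. It assumes the `OmniMistral-tokenizer` and `OmniMistral-model` checkpoints, the repository's `clip_encoder` module, and the released `projection` adapter and `special_embs` special-token embeddings are available locally (see the loading note after the snippet).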

```python
import torch
from urllib.request import urlopen

from PIL import Image
from clip_encoder import CLIPVisionTower
from transformers import AutoTokenizer, AutoModelForCausalLM


DEVICE = "cuda:0"
PROMPT = "This is a dialog with AI assistant.\n"

# Language model and tokenizer.
tokenizer = AutoTokenizer.from_pretrained("OmniMistral-tokenizer", use_fast=False)
model = AutoModelForCausalLM.from_pretrained("OmniMistral-model", torch_dtype=torch.bfloat16, device_map=DEVICE)

# CLIP vision tower used as the image encoder.
clip = CLIPVisionTower("openai/clip-vit-large-patch14-336")
clip.load_model()
clip = clip.to(device=DEVICE, dtype=torch.bfloat16)

# NOTE: `projection` (the image-to-text adapter) and `special_embs` (embeddings of the
# SOI/EOI/USER/BOT service tokens) are used below and must be loaded beforehand
# (see the note after this snippet).


def gen_answer(model, tokenizer, clip, projection, query, special_embs, image=None):
    # Token sequences the model is not allowed to generate (newline, </s>, colon).
    bad_words_ids = tokenizer(["\n", "</s>", ":"], add_special_tokens=False).input_ids + [[13]]
    gen_params = {
        "do_sample": False,
        "max_new_tokens": 50,
        "early_stopping": True,
        "num_beams": 3,
        "repetition_penalty": 1.0,
        "remove_invalid_values": True,
        "eos_token_id": 2,
        "pad_token_id": 2,
        "forced_eos_token_id": 2,
        "use_cache": True,
        "no_repeat_ngram_size": 4,
        "bad_words_ids": bad_words_ids,
        "num_return_sequences": 1,
    }
    with torch.no_grad():
        # Encode the image with CLIP and project it into the LLM embedding space.
        image_features = clip.image_processor(image, return_tensors='pt')
        image_embedding = clip(image_features['pixel_values']).to(device=DEVICE, dtype=torch.bfloat16)
        projected_vision_embeddings = projection(image_embedding).to(device=DEVICE, dtype=torch.bfloat16)

        # Embed the system prompt and the user question.
        prompt_ids = tokenizer.encode(PROMPT, add_special_tokens=False, return_tensors="pt").to(device=DEVICE)
        question_ids = tokenizer.encode(query, add_special_tokens=False, return_tensors="pt").to(device=DEVICE)
        prompt_embeddings = model.model.embed_tokens(prompt_ids).to(torch.bfloat16)
        question_embeddings = model.model.embed_tokens(question_ids).to(torch.bfloat16)

        # Input sequence: prompt, <SOI> image <EOI>, USER question, BOT.
        embeddings = torch.cat(
            [
                prompt_embeddings,
                special_embs['SOI'][None, None, ...],
                projected_vision_embeddings,
                special_embs['EOI'][None, None, ...],
                special_embs['USER'][None, None, ...],
                question_embeddings,
                special_embs['BOT'][None, None, ...],
            ],
            dim=1,
        ).to(dtype=torch.bfloat16, device=DEVICE)
        out = model.generate(inputs_embeds=embeddings, **gen_params)
    out = out[:, 1:]
    generated_texts = tokenizer.batch_decode(out)[0]
    return generated_texts


answer = gen_answer(
    model,
    tokenizer,
    clip,
    projection,
    query="who is the author?",
    special_embs=special_embs,
    image=Image.open(urlopen("https://i.pinimg.com/originals/32/c7/81/32c78115cb47fd4825e6907a83b7afff.jpg")),
)

print(answer)
```
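
`projection` and `special_embs` are used above but their loading is not shown. A minimal sketch, assuming they are distributed as ordinary `torch.save` artifacts alongside the model weights (the file names below are placeholders, not necessarily the actual release layout):

```python
import torch

DEVICE = "cuda:0"

# Placeholder file names -- replace them with the files shipped with the OmniFusion checkpoints.
projection = torch.load("projection.pt", map_location=DEVICE)            # image-to-text adapter (an nn.Module)
special_embs = torch.load("special_embeddings.pt", map_location=DEVICE)  # dict with 'SOI', 'EOI', 'USER', 'BOT' embeddings
```
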
### Future Plans
Work is underway on a version that understands Russian, uses ImageBind encoders, and accepts more modalities (sound, 3D, video). Stay tuned for updates on GitHub!