Updated "How to Use" python example
README.md
CHANGED
@@ -56,96 +56,26 @@ from PIL import Image
 from transformers import AutoTokenizer, AutoModelForCausalLM
 from urllib.request import urlopen
 import torch.nn as nn
-from
+from huggingface_hub import hf_hub_download
+
+# Loading some sources of the projection adapter and image encoder
+hf_hub_download(repo_id="AIRI-Institute/OmniFusion", filename="models.py", local_dir='./')
+from models import CLIPVisionTower
 
 DEVICE = "cuda:0"
 PROMPT = "This is a dialog with AI assistant.\n"
-tokenizer = AutoTokenizer.from_pretrained("OmniMistral-tokenizer", use_fast=False)
-model = AutoModelForCausalLM.from_pretrained("OmniMistral-model", torch_dtype=torch.bfloat16, device_map=DEVICE)
 
+tokenizer = AutoTokenizer.from_pretrained("AIRI-Institute/OmniFusion", subfolder="OmniMistral-tokenizer", use_fast=False)
+model = AutoModelForCausalLM.from_pretrained("AIRI-Institute/OmniFusion", subfolder="OmniMistral-model", torch_dtype=torch.bfloat16, device_map=DEVICE)
+
+hf_hub_download(repo_id="AIRI-Institute/OmniFusion", filename="projection", local_dir='./')
+hf_hub_download(repo_id="AIRI-Institute/OmniFusion", filename="special_embeddings.pt", local_dir='./')
 projection = torch.load("projection", map_location=DEVICE)
 special_embs = torch.load("special_embeddings.pt", map_location=DEVICE)
 
 
 
-
-
-class CLIPVisionTower(nn.Module):
-    def __init__(self, vision_tower, args, delay_load=False):
-        super().__init__()
-
-        self.is_loaded = False
-
-        self.vision_tower_name = vision_tower
-        self.select_layer = args.mm_vision_select_layer
-        self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')
-
-        if not delay_load:
-            self.load_model()
-        else:
-            self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name)
-
-    def load_model(self):
-        self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name)
-        self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name)
-        self.vision_tower.requires_grad_(False)
-
-        self.is_loaded = True
-
-    def feature_select(self, image_forward_outs):
-        image_features = image_forward_outs.hidden_states[self.select_layer]
-        if self.select_feature == 'patch':
-            image_features = image_features[:, 1:]
-        elif self.select_feature == 'cls_patch':
-            image_features = image_features
-        else:
-            raise ValueError(f'Unexpected select feature: {self.select_feature}')
-        return image_features
-
-    @torch.no_grad()
-    def forward(self, images):
-        if type(images) is list:
-            image_features = []
-            for image in images:
-                image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True)
-                image_feature = self.feature_select(image_forward_out).to(image.dtype)
-                image_features.append(image_feature)
-        else:
-            image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True)
-            image_features = self.feature_select(image_forward_outs).to(images.dtype)
-
-        return image_features
-
-    @property
-    def dummy_feature(self):
-        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
-
-    @property
-    def dtype(self):
-        return self.vision_tower.dtype
-
-    @property
-    def device(self):
-        return self.vision_tower.device
-
-    @property
-    def config(self):
-        if self.is_loaded:
-            return self.vision_tower.config
-        else:
-            return self.cfg_only
-
-    @property
-    def hidden_size(self):
-        return self.config.hidden_size
-
-
-class ClipTowerCfg:
-    def __init__(self):
-        self.mm_vision_select_feature = 'patch'
-        self.mm_vision_select_layer = -2
-
-clip = CLIPVisionTower("openai/clip-vit-large-patch14-336", ClipTowerCfg())
+clip = CLIPVisionTower("openai/clip-vit-large-patch14-336")
 clip.load_model()
 clip = clip.to(device=DEVICE, dtype=torch.bfloat16)
 
@@ -169,11 +99,11 @@ def gen_answer(model, tokenizer, clip, projection, query, special_embs, image=No
     with torch.no_grad():
         image_features = clip.image_processor(image, return_tensors='pt')
         image_embedding = clip(image_features['pixel_values']).to(device=DEVICE, dtype=torch.bfloat16)
-
+
         projected_vision_embeddings = projection(image_embedding).to(device=DEVICE, dtype=torch.bfloat16)
         prompt_ids = tokenizer.encode(f"{PROMPT}", add_special_tokens=False, return_tensors="pt").to(device=DEVICE)
         question_ids = tokenizer.encode(query, add_special_tokens=False, return_tensors="pt").to(device=DEVICE)
-
+
         prompt_embeddings = model.model.embed_tokens(prompt_ids).to(torch.bfloat16)
         question_embeddings = model.model.embed_tokens(question_ids).to(torch.bfloat16)
 
@@ -200,11 +130,11 @@ img = Image.open(urlopen(img_url))
 
 answer = gen_answer(
     model,
-    tokenizer,
-    clip,
-    projection,
-    query=question,
-    special_embs=special_embs,
+    tokenizer,
+    clip,
+    projection,
+    query=question,
+    special_embs=special_embs,
     image=img
 )
 
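For readers who just want the end result, here is the setup portion of the updated "How to Use" example, assembled from the added lines above. The repo IDs, filenames, subfolder names, and the CLIPVisionTower call are taken verbatim from the diff; the import torch line is an assumption, taken from the unchanged part of the README that the diff only shows as context.

import torch  # assumed: imported earlier in the README, needed for torch.load and torch.bfloat16
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import hf_hub_download

# Download the helper module that now provides the CLIP image-encoder wrapper
hf_hub_download(repo_id="AIRI-Institute/OmniFusion", filename="models.py", local_dir='./')
from models import CLIPVisionTower

DEVICE = "cuda:0"
PROMPT = "This is a dialog with AI assistant.\n"

# Tokenizer and base LLM are loaded directly from the Hub repo subfolders
tokenizer = AutoTokenizer.from_pretrained("AIRI-Institute/OmniFusion", subfolder="OmniMistral-tokenizer", use_fast=False)
model = AutoModelForCausalLM.from_pretrained("AIRI-Institute/OmniFusion", subfolder="OmniMistral-model", torch_dtype=torch.bfloat16, device_map=DEVICE)

# Projection adapter and special embeddings: download once, then load locally
hf_hub_download(repo_id="AIRI-Institute/OmniFusion", filename="projection", local_dir='./')
hf_hub_download(repo_id="AIRI-Institute/OmniFusion", filename="special_embeddings.pt", local_dir='./')
projection = torch.load("projection", map_location=DEVICE)
special_embs = torch.load("special_embeddings.pt", map_location=DEVICE)

# CLIP ViT-L/14-336 image encoder, moved to the same device in bfloat16
clip = CLIPVisionTower("openai/clip-vit-large-patch14-336")
clip.load_model()
clip = clip.to(device=DEVICE, dtype=torch.bfloat16)

Compared with the previous version, the inline CLIPVisionTower class and the local OmniMistral-tokenizer/OmniMistral-model paths are gone: the helper module, the projection adapter, and the special embeddings are all fetched from the AIRI-Institute/OmniFusion repo at run time.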