init

Browse files

Files changed (6) hide show

README.md +43 -0
__main__.py +204 -0
assets/demo-1.jpg +0 -0
assets/demo-2.jpg +0 -0
assets/demo-3.jpg +0 -0
mm_projector.bin +3 -0

README.md ADDED Viewed

	@@ -0,0 +1,43 @@

+---
+language:
+- en
+---
+# llama3-vision-alpha
+projection module trained to add vision capabilities to Llama 3 using SigLIP. built by [@yeswondwerr](https://x.com/yeswondwerr) and [@qtnx_](https://x.com/qtnx_)
+**usage**
+```
+pip install torch transformers bitsandbytes accelerate pillow
+```
+```
+python __main__.py -i image
+```
+**examples**
+| Image | Examples |
+| --- | --- |
+| ![](assets/demo-1.jpg) | **What is the title of this book? answer briefly**<br>The title of the book is "The Little Book of Deep Learning".<br><br>**Where is the person standing? answer briefly**<br> The person is standing on the balcony. |
+| ![](assets/demo-2.jpg) | **What type of food is the girl holding? answer briefly**<br>A hamburger!<br><br>**What color is the woman's hair? answer briefly**<br>It's white! |
+```
+                                       .x+=:.
+                                      z`    ^%                                                  .uef^"
+               .u    .                   .   <k                           .u    .             :d88E
+    .u@u     .d88B :@8c       .u       .@8Ned8"      .u          u      .d88B :@8c        .   `888E
+ .zWF8888bx ="8888f8888r   ud8888.   .@^%8888"    ud8888.     us888u.  ="8888f8888r  .udR88N   888E .z8k
+.888  9888    4888>'88"  :888'8888. x88:  `)8b. :888'8888. .@88 "8888"   4888>'88"  <888'888k  888E~?888L
+I888  9888    4888> '    d888 '88%" 8888N=*8888 d888 '88%" 9888  9888    4888> '    9888 'Y"   888E  888E
+I888  9888    4888>      8888.+"     %8"    R88 8888.+"    9888  9888    4888>      9888       888E  888E
+I888  9888   .d888L .+   8888L        @8Wou 9%  8888L      9888  9888   .d888L .+   9888       888E  888E
+`888Nx?888   ^"8888*"    '8888c. .+ .888888P`   '8888c. .+ 9888  9888   ^"8888*"    ?8888u../  888E  888E
+ "88" '888      "Y"       "88888%   `   ^"F      "88888%   "888*""888"     "Y"       "8888P'  m888N= 888>
+       88E                  "YP'                   "YP'     ^Y"   ^Y'                  "P'     `Y"   888
+       98>                                                                                          J88"
+       '8                                                                                           @%
+        `                                                                                         :"
+```

__main__.py ADDED Viewed

	@@ -0,0 +1,204 @@

+import argparse
+import numpy as np
+import torch
+import torch.nn as nn
+from PIL import Image
+from transformers import (
+    AutoModel,
+    AutoProcessor,
+    AutoTokenizer,
+    BitsAndBytesConfig,
+    LlamaForCausalLM, SiglipImageProcessor, SiglipVisionModel
+)
+from transformers import TextStreamer
def tokenizer_image_token(prompt, tokenizer, image_token_index=-200, return_tensors=None):
    """Tokenize ``prompt`` and mark each ``<image>`` tag with a placeholder id.

    The prompt is split on the literal ``<image>``, every piece is tokenized
    on its own, and ``image_token_index`` is inserted between consecutive
    pieces. When the first piece starts with ``tokenizer.bos_token_id``, that
    id is kept once at the front and the first id of every piece is dropped.

    ``return_tensors`` is accepted but never consulted: the result is always
    a 1-D ``torch.long`` tensor.
    """
    pieces = [tokenizer(text).input_ids for text in prompt.split('<image>')]

    starts_with_bos = (
        len(pieces) > 0
        and len(pieces[0]) > 0
        and pieces[0][0] == tokenizer.bos_token_id
    )
    if starts_with_bos:
        skip = 1
        token_ids = [pieces[0][0]]
    else:
        skip = 0
        token_ids = []

    for position, piece in enumerate(pieces):
        if position:
            # Exactly one placeholder id sits between two neighbouring pieces.
            token_ids.append(image_token_index)
        token_ids.extend(piece[skip:])

    return torch.tensor(token_ids, dtype=torch.long)
def process_tensors(input_ids, image_features, embedding_layer, image_token_index=-200):
    """Splice image features into the text embeddings at the image placeholder.

    Args:
        input_ids: 2-D tensor of token ids (indexed as ``[:, position]``)
            containing ``image_token_index`` where the image belongs.
        image_features: tensor concatenated along dim 1 between the two text
            segments; its device is used for the result.
        embedding_layer: callable mapping token ids to embeddings.
        image_token_index: placeholder id to look for (default ``-200``).

    Returns:
        ``(embeddings, attention_mask)``: the concatenated embeddings and an
        all-ones ``torch.long`` mask covering their first two dimensions.

    Raises:
        ValueError: if ``input_ids`` is not 2-D or holds no placeholder.
    """
    if input_ids.dim() != 2:
        raise ValueError(
            f"input_ids must be 2-D, got {input_ids.dim()} dimension(s)"
        )

    # Column positions of every placeholder; only the first match is used and
    # the same column splits every row.
    placeholder_columns = (input_ids == image_token_index).nonzero(as_tuple=True)[1]
    if placeholder_columns.numel() == 0:
        raise ValueError(
            f"image placeholder id {image_token_index} not found in input_ids"
        )
    split_index = int(placeholder_columns[0])

    # Text on either side of the placeholder; the placeholder itself is dropped.
    input_ids_1 = input_ids[:, :split_index]
    input_ids_2 = input_ids[:, split_index + 1:]

    device = image_features.device
    token_embeddings_part1 = embedding_layer(input_ids_1).to(device)
    token_embeddings_part2 = embedding_layer(input_ids_2).to(device)

    concatenated_embeddings = torch.cat(
        [token_embeddings_part1, image_features, token_embeddings_part2], dim=1
    )
    # Every position (text and image) is attended to.
    attention_mask = torch.ones(
        concatenated_embeddings.shape[:2], dtype=torch.long, device=device
    )
    return concatenated_embeddings, attention_mask
def initialize_models():
    """Load the tokenizer, the language model, the vision model and its processor.

    The language model is ``unsloth/llama-3-8b-Instruct`` loaded in 4-bit with
    float16 compute and ``device_map="auto"``; its base-model parameters are
    marked as not requiring gradients. The vision model is
    ``google/siglip-so400m-patch14-384`` in float16, moved to ``"cuda"``.

    Returns:
        ``(tokenizer, model, vision_model, processor)``.
    """
    llm_name = "unsloth/llama-3-8b-Instruct"
    model_name = "google/siglip-so400m-patch14-384"

    tokenizer = AutoTokenizer.from_pretrained(llm_name, use_fast=True)

    quantization = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
    )
    model = LlamaForCausalLM.from_pretrained(
        llm_name,
        torch_dtype=torch.float16,
        device_map="auto",
        quantization_config=quantization,
    )
    # The language model is used for inference only.
    for weight in model.base_model.parameters():
        weight.requires_grad = False

    vision_model = SiglipVisionModel.from_pretrained(
        model_name, torch_dtype=torch.float16
    ).to("cuda")
    processor = SiglipImageProcessor.from_pretrained(model_name)

    return tokenizer, model, vision_model, processor
class ProjectionModule(nn.Module):
    """Linear -> GELU -> Linear stack mapping ``mm_hidden_size`` to ``hidden_size``."""

    def __init__(self, mm_hidden_size, hidden_size):
        super().__init__()
        layers = [
            nn.Linear(mm_hidden_size, hidden_size),
            nn.GELU(),
            nn.Linear(hidden_size, hidden_size),
        ]
        # The attribute name and layer order determine the state_dict keys
        # ("model.0.*", "model.2.*"), so both are kept fixed.
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the projection stack to ``x`` and return the result."""
        projected = self.model(x)
        return projected
def load_projection_module(mm_hidden_size=1152, hidden_size=4096, device='cuda',
                           checkpoint_path=None):
    """Build a ``ProjectionModule`` and load its trained weights.

    Args:
        mm_hidden_size: input feature size of the projector.
        hidden_size: output feature size of the projector.
        device: device the module is moved to after loading.
        checkpoint_path: path to the ``mm_projector.bin`` state dict. When
            ``None``, the original training path is tried first, then a
            ``mm_projector.bin`` file located next to this script.

    Returns:
        The projector on ``device``, converted to half precision.
    """
    import os

    if checkpoint_path is None:
        candidates = (
            "./checkpoints/llama-3/checkpoint-2400/mm_projector.bin",
            os.path.join(
                os.path.dirname(os.path.abspath(__file__)), "mm_projector.bin"
            ),
        )
        # Fall back to the first candidate so a missing file still surfaces
        # as the loader's own error for the historical path.
        checkpoint_path = next(
            (path for path in candidates if os.path.isfile(path)), candidates[0]
        )

    projection_module = ProjectionModule(mm_hidden_size, hidden_size)

    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    # map_location="cpu" avoids requiring the device the file was saved from;
    # the module is moved to `device` below.
    checkpoint = torch.load(checkpoint_path, map_location="cpu")
    # Stored keys carry an "mm_projector." segment that the module does not use.
    checkpoint = {k.replace("mm_projector.", ""): v for k, v in checkpoint.items()}
    projection_module.load_state_dict(checkpoint)

    projection_module = projection_module.to(device).half()
    return projection_module
def _read_user_input():
    """Prompt for one line of user text; return "" when stdin is exhausted."""
    try:
        return input('user: ')
    except EOFError:
        return ""


def answer_question(
        image_path, tokenizer, model, vision_model, processor, projection_module
):
    """Run an interactive, multi-turn chat about a single image.

    The first user message is combined with the image; each later message is
    appended to the running conversation. The session ends (the function
    returns ``None``) as soon as the user submits an empty line or stdin
    reaches EOF.

    Args:
        image_path: path of the image to discuss.
        tokenizer: tokenizer for the language model; its ``bos_token_id`` and
            ``eos_token`` attributes are overwritten here.
        model: language model exposing ``generate``, ``device`` and
            ``get_input_embeddings``.
        vision_model: vision encoder called with ``output_hidden_states=True``.
        processor: image processor producing ``pixel_values``.
        projection_module: maps vision features to language-model embeddings.
    """
    # Close the file handle once the RGB copy has been made.
    with Image.open(image_path) as source:
        image = source.convert('RGB')

    tokenizer.bos_token_id = None
    tokenizer.eos_token = "<|eot_id|>"

    inp = _read_user_input()
    if not inp:
        print("exit...")
        return

    question = '<image>' + inp
    prompt = f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
    input_ids = tokenizer_image_token(
        prompt, tokenizer, -200, return_tensors='pt'
    ).unsqueeze(0).to(model.device)
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    with torch.inference_mode():
        image_inputs = processor(
            images=[image],
            return_tensors="pt",
            do_resize=True,
            size={"height": 384, "width": 384},
        ).to("cuda")
        pixel_values = image_inputs['pixel_values'].to(device='cuda', dtype=torch.float16)
        image_forward_outs = vision_model(pixel_values, output_hidden_states=True)

        # Second-to-last hidden state, with the first sequence position
        # dropped, exactly as the original pipeline fed the projector.
        image_features = image_forward_outs.hidden_states[-2][:, 1:]
        projected_embeddings = projection_module(image_features).to("cuda")

        embedding_layer = model.get_input_embeddings()
        new_embeds, attn_mask = process_tensors(
            input_ids, projected_embeddings, embedding_layer
        )
        device = model.device
        attn_mask = attn_mask.to(device)
        new_embeds = new_embeds.to(device)

        generation_kwargs = {
            'do_sample': True,
            'temperature': 0.2,
            'max_new_tokens': 2000,
            'use_cache': True,
            'streamer': streamer,
        }

        while True:
            generated_ids = model.generate(
                inputs_embeds=new_embeds,
                attention_mask=attn_mask,
                **generation_kwargs
            )[0]
            # Special tokens are kept so the reply can be replayed verbatim
            # as conversation history.
            generated_text = tokenizer.decode(generated_ids, skip_special_tokens=False)

            inp = _read_user_input()
            if not inp:
                print("exit...")
                return

            # The user turn is closed with <|eot_id|>, matching the first prompt.
            new_text = (
                generated_text
                + "<|start_header_id|>user<|end_header_id|>\n\n"
                + inp
                + "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
            )
            # NOTE(review): the tokenizer may prepend its own special tokens
            # here; confirm whether add_special_tokens=False is wanted.
            new_input_ids = tokenizer(new_text, return_tensors='pt').input_ids.to(device)
            new_embeddings = embedding_layer(new_input_ids)
            new_embeds = torch.cat([new_embeds, new_embeddings], dim=1)
            # Same dtype as the mask built for the first turn.
            attn_mask = torch.ones(new_embeds.shape[:2], dtype=torch.long, device=device)
if __name__ == "__main__":
    # Command-line entry point: parse the image path, load every model, then
    # hand control to the interactive chat loop.
    cli = argparse.ArgumentParser(description="Answer questions based on an image")
    cli.add_argument("-i", "--image", required=True, help="Path to the image file")
    image_path = cli.parse_args().image

    tokenizer, model, vision_model, processor = initialize_models()
    projector = load_projection_module()

    answer_question(image_path, tokenizer, model, vision_model, processor, projector)

assets/demo-1.jpg ADDED Viewed

assets/demo-2.jpg ADDED Viewed

assets/demo-3.jpg ADDED Viewed

mm_projector.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4c67486e883bf7f02b9756850c6f1914e7146936b49805bd3ca8583a71c4d40f
+size 43009661