Upload folder using huggingface_hub

Browse files

Files changed (10) hide show

added_tokens.json +5 -0
config.json +37 -0
configuration_doubutsu.py +15 -0
merges.txt +0 -0
modeling_doubutsu.py +139 -0
pytorch_model.bin +3 -0
special_tokens_map.json +20 -0
tokenizer.json +0 -0
tokenizer_config.json +43 -0
vocab.json +0 -0

added_tokens.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+  "<|endoftext|>": 151643,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644
+}

config.json ADDED Viewed

	@@ -0,0 +1,37 @@

+{
+  "auto_map": {
+    "AutoConfig": "configuration_doubutsu.DoubutsuConfig",
+    "AutoModelForCausalLM": "modeling_doubutsu.Doubutsu"
+  },
+  "model_type": "doubutsu",
+  "text_config": {
+    "_name_or_path": "Qwen/Qwen2-1.5B-Instruct",
+    "architectures": [
+      "Qwen2ForCausalLM"
+    ],
+    "bos_token_id": 151643,
+    "eos_token_id": 151645,
+    "hidden_size": 1536,
+    "intermediate_size": 8960,
+    "max_length": 32768,
+    "model_type": "qwen2",
+    "num_attention_heads": 12,
+    "num_hidden_layers": 28,
+    "num_key_value_heads": 2,
+    "rope_theta": 1000000.0,
+    "sliding_window": 32768,
+    "tie_word_embeddings": true,
+    "torch_dtype": "bfloat16"
+  },
+  "transformers_version": "4.40.1",
+  "vision_config": {
+    "_name_or_path": "google/siglip-so400m-patch14-384",
+    "hidden_size": 1152,
+    "image_size": 384,
+    "intermediate_size": 4304,
+    "model_type": "siglip_vision_model",
+    "num_attention_heads": 16,
+    "num_hidden_layers": 27,
+    "patch_size": 14
+  }
+}

configuration_doubutsu.py ADDED Viewed

	@@ -0,0 +1,15 @@

+from transformers import PretrainedConfig, Qwen2Config, SiglipVisionConfig
+class DoubutsuConfig(PretrainedConfig):
+    model_type = "doubutsu"
+    def __init__(self, **kwargs):
+        self.text_config = Qwen2Config(
+            **kwargs.pop(
+                "text_config",
+                {},
+            ),
+        )
+        self.vision_config = SiglipVisionConfig(**kwargs.pop("vision_config", {}))
+        super().__init__(**kwargs)

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

modeling_doubutsu.py ADDED Viewed

	@@ -0,0 +1,139 @@

+import torch
+import torch.nn as nn
+from transformers import (
+    PreTrainedModel,
+    AutoModelForCausalLM,
+    AutoModel,
+    SiglipImageProcessor,
+)
+from .configuration_doubutsu import DoubutsuConfig
+class ProjectionModule(nn.Module):
+    def __init__(self, mm_hidden_size=1152, hidden_size=1536):
+        super(ProjectionModule, self).__init__()
+        self.model = nn.Sequential(
+            nn.Linear(mm_hidden_size, hidden_size),
+            nn.GELU(),
+            nn.Linear(hidden_size, hidden_size),
+        )
+    def forward(self, x):
+        return self.model(x)
+class Doubutsu(PreTrainedModel):
+    config_class = DoubutsuConfig
+    def __init__(self, config):
+        super().__init__(config)
+        self.vision_model = AutoModel.from_config(self.config.vision_config)
+        self.text_model = AutoModelForCausalLM.from_config(self.config.text_config)
+        self.processor = SiglipImageProcessor()
+        self.mm_projector = ProjectionModule(
+            mm_hidden_size=config.vision_config.hidden_size,
+            hidden_size=config.text_config.hidden_size,
+        )
+    @property
+    def device(self):
+        return self.text_model.device
+    def encode_image(self, image):
+        image = image.convert("RGB")
+        image = self.processor(
+            images=image,
+            return_tensors="pt",
+            do_resize=True,
+            size={"height": 378, "width": 378},
+        )["pixel_values"].to(
+            device=self.vision_model.device, dtype=self.vision_model.dtype
+        )
+        with torch.no_grad():
+            return self.vision_model(image, output_hidden_states=True).hidden_states[-2]
+    def input_embeds(self, prompt, image_embeds, tokenizer):
+        def _tokenize(txt):
+            return tokenizer(
+                txt, return_tensors="pt", add_special_tokens=False
+            ).input_ids.to(self.device)
+        text_emb = self.text_model.get_input_embeddings()
+        embeds = []
+        tokenized_prompt = _tokenize(prompt)
+        # Add BOS token if it exists and isn't already at the start of the prompt
+        if tokenizer.bos_token_id is not None:
+            if tokenized_prompt[0][0] == tokenizer.bos_token_id:
+                tokenized_prompt = tokenized_prompt[:, 1:]  # Remove existing BOS
+            embeds.append(
+                text_emb(torch.tensor([[tokenizer.bos_token_id]], device=self.device))
+            )
+        # Add image embeds
+        projected_image_embeds = self.mm_projector(image_embeds.to(self.device))
+        embeds.append(projected_image_embeds)
+        # Add text embeds
+        embeds.append(text_emb(tokenized_prompt))
+        return torch.cat(embeds, dim=1)
+    def get_input_embeddings(self):
+        return self.text_model.get_input_embeddings()
+    def generate(
+        self,
+        image_embeds,
+        prompt,
+        tokenizer,
+        max_new_tokens=128,
+        temperature=0.1,
+        **kwargs,
+    ):
+        generate_config = {
+            "eos_token_id": tokenizer.eos_token_id,
+            "bos_token_id": tokenizer.bos_token_id,
+            "pad_token_id": tokenizer.pad_token_id,
+            "max_new_tokens": max_new_tokens,
+            "temperature": temperature,
+            **kwargs,
+        }
+        with torch.no_grad():
+            inputs_embeds = self.input_embeds(prompt, image_embeds, tokenizer)
+            output_ids = self.text_model.generate(
+                inputs_embeds=inputs_embeds,
+                do_sample=True,
+                **generate_config,
+            )
+        return tokenizer.batch_decode(output_ids, skip_special_tokens=True)
+    def answer_question(self, image, question, tokenizer, **kwargs):
+        image_embeds = self.encode_image(image)
+        chat = [
+            {
+                "role": "system",
+                "content": "You are a helpful AI assistant that can see images and answer questions about them.",
+            },
+            {"role": "user", "content": question},
+        ]
+        prompt = tokenizer.apply_chat_template(
+            chat, tokenize=False, add_generation_prompt=True
+        )
+        # Generate the answer
+        with torch.no_grad():
+            output = self.generate(
+                image_embeds=image_embeds,
+                prompt=prompt,
+                tokenizer=tokenizer,
+                **kwargs,
+            )[0]
+        # Clean and return the answer
+        cleaned_answer = output.strip()
+        return cleaned_answer

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c10024a70443cf96a47827579df1f55adcdaef649c9e9c1dc33481f64573cb44
+size 3952463074

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,20 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "bos_token": null,
+  "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "model_max_length": 32768,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff