benchang1110 commited on
Commit
a83e261
1 Parent(s): 002067f

Upload TaiVisionForCausalLM

Browse files
Files changed (3) hide show
  1. config.json +6 -1
  2. model.safetensors +1 -1
  3. modeling_taivisionlm.py +9 -40
config.json CHANGED
@@ -1,6 +1,10 @@
1
  {
 
 
 
2
  "auto_map": {
3
- "AutoConfig": "configuration_taivisionlm.TaiVisionLMConfig"
 
4
  },
5
  "hidden_size": 2048,
6
  "ignore_index": -100,
@@ -21,6 +25,7 @@
21
  "torch_dtype": "bfloat16",
22
  "vocab_size": 32001
23
  },
 
24
  "transformers_version": "4.44.0",
25
  "vision_config": {
26
  "model_type": "siglip_vision_model",
 
1
  {
2
+ "architectures": [
3
+ "TaiVisionForCausalLM"
4
+ ],
5
  "auto_map": {
6
+ "AutoConfig": "configuration_taivisionlm.TaiVisionLMConfig",
7
+ "AutoModelForCausalLM": "modeling_taivisionlm.TaiVisionForCausalLM"
8
  },
9
  "hidden_size": 2048,
10
  "ignore_index": -100,
 
25
  "torch_dtype": "bfloat16",
26
  "vocab_size": 32001
27
  },
28
+ "torch_dtype": "float32",
29
  "transformers_version": "4.44.0",
30
  "vision_config": {
31
  "model_type": "siglip_vision_model",
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0e3c91245701f070448659cda849d90ef35ea419ad8aa53c459b20a7d516df00
3
  size 4806424752
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11d50e45bc0203fb3be9a06add95e21a024690098db67cd7b97f29ae03c2bb57
3
  size 4806424752
modeling_taivisionlm.py CHANGED
@@ -156,18 +156,17 @@ class TaiVisionForCausalLM(TaiVisionPreTrainedModel):
156
  self.language_model = language_model
157
  self.post_init()
158
 
159
- def load_pretrained(self):
160
- '''
161
- load the pretrained weights for language model and vision model
162
- '''
163
- import transformers
164
- language_model = AutoModelForCausalLM.from_pretrained("benchang1110/Taiwan-tinyllama-v1.0-chat")
165
  if language_model.vocab_size != self.vocab_size:
166
  print("vocab size mismatch, resize the token embeddings for the pretained language model")
167
  language_model.resize_token_embeddings(self.vocab_size)
168
- self.language_model = language_model
169
- vision_model = transformers.SiglipVisionModel.from_pretrained("google/siglip-base-patch16-224")
170
- self.vision_tower = vision_model
 
 
 
171
 
172
  # Copied from transformers.models.paligemma.modeling_paligemma.PaliGemmaForConditionalGeneration.get_input_embeddings with PaliGemma->TaiVisionLM
173
  def get_input_embeddings(self):
@@ -439,34 +438,4 @@ class TaiVisionForCausalLM(TaiVisionPreTrainedModel):
439
  if cache_position[0] == 0:
440
  model_inputs["pixel_values"] = pixel_values
441
 
442
- return model_inputs
443
-
444
-
445
-
446
- if __name__ == '__main__':
447
- import transformers
448
- config = transformers.AutoConfig.from_pretrained("benchang1110/TaiVision-base",trust_remote_code=True)
449
- model = TaiVisionForCausalLM(config).to("cuda")
450
- print(model)
451
- model.save_pretrained
452
- # Test forward
453
- import torch
454
- from PIL import Image
455
- import requests
456
- # Initialize processor
457
- processor = transformers.AutoProcessor.from_pretrained("benchang1110/TaiVision-base", trust_remote_code=True)
458
-
459
- # Load image
460
- url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg"
461
- image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
462
-
463
- # Define prompt and label
464
- prompt = "What is the color of the car?"
465
- label = "I am fine, thank you."
466
-
467
- # Process inputs
468
- inputs = processor(prompts=prompt,images=image, return_tensors="pt", padding=False, max_length=512).to('cuda')
469
-
470
- outputs = model.generate(**inputs, max_length=512, do_sample=True, pad_token_id=processor.tokenizer.pad_token_id)
471
- print(processor.decode(outputs[0], skip_special_tokens=True))
472
-
 
156
  self.language_model = language_model
157
  self.post_init()
158
 
159
def load_language_model(self, model_id = "benchang1110/Taiwan-tinyllama-v1.0-chat"):
    """Load pretrained weights into the composed language model.

    Downloads the checkpoint identified by ``model_id``, resizes its token
    embeddings if its vocabulary size differs from this model's configured
    ``self.vocab_size``, then copies the weights into ``self.language_model``
    in place via ``load_state_dict(strict=True)``.

    Args:
        model_id: Hugging Face Hub id (or local path) of the causal-LM
            checkpoint to load. Defaults to the Taiwan-tinyllama chat model.
    """
    language_model = AutoModelForCausalLM.from_pretrained(model_id)
    if language_model.vocab_size != self.vocab_size:
        # Align embedding rows with this model's vocab before the strict copy,
        # otherwise load_state_dict would fail on shape mismatch.
        # NOTE: fixed typo in the message ("pretained" -> "pretrained").
        print("vocab size mismatch, resize the token embeddings for the pretrained language model")
        language_model.resize_token_embeddings(self.vocab_size)
    self.language_model.load_state_dict(language_model.state_dict(), strict=True)
165
+
166
def load_vision_model(self, model_id = "google/siglip-base-patch16-224"):
    """Load pretrained SigLIP weights into the vision tower.

    Fetches the vision checkpoint named by ``model_id`` and copies its
    parameters into ``self.vision_tower`` in place (strict key matching).

    Args:
        model_id: Hugging Face Hub id (or local path) of the SigLIP vision
            checkpoint. Defaults to the base patch16-224 model.
    """
    import transformers

    pretrained_tower = transformers.SiglipVisionModel.from_pretrained(model_id)
    self.vision_tower.load_state_dict(pretrained_tower.state_dict(), strict=True)
170
 
171
  # Copied from transformers.models.paligemma.modeling_paligemma.PaliGemmaForConditionalGeneration.get_input_embeddings with PaliGemma->TaiVisionLM
172
  def get_input_embeddings(self):
 
438
  if cache_position[0] == 0:
439
  model_inputs["pixel_values"] = pixel_values
440
 
441
+ return model_inputs