Commit 57b4cf3 · Parent: e2bd36a

Upload LlavaOnevisionForConditionalGeneration

Files changed:
- README.md (+3 -3)
- config.json (+3 -7)
- model.safetensors (+1 -1)
README.md CHANGED
@@ -2,15 +2,15 @@
 language:
 - en
 - zh
-pipeline_tag: image-text-to-text
-inference: false
-arxiv: 2408.03326
 license: apache-2.0
 tags:
 - vision
 - image-text-to-text
 datasets:
 - lmms-lab/LLaVA-OneVision-Data
+pipeline_tag: image-text-to-text
+inference: false
+arxiv: 2408.03326
 ---
 # LLaVA-Onevision Model Card
 
config.json CHANGED
@@ -1,7 +1,7 @@
 {
   "_name_or_path": "/raid/raushan/ov-500",
   "architectures": [
-    "
+    "LlavaOnevisionForConditionalGeneration"
   ],
   "ignore_index": -100,
   "image_grid_pinpoints": [
@@ -151,7 +151,7 @@
     ]
   ],
   "image_token_index": 151646,
-  "model_type": "
+  "model_type": "llava_onevision",
   "projector_hidden_act": "gelu",
   "text_config": {
     "_name_or_path": "Qwen/Qwen2-0.5B-Instruct",
@@ -162,30 +162,26 @@
     "eos_token_id": 151645,
     "hidden_size": 896,
     "intermediate_size": 4864,
-    "max_position_embeddings": 32768,
     "max_window_layers": 24,
     "model_type": "qwen2",
     "num_attention_heads": 14,
     "num_hidden_layers": 24,
     "num_key_value_heads": 2,
     "rope_theta": 1000000.0,
-    "sliding_window": null,
     "tie_word_embeddings": true,
     "torch_dtype": "bfloat16",
-    "use_sliding_window": false,
     "vocab_size": 152000
   },
   "tie_word_embeddings": false,
   "torch_dtype": "float16",
   "transformers_version": "4.45.0.dev0",
   "use_image_newline_parameter": true,
+  "video_token_index": 151647,
   "vision_aspect_ratio": "anyres_max_9",
   "vision_config": {
-    "hidden_act": "gelu_pytorch_tanh",
     "hidden_size": 1152,
     "image_size": 384,
     "intermediate_size": 4304,
-    "layer_norm_eps": 1e-06,
     "model_type": "siglip_vision_model",
     "num_attention_heads": 16,
     "num_hidden_layers": 26,
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:07b3362c3412de79baf2379e44e5b0b2a8f4b965ebebd11d7b5b3eb4450fe96e
 size 1787445680
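The model.safetensors entry is a Git LFS pointer: the repo tracks only the object id (a sha256 of the weight file) and its byte size, not the weights themselves. A small sketch of how such a pointer can be checked against a downloaded copy, assuming the file has already been fetched locally:

```python
# Sketch: verify a downloaded weight file against its Git LFS pointer.
# The expected hash and size below come from the pointer in this commit.
import hashlib
import os

EXPECTED_OID = "07b3362c3412de79baf2379e44e5b0b2a8f4b965ebebd11d7b5b3eb4450fe96e"
EXPECTED_SIZE = 1787445680

path = "model.safetensors"  # assumed local path to the fetched file
assert os.path.getsize(path) == EXPECTED_SIZE, "size mismatch"

h = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        h.update(chunk)
assert h.hexdigest() == EXPECTED_OID, "sha256 mismatch"
print("pointer matches local file")
```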