Update README.md
README.md
CHANGED
@@ -25,6 +25,16 @@ In the second phase, it was fine-tuned with LLaVA-JP-Instruct-108K.
 resources for more information: https://github.com/tosiyuki/LLaVA-JP/tree/main

 **Comparing VLMs**
+|Model|JA-VG-VQA-500<br>(ROUGE-L)|JA-VLM-Bench-In-the-Wild<br>(ROUGE-L)|Heron-Bench(Detail)|Heron-Bench(Conv)|Heron-Bench(Complex)|Heron-Bench(Average)|
+|-|-|-|-|-|-|-|
+|[Japanese Stable VLM](https://huggingface.co/stabilityai/japanese-stable-vlm)|-|40.50|25.15|51.23|37.84|38.07|
+|[EvoVLM-JP-v1-7B](https://huggingface.co/SakanaAI/EvoVLM-JP-v1-7B)|**19.70**|**51.25**|50.31|44.42|40.47|45.07|
+|[Heron BLIP Japanese StableLM Base 7B llava-620k](https://huggingface.co/turing-motors/heron-chat-blip-ja-stablelm-base-7b-v1-llava-620k)|14.51|33.26|49.09|41.51|45.72|45.44|
+|[Heron GIT Japanese StableLM Base 7B](https://huggingface.co/turing-motors/heron-chat-git-ja-stablelm-base-7b-v1)|15.18|37.82|42.77|**54.20**|43.53|46.83|
+|[llava-jp-1.3b-v1.1](https://huggingface.co/toshi456/llava-jp-1.3b-v1.1)|13.33|44.40|50.00|51.83|**48.98**|**50.39**|
+|[llava-jp-1.3b-v1.1-llava-jp-instruct-108k](https://huggingface.co/toshi456/llava-jp-1.3b-v1.1-llava-jp-instruct-108k)|-|17.07|**50.60**|45.31|33.24|41.52|
+
+![image/png](https://cdn-uploads.huggingface.co/production/uploads/630af71ffaaea618ebc973db/SIXXIqwp-voffOXKZouqb.png)

 ## How to use the model
 **1. Download dependencies**
@@ -34,7 +44,6 @@ git clone https://github.com/tosiyuki/LLaVA-JP.git

 **2. Inference**
 ```python
-import requests
 import torch
 import transformers
 from PIL import Image
@@ -43,12 +52,11 @@ from transformers.generation.streamers import TextStreamer
 from llava.constants import DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX
 from llava.conversation import conv_templates, SeparatorStyle
 from llava.model.llava_gpt2 import LlavaGpt2ForCausalLM
-from llava.train.arguments_dataclass import ModelArguments, DataArguments, TrainingArguments
 from llava.train.dataset import tokenizer_image_token


 if __name__ == "__main__":
-    model_path = 'toshi456/llava-jp-1.3b-v1.1'
+    model_path = 'toshi456/llava-jp-1.3b-v1.1-llava-jp-instruct-108k'
     device = "cuda" if torch.cuda.is_available() else "cpu"
     torch_dtype = torch.bfloat16 if device=="cuda" else torch.float32

@@ -93,7 +101,7 @@ if __name__ == "__main__":

     # create prompt
     # ユーザー: <image>\n{prompt}
-    prompt = "
+    prompt = "画像について説明してください。"
     inp = DEFAULT_IMAGE_TOKEN + '\n' + prompt
     conv.append_message(conv.roles[0], inp)
     conv.append_message(conv.roles[1], None)
@@ -115,18 +123,19 @@ if __name__ == "__main__":

     # predict
     with torch.inference_mode():
-        model.generate(
+        output_id = model.generate(
             inputs=input_ids,
             images=image_tensor,
-            do_sample=
-            temperature=0
+            do_sample=False,
+            temperature=1.0,
             top_p=1.0,
+            no_repeat_ngram_size=2,
             max_new_tokens=256,
             streamer=streamer,
             use_cache=True,
         )
-    """猫の隣にはノートパソコンがあります。"""

+    """グレーの壁に置かれた木製のテーブルの上に、茶色のタビーの猫が横たわっている。猫は右を向いており、頭は左を向き、尻尾は体の前に突き出ているように見える。テーブルは木製で、猫の後ろには黒い金属製の脚があり、テーブルの下には小さな緑の植物が置かれる。<EOD|LLM-jp>"""
 ```

 ## Training dataset
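
A note on the **1. Download dependencies** step that the first hunk leaves as context: the `from llava...` imports in the inference snippet resolve only when the cloned LLaVA-JP repository is importable, for example by running the script from the repository root. A minimal sketch, assuming the repository was cloned to `./LLaVA-JP` with the `git clone` command visible in the second hunk header; the clone location is an assumption, not part of this commit:

```python
import sys

# Assumed clone location; adjust if the repository lives elsewhere.
sys.path.append("./LLaVA-JP")

# With the path set, the imports used in the diffed snippet work unchanged.
from llava.constants import DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX
from llava.train.dataset import tokenizer_image_token
```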
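The diff jumps from line 62 to line 101 of the new file, so the objects consumed by the changed `generate` call (`model`, `tokenizer`, `conv`, `image_tensor`, `input_ids`, `streamer`) are built by code this commit does not touch. The sketch below is a rough, self-contained reconstruction of that glue based only on the imports shown in the hunks; the conversation-template key `"v1"`, the `CLIPImageProcessor` stand-in, and the local `sample.jpg` path are illustrative assumptions, not content of this commit:

```python
import torch
import transformers
from PIL import Image
from transformers.generation.streamers import TextStreamer

from llava.constants import DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX
from llava.conversation import conv_templates
from llava.model.llava_gpt2 import LlavaGpt2ForCausalLM
from llava.train.dataset import tokenizer_image_token

model_path = 'toshi456/llava-jp-1.3b-v1.1-llava-jp-instruct-108k'
device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.bfloat16 if device == "cuda" else torch.float32

# Load the fine-tuned checkpoint and its tokenizer from the Hub.
model = LlavaGpt2ForCausalLM.from_pretrained(
    model_path, torch_dtype=torch_dtype, low_cpu_mem_usage=True
).to(device).eval()
tokenizer = transformers.AutoTokenizer.from_pretrained(model_path)

# Turn an image into the tensor passed as `images=image_tensor`.
# CLIPImageProcessor is a stand-in; use whatever image processor the
# LLaVA-JP repository actually configures for this checkpoint.
image = Image.open("sample.jpg").convert("RGB")   # assumed local sample image
image_processor = transformers.CLIPImageProcessor.from_pretrained(model_path)
image_tensor = image_processor(image, return_tensors="pt")["pixel_values"].to(device, dtype=torch_dtype)

# Build the conversation and tokenize it, replacing the <image> placeholder
# with IMAGE_TOKEN_INDEX; "v1" is an assumed template key.
conv = conv_templates["v1"].copy()
prompt = "画像について説明してください。"  # "Please describe the image."
conv.append_message(conv.roles[0], DEFAULT_IMAGE_TOKEN + '\n' + prompt)
conv.append_message(conv.roles[1], None)
input_ids = tokenizer_image_token(
    conv.get_prompt(), tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt"
).unsqueeze(0).to(device)

# Stream decoded tokens to stdout while model.generate(...) runs.
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
```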
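On the changed output handling: the previous snippet only streamed text and quoted the example answer "猫の隣にはノートパソコンがあります。" (roughly, "There is a laptop next to the cat."), while the updated one keeps the return value of `model.generate()` in `output_id` and quotes a longer caption (roughly, "A brown tabby cat is lying on a wooden table against a gray wall ... a small green plant sits under the table."). Keeping the ids means the caption can also be recovered after generation without the streamer. Whether the returned ids include the prompt tokens depends on the model's `generate` implementation; the slice below assumes they do, as in stock `transformers` causal LMs:

```python
# Decode the finished generation; drop the slice if this model returns only
# the newly generated tokens rather than prompt + continuation.
new_tokens = output_id[0, input_ids.shape[1]:]
caption = tokenizer.decode(new_tokens, skip_special_tokens=True)
print(caption)
```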