llava-hf
/

llava-1.5-13b-hf

@@ -4,6 +4,10 @@ language:
 pipeline_tag: image-to-text
 inference: false
 arxiv: 2304.08485
 ---
 # LLaVA Model Card
@@ -43,10 +47,23 @@ import requests
 model_id = "llava-hf/llava-1.5-13b-hf"
 pipe = pipeline("image-to-text", model=model_id)
-url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg"
 image = Image.open(requests.get(url, stream=True).raw)
-prompt = "USER: <image>\nWhat does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud\nASSISTANT:"
 outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})
 print(outputs)
@@ -65,10 +82,6 @@ import torch
 from transformers import AutoProcessor, LlavaForConditionalGeneration
 model_id = "llava-hf/llava-1.5-13b-hf"
-prompt = "USER: <image>\nWhat are these?\nASSISTANT:"
-image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"
 model = LlavaForConditionalGeneration.from_pretrained(
     model_id,
     torch_dtype=torch.float16,
@@ -77,6 +90,21 @@ model = LlavaForConditionalGeneration.from_pretrained(
 processor = AutoProcessor.from_pretrained(model_id)
 raw_image = Image.open(requests.get(image_file, stream=True).raw)
 inputs = processor(prompt, raw_image, return_tensors='pt').to(0, torch.float16)

 pipeline_tag: image-to-text
 inference: false
 arxiv: 2304.08485
+license: llama2
+tags:
+- vision
+- image-text-to-text
 ---
 # LLaVA Model Card
 model_id = "llava-hf/llava-1.5-13b-hf"
 pipe = pipeline("image-to-text", model=model_id)
+url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg"
 image = Image.open(requests.get(url, stream=True).raw)
+# Define a chat history and use `apply_chat_template` to get a correctly formatted prompt
+# Each value in "content" has to be a list of dicts with types ("text", "image")
+conversation = [
+    {
+      "role": "user",
+      "content": [
+          {"type": "text", "text": "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud"},
+          {"type": "image"},
+        ],
+    },
+]
+prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
 outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})
 print(outputs)
 from transformers import AutoProcessor, LlavaForConditionalGeneration
 model_id = "llava-hf/llava-1.5-13b-hf"
 model = LlavaForConditionalGeneration.from_pretrained(
     model_id,
     torch_dtype=torch.float16,
 processor = AutoProcessor.from_pretrained(model_id)
+# Define a chat history and use `apply_chat_template` to get a correctly formatted prompt
+# Each value in "content" has to be a list of dicts with types ("text", "image")
+conversation = [
+    {
+      "role": "user",
+      "content": [
+          {"type": "text", "text": "What are these?"},
+          {"type": "image"},
+        ],
+    },
+]
+prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
+image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"
 raw_image = Image.open(requests.get(image_file, stream=True).raw)
 inputs = processor(prompt, raw_image, return_tensors='pt').to(0, torch.float16)