Example generation scripts
- README.md (+14, -3)
- generate.py (+140, -0)
- test.py (+38, -0)
README.md (CHANGED)

````diff
@@ -22,6 +22,8 @@ This model is designed to be used with the LAVIS library. Please install [salesf
 
 After loading the model, you can disable the qformer text input to follow the same configuration we used for fine-tuning. However, the model still works well with it enabled, so we recommend users to experiment with both and choose the optimal configuration on a case-by-case basis.
 
+Review the generate.py and test.py scripts provided in the Files tab for an example of using PG-InstructBLIP to determine the transparency of an opaque bowl.
+
 ```
 import torch
 from PIL import Image
@@ -32,13 +34,15 @@ from lavis.common.registry import registry
 
 import requests
 
+from generate import generate
+
 url = "https://iliad.stanford.edu/pg-vlm/example_images/ceramic_bowl.jpg"
 example_image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
 
 vlm = load_model(
     name='blip2_t5_instruct',
     model_type='flant5xxl',
-    checkpoint='
+    checkpoint='pgvlm_weights.bin', # replace with location of downloaded weights
     is_eval=True,
     device="cuda" if torch.cuda.is_available() else "cpu"
 )
@@ -56,6 +60,13 @@ question_samples = {
     'image': torch.stack([processor(example_image)], dim=0).to(vlm.device)
 }
 
-
+answers, scores = generate(vlm, question_samples, length_penalty=0, repetition_penalty=1, num_captions=3)
+print(answers, scores)
 # (['opaque', 'translucent', 'transparent'], tensor([-0.0448, -4.1387, -4.2793], device='cuda:0'))
-```
+```
+
+Note that the output of the generate function includes the log probabilities of each generation. For categorical properties (like material, transparency, and contents), these probabilities can be interpreted as confidences, as typical with VLMs. In the example above, PG-InstructBLIP is very confident that the ceramic bowl is opaque, which is true.
+
+For continuous properties (like mass, fragility, and deformability), we recommend asking yes or no questions like "Is this object heavy?" and comparing the probabilities of the "yes" response between objects to determine which has a larger value.
+
+For best results, we also recommend cropping input images to focus on the object in question, because PG-InstructBLIP is fine-tuned on object-centric data.
````
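The README note above treats the returned scores as log probabilities that can be read as confidences. The snippet below is a minimal, illustrative sketch of that reading, not something shipped with this commit: it reuses the scores printed in the example output and normalizes them over the three returned candidates, which assumes those candidates carry essentially all of the probability mass.

```python
import torch

# Values copied from the example output above; answers[i] corresponds to scores[i].
answers = ['opaque', 'translucent', 'transparent']
scores = torch.tensor([-0.0448, -4.1387, -4.2793])  # per-sequence log probabilities (length_penalty=0)

# Softmax over the returned candidates = exponentiate each log probability and renormalize.
confidences = torch.softmax(scores, dim=0)
for answer, confidence in zip(answers, confidences.tolist()):
    print(f"{answer}: {confidence:.3f}")
# Prints roughly: opaque: 0.970, translucent: 0.016, transparent: 0.014 --
# consistent with the README's claim that the model is confident the bowl is opaque.
```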
generate.py (ADDED, +140 lines)
```python
import torch


@torch.no_grad()
def generate(
    vlm,
    samples,
    use_nucleus_sampling=False,
    num_beams=5,
    max_length=256,
    min_length=1,
    top_p=0.9,
    repetition_penalty=1.5,
    length_penalty=1.0,
    num_captions=1,
    temperature=1,
):
    """Run PG-InstructBLIP generation and return (decoded answers, per-sequence scores).

    Mirrors the LAVIS InstructBLIP (BLIP-2 T5) generation path, but also returns
    `outputs.sequences_scores` so callers can inspect the log probability of each answer.
    """
    if "prompt" in samples.keys():
        prompt = samples["prompt"]
    else:
        prompt = vlm.prompt

    image = samples["image"]

    bs = image.size(0)

    if isinstance(prompt, str):
        prompt = [prompt] * bs
    else:
        assert len(prompt) == bs, "The number of prompts must be equal to the batch size."

    # For TextCaps
    if "ocr_tokens" in samples.keys() and "{}" in prompt[0]:
        prompt = [p.format(', '.join(samples['ocr_tokens'][i][:30])) for i, p in enumerate(prompt)]

    query_tokens = vlm.query_tokens.expand(bs, -1, -1)
    if vlm.qformer_text_input:
        # remove ocr tokens in q_former (for eval textvqa)
        # qformer_prompt = prompt
        # qformer_prompt = ['Question: ' + qp.split(' Question: ')[1] for qp in qformer_prompt]

        text_Qformer = vlm.tokenizer(
            prompt,
            padding='longest',
            truncation=True,
            max_length=vlm.max_txt_len,
            return_tensors="pt",
        ).to(image.device)
        query_atts = torch.ones(query_tokens.size()[:-1], dtype=torch.long).to(image.device)
        Qformer_atts = torch.cat([query_atts, text_Qformer.attention_mask], dim=1)

    # For video data
    if image.dim() == 5:
        inputs_t5, atts_t5 = [], []
        for j in range(image.size(2)):
            this_frame = image[:, :, j, :, :]
            with vlm.maybe_autocast():
                frame_embeds = vlm.ln_vision(vlm.visual_encoder(this_frame))
            frame_atts = torch.ones(frame_embeds.size()[:-1], dtype=torch.long).to(image.device)

            if vlm.qformer_text_input:
                frame_query_output = vlm.Qformer.bert(
                    text_Qformer.input_ids,
                    attention_mask=Qformer_atts,
                    query_embeds=query_tokens,
                    encoder_hidden_states=frame_embeds,
                    encoder_attention_mask=frame_atts,
                    return_dict=True,
                )
            else:
                frame_query_output = vlm.Qformer.bert(
                    query_embeds=query_tokens,
                    encoder_hidden_states=frame_embeds,
                    encoder_attention_mask=frame_atts,
                    return_dict=True,
                )

            frame_inputs_t5 = vlm.t5_proj(frame_query_output.last_hidden_state[:, :query_tokens.size(1), :])
            frame_atts_t5 = torch.ones(frame_inputs_t5.size()[:-1], dtype=torch.long).to(image.device)
            inputs_t5.append(frame_inputs_t5)
            atts_t5.append(frame_atts_t5)
        inputs_t5 = torch.cat(inputs_t5, dim=1)
        atts_t5 = torch.cat(atts_t5, dim=1)
    else:
        with vlm.maybe_autocast():
            image_embeds = vlm.ln_vision(vlm.visual_encoder(image))
        image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(image.device)

        if vlm.qformer_text_input:
            query_output = vlm.Qformer.bert(
                text_Qformer.input_ids,
                attention_mask=Qformer_atts,
                query_embeds=query_tokens,
                encoder_hidden_states=image_embeds,
                encoder_attention_mask=image_atts,
                return_dict=True,
            )
        else:
            query_output = vlm.Qformer.bert(
                query_embeds=query_tokens,
                encoder_hidden_states=image_embeds,
                encoder_attention_mask=image_atts,
                return_dict=True,
            )

        inputs_t5 = vlm.t5_proj(query_output.last_hidden_state[:, :query_tokens.size(1), :])
        atts_t5 = torch.ones(inputs_t5.size()[:-1], dtype=torch.long).to(image.device)

    input_tokens = vlm.t5_tokenizer(
        prompt,
        padding="longest",
        return_tensors="pt"
    ).to(image.device)

    encoder_atts = torch.cat([atts_t5, input_tokens.attention_mask], dim=1)

    with vlm.maybe_autocast(dtype=torch.bfloat16):
        # Prepend the projected Q-Former query embeddings to the embedded prompt tokens
        # and decode with the T5 language model.
        inputs_embeds = vlm.t5_model.encoder.embed_tokens(input_tokens.input_ids)
        inputs_embeds = torch.cat([inputs_t5, inputs_embeds], dim=1)

        outputs = vlm.t5_model.generate(
            return_dict_in_generate=True,
            output_scores=True,
            inputs_embeds=inputs_embeds,
            attention_mask=encoder_atts,
            do_sample=use_nucleus_sampling,
            top_p=top_p,
            temperature=temperature,
            num_beams=num_beams,
            max_new_tokens=max_length,
            min_length=min_length,
            repetition_penalty=repetition_penalty,
            length_penalty=length_penalty,
            num_return_sequences=num_captions,
        )
        output_text = vlm.t5_tokenizer.batch_decode(
            outputs.sequences, skip_special_tokens=True
        )

    return output_text, outputs.sequences_scores
```
test.py (ADDED, +38 lines)
```python
import torch
from PIL import Image
from omegaconf import OmegaConf

from lavis.models import load_model, load_preprocess
from lavis.common.registry import registry

import requests

from generate import generate

url = "https://iliad.stanford.edu/pg-vlm/example_images/ceramic_bowl.jpg"
example_image = Image.open(requests.get(url, stream=True).raw).convert("RGB")

vlm = load_model(
    name='blip2_t5_instruct',
    model_type='flant5xxl',
    checkpoint='pgvlm_weights.bin',  # replace with location of downloaded weights
    is_eval=True,
    device="cuda" if torch.cuda.is_available() else "cpu"
)

vlm.qformer_text_input = False  # Optionally disable qformer text

model_cls = registry.get_model_class('blip2_t5_instruct')
model_type = 'flant5xxl'
preprocess_cfg = OmegaConf.load(model_cls.default_config_path(model_type)).preprocess
vis_processors, _ = load_preprocess(preprocess_cfg)
processor = vis_processors["eval"]

question_samples = {
    'prompt': 'Question: Classify this object as transparent, translucent, or opaque? Respond unknown if you are not sure. Short answer:',
    'image': torch.stack([processor(example_image)], dim=0).to(vlm.device)
}

answers, scores = generate(vlm, question_samples, length_penalty=0, repetition_penalty=1, num_captions=3)
print(answers, scores)
# (['opaque', 'translucent', 'transparent'], tensor([-0.0448, -4.1387, -4.2793], device='cuda:0'))
```
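The README above also recommends handling continuous properties (mass, fragility, deformability) by asking yes-or-no questions and comparing the probability of the "yes" response between objects, and cropping images to the object of interest. The sketch below is a hypothetical illustration of that workflow, not a utility shipped with this commit: it reuses the `vlm` and `processor` objects constructed in test.py, and the image files, crop boxes, and `yes_log_prob` helper are placeholder assumptions.

```python
import torch
from PIL import Image

from generate import generate

# `vlm` and `processor` are assumed to be constructed exactly as in test.py above.

def yes_log_prob(vlm, processor, image, question):
    """Score a yes/no question about one image and return the log probability of a 'yes' answer."""
    samples = {
        'prompt': f'Question: {question} Short answer:',
        'image': torch.stack([processor(image)], dim=0).to(vlm.device),
    }
    answers, scores = generate(vlm, samples, length_penalty=0, repetition_penalty=1, num_captions=2)
    by_answer = {answer.strip().lower(): score.item() for answer, score in zip(answers, scores)}
    return by_answer.get('yes', float('-inf'))  # -inf if 'yes' is not among the returned candidates

# Placeholder images and crop boxes; crop to the object in question because
# PG-InstructBLIP is fine-tuned on object-centric data.
bowl = Image.open('ceramic_bowl.jpg').convert('RGB').crop((100, 100, 400, 400))
sponge = Image.open('sponge.jpg').convert('RGB').crop((50, 80, 300, 330))

question = 'Is this object heavy?'
if yes_log_prob(vlm, processor, bowl, question) > yes_log_prob(vlm, processor, sponge, question):
    print('PG-InstructBLIP considers the bowl heavier than the sponge.')
else:
    print('PG-InstructBLIP considers the sponge heavier than the bowl.')
```

Comparing the log probabilities of "yes" across objects is equivalent to comparing the probabilities themselves, since the exponential is monotonic.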