Spaces:

austinmw
/

instructblip-vicuna-13b-4bit-image-qa

Runtime error

App Files Files Community

austinmw committed on Jul 18, 2023

Commit

298d752

•

1 Parent(s): 611e172

Upload tool

Browse files

Files changed (4) hide show

app.py +4 -0
blip_tool.py +69 -0
requirements.txt +2 -0
tool_config.json +5 -0

app.py ADDED Viewed

	@@ -0,0 +1,4 @@

+from transformers import launch_gradio_demo
+from blip_tool import InstructBLIPImageQuestionAnsweringTool
+launch_gradio_demo(InstructBLIPImageQuestionAnsweringTool)  # Entry point: hand the tool class to the Gradio demo launcher imported above.

blip_tool.py ADDED Viewed

	@@ -0,0 +1,69 @@

+import torch
+from transformers import AutoModelForVision2Seq, AutoProcessor
+from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration
+from transformers.tools import PipelineTool
+from transformers.tools.base import get_default_device
+from transformers.utils import requires_backends
+class InstructBLIPImageQuestionAnsweringTool(PipelineTool):
+    #default_checkpoint = "Salesforce/blip2-opt-2.7b"
+    #default_checkpoint = "Salesforce/instructblip-flan-t5-xl"
+    #default_checkpoint = "Salesforce/instructblip-vicuna-7b"
+    default_checkpoint = "Salesforce/instructblip-vicuna-13b"
+    description = (
+        "This is a tool that answers a question about an image. It takes an input named `image` which should be the "
+        "image containing the information, as well as a `question` which should be the question in English. It "
+        "returns a text that is the answer to the question."
+    )
+    name = "image_qa"
+    pre_processor_class = AutoProcessor
+    model_class = AutoModelForVision2Seq
+    inputs = ["image", "text"]
+    outputs = ["text"]
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["vision"])
+        super().__init__(*args, **kwargs)
+    def setup(self):
+        """
+        Instantiates the `pre_processor`, `model` and `post_processor` if necessary.
+        """
+        if isinstance(self.pre_processor, str):
+            self.pre_processor = self.pre_processor_class.from_pretrained(self.pre_processor, **self.hub_kwargs)
+        if isinstance(self.model, str):
+            self.model = self.model_class.from_pretrained(self.model, **self.model_kwargs, **self.hub_kwargs, load_in_4bit=True, torch_dtype=torch.float16)
+        if self.post_processor is None:
+            self.post_processor = self.pre_processor
+        elif isinstance(self.post_processor, str):
+            self.post_processor = self.post_processor_class.from_pretrained(self.post_processor, **self.hub_kwargs)
+        if self.device is None:
+            if self.device_map is not None:
+                self.device = list(self.model.hf_device_map.values())[0]
+            else:
+                self.device = get_default_device()
+        self.is_initialized = True
+    def encode(self, image, question: str):
+        return self.pre_processor(images=image, text=question, return_tensors="pt").to(device="cuda", dtype=torch.float16)
+    def forward(self, inputs):
+        outputs = self.model.generate(
+            **inputs,
+            num_beams=5,
+            max_new_tokens=256,
+            min_length=1,
+            top_p=0.9,
+            repetition_penalty=1.5,
+            length_penalty=1.0,
+            temperature=0.7,
+        )
+        return outputs
+    def decode(self, outputs):
+        return self.pre_processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()

requirements.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ transformers
2	+ torch

tool_config.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+  "description": "This is a tool that answers a question about an image. It takes an input named `image` which should be the image containing the information, as well as a `question` which should be the question in English. It returns a text that is the answer to the question.",
+  "name": "image_qa",
+  "tool_class": "blip_tool.InstructBLIPImageQuestionAnsweringTool"
+}