ttengwang committed
Commit ff883a7 • 1 Parent(s): 73cef8f

update prompts for chatting, add duplicate icon
Files changed:
- app.py +88 -72
- caption_anything.py +1 -1
- captioner/base_captioner.py +5 -1
- captioner/blip2.py +10 -4
- segmenter/__init__.py +1 -4
- tools.py +16 -11
app.py
CHANGED
@@ -20,6 +20,7 @@ from segment_anything import sam_model_registry
 from text_refiner import build_text_refiner
 from segmenter import build_segmenter
 
+
 def download_checkpoint(url, folder, filename):
     os.makedirs(folder, exist_ok=True)
     filepath = os.path.join(folder, filename)
@@ -32,16 +33,11 @@ def download_checkpoint(url, folder, filename):
             f.write(chunk)
 
     return filepath
-checkpoint_url = "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth"
-folder = "segmenter"
-filename = "sam_vit_h_4b8939.pth"
-
-download_checkpoint(checkpoint_url, folder, filename)
 
 
-title = """<h1 align="center">Caption-Anything</h1>
-description = """Gradio demo for Caption Anything, image to dense captioning generation with various language styles. To use it, simply upload your image, or click one of the examples to load them. Code: https://github.com/ttengwang/Caption-Anything
+title = """<p><h1 align="center">Caption-Anything</h1></p>
 """
+description = """<p>Gradio demo for Caption Anything, image to dense captioning generation with various language styles. To use it, simply upload your image, or click one of the examples to load them. Code: https://github.com/ttengwang/Caption-Anything <a href="https://huggingface.co/spaces/TencentARC/Caption-Anything?duplicate=true"><img style="display: inline; margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space" /></a></p>"""
 
 examples = [
     ["test_img/img35.webp"],
@@ -53,7 +49,26 @@ examples = [
     ["test_img/img1.jpg"],
 ]
 
+seg_model_map = {
+    'base': 'vit_b',
+    'large': 'vit_l',
+    'huge': 'vit_h'
+}
+ckpt_url_map = {
+    'vit_b': 'https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth',
+    'vit_l': 'https://dl.fbaipublicfiles.com/segment_anything/sam_vit_l_0b3195.pth',
+    'vit_h': 'https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth'
+}
+
 args = parse_augment()
+
+checkpoint_url = ckpt_url_map[seg_model_map[args.segmenter]]
+folder = "segmenter"
+filename = os.path.basename(checkpoint_url)
+args.segmenter_checkpoint = os.path.join(folder, filename)
+
+download_checkpoint(checkpoint_url, folder, filename)
+
 # args.device = 'cuda:5'
 # args.disable_gpt = True
 # args.enable_reduce_tokens = False
@@ -61,7 +76,7 @@ args = parse_augment()
 # args.captioner = 'blip'
 # args.regular_box = True
 shared_captioner = build_captioner(args.captioner, args.device, args)
-shared_sam_model = sam_model_registry[
+shared_sam_model = sam_model_registry[seg_model_map[args.segmenter]](checkpoint=args.segmenter_checkpoint).to(args.device)
 
 
 def build_caption_anything_with_models(args, api_key="", captioner=None, sam_model=None, text_refiner=None, session_id=None):
@@ -102,7 +117,7 @@ def get_prompt(chat_input, click_state, click_mode):
         click_state[1] = labels
     else:
         raise NotImplementedError
-
+
     prompt = {
         "prompt_type":["click"],
         "input_point":click_state[0],
@@ -117,21 +132,21 @@ def update_click_state(click_state, caption, click_mode):
     elif click_mode == 'Single':
         click_state[2] = [caption]
     else:
-        raise NotImplementedError
+        raise NotImplementedError
 
 
-def chat_with_points(chat_input, click_state, chat_state, state, text_refiner):
+def chat_with_points(chat_input, click_state, chat_state, state, text_refiner, img_caption):
     if text_refiner is None:
         response = "Text refiner is not initilzed, please input openai api key."
         state = state + [(chat_input, response)]
         return state, state, chat_state
-
+
     points, labels, captions = click_state
     # point_chat_prompt = "I want you act as a chat bot in terms of image. I will give you some points (w, h) in the image and tell you what happed on the point in natural language. Note that (0, 0) refers to the top-left corner of the image, w refers to the width and h refers the height. You should chat with me based on the fact in the image instead of imagination. Now I tell you the points with their visual description:\n{points_with_caps}\nNow begin chatting!"
     suffix = '\nHuman: {chat_input}\nAI: '
     qa_template = '\nHuman: {q}\nAI: {a}'
     # # "The image is of width {width} and height {height}."
-    point_chat_prompt = "I am an AI trained to chat with you about an image
+    point_chat_prompt = "I am an AI trained to chat with you about an image. I am greate at what is going on in any image based on the image information your provide. The overall image description is \"{img_caption}\". You will also provide me objects in the image in details, i.e., their location and visual descriptions. Here are the locations and descriptions of events that happen in the image: {points_with_caps} \n Now, let's chat!"
     prev_visual_context = ""
     pos_points = []
     pos_captions = []
@@ -139,8 +154,8 @@ def chat_with_points(chat_input, click_state, chat_state, state, text_refiner):
         if labels[i] == 1:
             pos_points.append(f"({points[i][0]}, {points[i][0]})")
             pos_captions.append(captions[i])
-            prev_visual_context = prev_visual_context + '\n' + '
-
+            prev_visual_context = prev_visual_context + '\n' + 'There is an event described as \"{}\" locating at {}'.format(pos_captions[-1], ', '.join(pos_points))
+
     context_length_thres = 500
     prev_history = ""
     for i in range(len(chat_state)):
@@ -149,26 +164,25 @@ def chat_with_points(chat_input, click_state, chat_state, state, text_refiner):
             prev_history = prev_history + qa_template.format(**{"q": q, "a": a})
         else:
             break
-
-    chat_prompt = point_chat_prompt.format(**{"points_with_caps": prev_visual_context}) + prev_history + suffix.format(**{"chat_input": chat_input})
+    chat_prompt = point_chat_prompt.format(**{"img_caption":img_caption,"points_with_caps": prev_visual_context}) + prev_history + suffix.format(**{"chat_input": chat_input})
     print('\nchat_prompt: ', chat_prompt)
     response = text_refiner.llm(chat_prompt)
     state = state + [(chat_input, response)]
    chat_state = chat_state + [(chat_input, response)]
     return state, state, chat_state
 
-def inference_seg_cap(image_input, point_prompt, click_mode, enable_wiki, language, sentiment, factuality,
-
-
+def inference_seg_cap(image_input, point_prompt, click_mode, enable_wiki, language, sentiment, factuality,
+                      length, image_embedding, state, click_state, original_size, input_size, text_refiner, evt:gr.SelectData):
+
     model = build_caption_anything_with_models(
-        args,
+        args,
         api_key="",
         captioner=shared_captioner,
         sam_model=shared_sam_model,
         text_refiner=text_refiner,
         session_id=iface.app_id
     )
-
+
     model.segmenter.image_embedding = image_embedding
     model.segmenter.predictor.original_size = original_size
     model.segmenter.predictor.input_size = input_size
@@ -178,11 +192,11 @@ def inference_seg_cap(image_input, point_prompt, click_mode, enable_wiki, langua
         coordinate = "[[{}, {}, 1]]".format(str(evt.index[0]), str(evt.index[1]))
     else:
         coordinate = "[[{}, {}, 0]]".format(str(evt.index[0]), str(evt.index[1]))
-
+
     controls = {'length': length,
-
-
-
+                'sentiment': sentiment,
+                'factuality': factuality,
+                'language': language}
 
     # click_coordinate = "[[{}, {}, 1]]".format(str(evt.index[0]), str(evt.index[1]))
     # chat_input = click_coordinate
@@ -217,8 +231,7 @@ def inference_seg_cap(image_input, point_prompt, click_mode, enable_wiki, langua
     yield state, state, click_state, chat_input, refined_image_input, wiki
 
 
-def upload_callback(image_input, state):
-    state = [] + [(None, 'Image size: ' + str(image_input.size))]
+def upload_callback(image_input, state):
     chat_state = []
     click_state = [[], [], []]
     res = 1024
@@ -227,9 +240,9 @@ def upload_callback(image_input, state):
     if ratio < 1.0:
         image_input = image_input.resize((int(width * ratio), int(height * ratio)))
         print('Scaling input image to {}'.format(image_input.size))
-
+    state = [] + [(None, 'Image size: ' + str(image_input.size))]
     model = build_caption_anything_with_models(
-        args,
+        args,
         api_key="",
         captioner=shared_captioner,
         sam_model=shared_sam_model,
@@ -239,10 +252,11 @@ def upload_callback(image_input, state):
     image_embedding = model.segmenter.image_embedding
     original_size = model.segmenter.predictor.original_size
     input_size = model.segmenter.predictor.input_size
-
+    img_caption, _ = model.captioner.inference_seg(image_input)
+    return state, state, chat_state, image_input, click_state, image_input, image_embedding, original_size, input_size, img_caption
 
 with gr.Blocks(
-
+    css='''
     #image_upload{min-height:400px}
     #image_upload [data-testid="image"], #image_upload [data-testid="image"] > div{min-height: 600px}
     '''
@@ -255,6 +269,7 @@ with gr.Blocks(
     text_refiner = gr.State(None)
     original_size = gr.State(None)
     input_size = gr.State(None)
+    img_caption = gr.State(None)
 
     gr.Markdown(title)
     gr.Markdown(description)
@@ -281,13 +296,13 @@ with gr.Blocks(
                 clear_button_image = gr.Button(value="Clear Image", interactive=True)
             with gr.Column(visible=False) as modules_need_gpt:
                 with gr.Row(scale=1.0):
-
-
-
-
-
-
-
+                    language = gr.Dropdown(['English', 'Chinese', 'French', "Spanish", "Arabic", "Portuguese", "Cantonese"], value="English", label="Language", interactive=True)
+                    sentiment = gr.Radio(
+                        choices=["Positive", "Natural", "Negative"],
+                        value="Natural",
+                        label="Sentiment",
+                        interactive=True,
+                    )
                 with gr.Row(scale=1.0):
                     factuality = gr.Radio(
                         choices=["Factual", "Imagination"],
@@ -304,10 +319,10 @@ with gr.Blocks(
                         label="Generated Caption Length",
                     )
                     enable_wiki = gr.Radio(
-
-
-
-
+                        choices=["Yes", "No"],
+                        value="No",
+                        label="Enable Wiki",
+                        interactive=True)
             with gr.Column(visible=True) as modules_not_need_gpt3:
                 gr.Examples(
                     examples=examples,
@@ -332,11 +347,11 @@ with gr.Blocks(
             with gr.Row():
                 clear_button_text = gr.Button(value="Clear Text", interactive=True)
                 submit_button_text = gr.Button(value="Submit", interactive=True, variant="primary")
-
+
     openai_api_key.submit(init_openai_api_key, inputs=[openai_api_key], outputs=[modules_need_gpt,modules_need_gpt2, modules_need_gpt3, modules_not_need_gpt, modules_not_need_gpt2, modules_not_need_gpt3, text_refiner])
     enable_chatGPT_button.click(init_openai_api_key, inputs=[openai_api_key], outputs=[modules_need_gpt,modules_need_gpt2, modules_need_gpt3, modules_not_need_gpt, modules_not_need_gpt2, modules_not_need_gpt3, text_refiner])
     disable_chatGPT_button.click(init_openai_api_key, outputs=[modules_need_gpt,modules_need_gpt2, modules_need_gpt3, modules_not_need_gpt, modules_not_need_gpt2, modules_not_need_gpt3, text_refiner])
-
+
     clear_button_clike.click(
         lambda x: ([[], [], []], x, ""),
         [origin_image],
@@ -345,9 +360,9 @@ with gr.Blocks(
         show_progress=False
    )
     clear_button_image.click(
-        lambda: (None, [], [], [], [[], [], []], "", ""),
+        lambda: (None, [], [], [], [[], [], []], "", "", ""),
         [],
-        [image_input, chatbot, state, chat_state, click_state, wiki_output, origin_image],
+        [image_input, chatbot, state, chat_state, click_state, wiki_output, origin_image, img_caption],
         queue=False,
         show_progress=False
     )
@@ -359,37 +374,38 @@ with gr.Blocks(
         show_progress=False
     )
     image_input.clear(
-        lambda: (None, [], [], [], [[], [], []], "", ""),
+        lambda: (None, [], [], [], [[], [], []], "", "", ""),
         [],
-        [image_input, chatbot, state, chat_state, click_state, wiki_output, origin_image],
+        [image_input, chatbot, state, chat_state, click_state, wiki_output, origin_image, img_caption],
        queue=False,
         show_progress=False
     )
 
-    image_input.upload(upload_callback,[image_input, state], [chatbot, state, chat_state, origin_image, click_state, image_input, image_embedding, original_size, input_size])
-    chat_input.submit(chat_with_points, [chat_input, click_state, chat_state, state, text_refiner], [chatbot, state, chat_state])
-
+    image_input.upload(upload_callback,[image_input, state], [chatbot, state, chat_state, origin_image, click_state, image_input, image_embedding, original_size, input_size, img_caption])
+    chat_input.submit(chat_with_points, [chat_input, click_state, chat_state, state, text_refiner, img_caption], [chatbot, state, chat_state])
+    chat_input.submit(lambda: "", None, chat_input)
+    example_image.change(upload_callback,[example_image, state], [chatbot, state, chat_state, origin_image, click_state, image_input, image_embedding, original_size, input_size, img_caption])
 
     # select coordinate
-    image_input.select(inference_seg_cap,
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    image_input.select(inference_seg_cap,
+                       inputs=[
+                           origin_image,
+                           point_prompt,
+                           click_mode,
+                           enable_wiki,
+                           language,
+                           sentiment,
+                           factuality,
+                           length,
+                           image_embedding,
+                           state,
+                           click_state,
+                           original_size,
+                           input_size,
+                           text_refiner
+                       ],
+                       outputs=[chatbot, state, click_state, chat_input, image_input, wiki_output],
+                       show_progress=False, queue=True)
+
     iface.queue(concurrency_count=5, api_open=False, max_size=10)
     iface.launch(server_name="0.0.0.0", enable_queue=True)
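For reference, a minimal sketch of how the updated chat_with_points assembles its LLM prompt: the overall image caption plus one line per positively clicked point, followed by prior Q/A turns and the new user message. The click data, example caption, and the shortened prompt template below are illustrative, not values from the repo.

suffix = '\nHuman: {chat_input}\nAI: '
qa_template = '\nHuman: {q}\nAI: {a}'
# Condensed stand-in for the committed point_chat_prompt template.
point_chat_prompt = ('I am an AI trained to chat with you about an image. '
                     'The overall image description is "{img_caption}". '
                     'Here are the locations and descriptions of events that happen '
                     "in the image: {points_with_caps}\nNow, let's chat!")

# Hypothetical state: one positive click with its caption, one prior Q/A turn.
img_caption = 'a dog running on a beach'
points, labels, captions = [(120, 80)], [1], ['a brown dog in mid-stride']
chat_state = [('What animal is this?', 'It looks like a dog.')]

points_with_caps = ''
for (x, y), label, cap in zip(points, labels, captions):
    if label == 1:  # only positive clicks contribute visual context
        points_with_caps += '\nThere is an event described as "{}" locating at ({}, {})'.format(cap, x, y)

prev_history = ''.join(qa_template.format(q=q, a=a) for q, a in chat_state)
chat_prompt = (point_chat_prompt.format(img_caption=img_caption, points_with_caps=points_with_caps)
               + prev_history + suffix.format(chat_input='What is it doing?'))
print(chat_prompt)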
caption_anything.py
CHANGED
@@ -72,7 +72,7 @@ class CaptionAnything():
 def parse_augment():
     parser = argparse.ArgumentParser()
     parser.add_argument('--captioner', type=str, default="blip2")
-    parser.add_argument('--segmenter', type=str, default="
+    parser.add_argument('--segmenter', type=str, default="huge")
     parser.add_argument('--text_refiner', type=str, default="base")
     parser.add_argument('--segmenter_checkpoint', type=str, default="segmenter/sam_vit_h_4b8939.pth")
     parser.add_argument('--seg_crop_mode', type=str, default="wo_bg", choices=['wo_bg', 'w_bg'], help="whether to add or remove background of the image when captioning")
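A small sketch of how the new --segmenter default ("huge") feeds the checkpoint selection added in app.py above; the local folder layout and device string are assumptions.

import os

seg_model_map = {'base': 'vit_b', 'large': 'vit_l', 'huge': 'vit_h'}
ckpt_url_map = {
    'vit_b': 'https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth',
    'vit_l': 'https://dl.fbaipublicfiles.com/segment_anything/sam_vit_l_0b3195.pth',
    'vit_h': 'https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth',
}

segmenter = 'huge'                            # value of --segmenter (new default)
model_type = seg_model_map[segmenter]         # -> 'vit_h'
checkpoint_url = ckpt_url_map[model_type]
checkpoint_path = os.path.join('segmenter', os.path.basename(checkpoint_url))
print(model_type, checkpoint_path)

# In app.py the checkpoint is then downloaded and loaded roughly as:
#   download_checkpoint(checkpoint_url, 'segmenter', os.path.basename(checkpoint_url))
#   shared_sam_model = sam_model_registry[model_type](checkpoint=checkpoint_path).to(args.device)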
captioner/base_captioner.py
CHANGED
@@ -130,13 +130,17 @@ class BaseCaptioner:
         return caption, crop_save_path
 
 
-    def inference_seg(self, image: Union[np.ndarray, str], seg_mask: Union[np.ndarray, Image.Image, str], crop_mode="w_bg", filter=False, disable_regular_box = False):
+    def inference_seg(self, image: Union[np.ndarray, str], seg_mask: Union[np.ndarray, Image.Image, str]=None, crop_mode="w_bg", filter=False, disable_regular_box = False):
+        if seg_mask is None:
+            seg_mask = np.ones(image.size).astype(bool)
+
         if type(image) == str:
             image = Image.open(image)
         if type(seg_mask) == str:
             seg_mask = Image.open(seg_mask)
         elif type(seg_mask) == np.ndarray:
             seg_mask = Image.fromarray(seg_mask)
+
         seg_mask = seg_mask.resize(image.size)
         seg_mask = np.array(seg_mask) > 0
 
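The change above lets inference_seg run without an explicit mask, which is how upload_callback in app.py obtains a whole-image caption. A standalone sketch of that new default (names here are illustrative, not the repo's API):

import numpy as np
from PIL import Image

def full_image_mask(image: Image.Image) -> np.ndarray:
    # Mirrors the committed default: np.ones(image.size) marks every pixel as foreground.
    # Note that PIL's size is (width, height), so the array is transposed relative to a
    # numpy image of shape (height, width).
    return np.ones(image.size).astype(bool)

img = Image.new('RGB', (640, 480))
mask = full_image_mask(img)
print(mask.shape, mask.dtype)  # (640, 480) bool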
captioner/blip2.py
CHANGED
@@ -6,6 +6,8 @@ import pdb
 import cv2
 import numpy as np
 from typing import Union
+
+from tools import is_platform_win
 from .base_captioner import BaseCaptioner
 
 class BLIP2Captioner(BaseCaptioner):
@@ -15,14 +17,18 @@ class BLIP2Captioner(BaseCaptioner):
         self.dialogue = dialogue
         self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
         self.processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b")
-
+        if is_platform_win():
+            self.model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b", device_map="sequential", torch_dtype=self.torch_dtype)
+        else:
+            self.model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b", device_map='sequential', load_in_8bit=True)
+
     @torch.no_grad()
     def inference(self, image: Union[np.ndarray, Image.Image, str], filter=False):
         if type(image) == str: # input path
-
+            image = Image.open(image)
 
         if not self.dialogue:
-            text_prompt = '
+            text_prompt = 'Question: what does the image show? Answer:'
             inputs = self.processor(image, text = text_prompt, return_tensors="pt").to(self.device, self.torch_dtype)
             out = self.model.generate(**inputs, max_new_tokens=50)
             captions = self.processor.decode(out[0], skip_special_tokens=True).strip()
@@ -42,7 +48,7 @@ class BLIP2Captioner(BaseCaptioner):
             out = self.model.generate(**inputs, max_new_tokens=50)
             captions = self.processor.decode(out[0], skip_special_tokens=True).strip()
             context.append((input_texts, captions))
-
+
         return captions
 
 if __name__ == '__main__':
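A minimal, self-contained sketch of the single-image caption path used above, with the same prompt and generation settings as the committed code. The 8-bit branch in the diff needs bitsandbytes/accelerate, which is why Windows falls back to plain fp16 loading here; the example image path is an assumption taken from the repo's examples list.

import torch
from PIL import Image
from transformers import AutoProcessor, Blip2ForConditionalGeneration

device = 'cuda' if torch.cuda.is_available() else 'cpu'
dtype = torch.float16 if device == 'cuda' else torch.float32

processor = AutoProcessor.from_pretrained('Salesforce/blip2-opt-2.7b')
model = Blip2ForConditionalGeneration.from_pretrained('Salesforce/blip2-opt-2.7b', torch_dtype=dtype).to(device)

image = Image.open('test_img/img1.jpg')  # example image from the repo's examples list
text_prompt = 'Question: what does the image show? Answer:'
inputs = processor(image, text=text_prompt, return_tensors='pt').to(device, dtype)
out = model.generate(**inputs, max_new_tokens=50)
caption = processor.decode(out[0], skip_special_tokens=True).strip()
print(caption)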
segmenter/__init__.py
CHANGED
@@ -2,7 +2,4 @@ from segmenter.base_segmenter import BaseSegmenter
 
 
 def build_segmenter(type, device, args=None, model=None):
-
-        return BaseSegmenter(device, args.segmenter_checkpoint, reuse_feature=not args.disable_reuse_features, model=model)
-    else:
-        raise NotImplementedError()
+    return BaseSegmenter(device, args.segmenter_checkpoint, reuse_feature=not args.disable_reuse_features, model=model)
tools.py
CHANGED
@@ -4,6 +4,11 @@ import numpy as np
 from PIL import Image
 import copy
 import time
+import sys
+
+
+def is_platform_win():
+    return sys.platform == "win32"
 
 
 def colormap(rgb=True):
@@ -130,10 +135,10 @@ def vis_add_mask_wo_gaussian(image, background_mask, contour_mask, background_co
 
     for i in range(3):
         image[:, :, i] = image[:, :, i] * (1-background_alpha+background_mask*background_alpha) \
-
-
+            + background_color[i] * (background_alpha-background_mask*background_alpha)
+
         image[:, :, i] = image[:, :, i] * (1-contour_alpha+contour_mask*contour_alpha) \
-
+            + contour_color[i] * (contour_alpha-contour_mask*contour_alpha)
 
     return image.astype('uint8')
 
@@ -155,7 +160,7 @@ def mask_painter(input_image, input_mask, background_alpha=0.7, background_blur_
     assert input_image.shape[:2] == input_mask.shape, 'different shape'
     assert background_blur_radius % 2 * contour_width % 2 > 0, 'background_blur_radius and contour_width must be ODD'
 
-
+
     # 0: background, 1: foreground
     input_mask[input_mask>0] = 255
 
@@ -170,7 +175,7 @@ def mask_painter(input_image, input_mask, background_alpha=0.7, background_blur_
     painted_image = vis_add_mask(painted_image, 255-contour_mask, color_list[contour_color], contour_alpha, contour_width)
 
     # painted_image = background_dist_map
-
+
     return painted_image
 
 
@@ -257,10 +262,10 @@ def mask_painter_wo_gaussian(input_image, input_mask, background_alpha=0.5, back
     # downsample input image and mask
     width, height = input_image.shape[0], input_image.shape[1]
     res = 1024
-    ratio = min(1.0 * res / max(width, height), 1.0)
+    ratio = min(1.0 * res / max(width, height), 1.0)
     input_image = cv2.resize(input_image, (int(height*ratio), int(width*ratio)))
     input_mask = cv2.resize(input_mask, (int(height*ratio), int(width*ratio)))
-
+
     # 0: background, 1: foreground
     msk = np.clip(input_mask, 0, 1)
 
@@ -271,14 +276,14 @@ def mask_painter_wo_gaussian(input_image, input_mask, background_alpha=0.5, back
     background_mask, contour_mask = generator_dict[mode](msk, background_radius, contour_radius)
 
     # paint
-    painted_image = vis_add_mask_wo_gaussian\
+    painted_image = vis_add_mask_wo_gaussian \
         (input_image, background_mask, contour_mask, color_list[0], color_list[contour_color], background_alpha, contour_alpha) # black for background
 
     return painted_image
 
 
 if __name__ == '__main__':
-
+
     background_alpha = 0.7 # transparency of background 1: all black, 0: do nothing
     background_blur_radius = 31 # radius of background blur, must be odd number
     contour_width = 11 # contour width, must be odd number
@@ -288,14 +293,14 @@ if __name__ == '__main__':
     # load input image and mask
     input_image = np.array(Image.open('./test_img/painter_input_image.jpg').convert('RGB'))
     input_mask = np.array(Image.open('./test_img/painter_input_mask.jpg').convert('P'))
-
+
     # paint
     overall_time_1 = 0
     overall_time_2 = 0
     overall_time_3 = 0
     overall_time_4 = 0
     overall_time_5 = 0
-
+
     for i in range(50):
         t2 = time.time()
         painted_image_00 = mask_painter_wo_gaussian(input_image, input_mask, background_alpha, background_blur_radius, contour_width, contour_color, contour_alpha, mode='00')
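The two restored continuation lines in vis_add_mask_wo_gaussian implement a per-channel alpha blend: pixels inside the mask keep their value, pixels outside are blended toward the given color with weight alpha. A small numpy check of that formula (array values below are illustrative):

import numpy as np

def blend_channel(channel, mask, color, alpha):
    # Same expression as in vis_add_mask_wo_gaussian:
    # mask == 1 keeps the pixel, mask == 0 mixes (1 - alpha) * pixel + alpha * color.
    return channel * (1 - alpha + mask * alpha) + color * (alpha - mask * alpha)

channel = np.full((2, 2), 200.0)
mask = np.array([[1.0, 0.0], [0.0, 1.0]])
print(blend_channel(channel, mask, color=0.0, alpha=0.5))
# [[200. 100.]
#  [100. 200.]]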