Spaces: runtime error

Commit 10240e0 (parent: 36b57c3), committed by ttengwang

    update latest
Files changed:

- .gitignore                    +135 -0
- README.md                      +45 -11
- app.py                         +28 -23
- app_huggingface.py            +268 -0
- app_old.py                      +5 -5
- caption_anything.py           +114 -0
- captioner/base_captioner.py     +3 -2
- env.sh                          +1 -1
- image_editing_utils.py          +2 -2
- requirements.txt                +1 -0
- tools.py                        +7 -1

.gitignore
ADDED
@@ -0,0 +1,135 @@
+result/
+model_cache/
+*.pth
+teng_grad_start.sh
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+result/
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/

README.md
CHANGED
@@ -1,13 +1,47 @@
+# Caption-Anything
+<!-- ![](./Image/title.svg) -->
+**Caption-Anything** is a versatile image processing tool that combines the capabilities of [Segment Anything](https://github.com/facebookresearch/segment-anything), Visual Captioning, and [ChatGPT](https://openai.com/blog/chatgpt). Our solution generates descriptive captions for any object within an image, offering a range of language styles to accommodate diverse user preferences. **Caption-Anything** supports visual controls (mouse click) and language controls (length, sentiment, factuality, and language).
+* Visual controls and language controls for text generation
+* Chat about the selected object for detailed understanding
+* Interactive demo
+![](./Image/UI.png)
+
+<!-- <a src="https://img.shields.io/badge/%F0%9F%A4%97-Open%20in%20Spaces-blue" href="https://huggingface.co/spaces/wybertwang/Caption-Anything">
+<img src="https://img.shields.io/badge/%F0%9F%A4%97-Open%20in%20Spaces-blue" alt="Open in Spaces">
+</a> -->
+
+<!-- <a src="https://colab.research.google.com/assets/colab-badge.svg" href="">
+<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open in Colab">
+</a> -->
+
+### Demo
+Explore the interactive demo of Caption-Anything, which showcases its powerful capabilities in generating captions for various objects within an image. The demo allows users to control visual aspects by clicking on objects, as well as to adjust textual properties such as length, sentiment, factuality, and language.
+![](./Image/demo1.png)
+
 ---
-title: Caption Anything
-emoji: 📚
-colorFrom: green
-colorTo: green
-sdk: gradio
-sdk_version: 3.24.1
-app_file: app.py
-pinned: false
-license: apache-2.0
----
 
-
+![](./Image/demo2.png)
+
+### Getting Started
+
+
+* Clone the repository:
+```bash
+git clone https://github.com/ttengwang/caption-anything.git
+```
+* Install dependencies:
+```bash
+cd caption-anything
+pip install -r requirements.txt
+```
+* Download the [SAM checkpoint](https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth) and place it at `./segmenter/sam_vit_h_4b8939.pth`.
+
+* Run the Caption-Anything gradio demo.
+```bash
+# Configure the necessary ChatGPT APIs
+export OPENAI_API_KEY={Your_Private_Openai_Key}
+python app.py --regular_box --captioner blip2 --port 6086
+```
+
+## Acknowledgement
+The project is based on [Segment Anything](https://github.com/facebookresearch/segment-anything), BLIP/BLIP-2, and [ChatGPT](https://openai.com/blog/chatgpt). Thanks to the authors for their efforts.

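For reference, the new `caption_anything.py` module added in this commit can also be driven without the gradio UI. The snippet below is a minimal sketch distilled from that file's `__main__` block; the overrides mirror the ones the demo apps apply, and the image path and checkpoint location are assumptions taken from this commit that may differ in other setups.

```python
# Minimal sketch of programmatic use, adapted from caption_anything.py's __main__ block.
# Assumes this repo layout, a downloaded SAM checkpoint, and (optionally) OPENAI_API_KEY.
from caption_anything import CaptionAnything, parse_augment

args = parse_augment()            # CLI defaults: --captioner blip, --segmenter base, ...
args.captioner = 'blip2'          # same overrides the gradio apps apply
args.seg_crop_mode = 'wo_bg'
args.regular_box = True

model = CaptionAnything(args)

# A click prompt in SAM's format: one positive point at (500, 300).
prompt = {
    "prompt_type": ["click"],
    "input_point": [[500, 300]],
    "input_label": [1],
    "multimask_output": "True",
}
controls = {"length": "30", "sentiment": "positive",
            "imagination": "False", "language": "English"}

out = model.inference("test_img/img13.jpg", prompt, controls)
print(out["generated_captions"])  # raw caption, plus refined fields when GPT is enabled
print(out["mask_save_path"])      # mask image written under result/
```
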
app.py
CHANGED
@@ -2,12 +2,12 @@ from io import BytesIO
 import string
 import gradio as gr
 import requests
-from
+from caption_anything import CaptionAnything
 import torch
 import json
 import sys
 import argparse
-from
+from caption_anything import parse_augment
 import numpy as np
 import PIL.ImageDraw as ImageDraw
 from image_editing_utils import create_bubble_frame
@@ -47,6 +47,9 @@ examples = [
 ]
 
 args = parse_augment()
+args.captioner = 'blip2'
+args.seg_crop_mode = 'wo_bg'
+args.regular_box = True
 # args.device = 'cuda:5'
 # args.disable_gpt = False
 # args.enable_reduce_tokens = True
@@ -81,9 +84,9 @@ def chat_with_points(chat_input, click_state, state):
         return state, state
 
     points, labels, captions = click_state
-    point_chat_prompt = "I want you act as a chat bot in terms of image. I will give you some points (w, h) in the image and tell you what happed on the point in natural language. Note that (0, 0) refers to the top-left corner of the image, w refers to the width and h refers the height. You should chat with me based on the fact in the image instead of imagination. Now I tell you the points with their visual description:\n{points_with_caps}\nNow begin chatting! Human: {chat_input}\nAI: "
-    # "The image is of width {width} and height {height}."
-
+    # point_chat_prompt = "I want you act as a chat bot in terms of image. I will give you some points (w, h) in the image and tell you what happed on the point in natural language. Note that (0, 0) refers to the top-left corner of the image, w refers to the width and h refers the height. You should chat with me based on the fact in the image instead of imagination. Now I tell you the points with their visual description:\n{points_with_caps}\nNow begin chatting! Human: {chat_input}\nAI: "
+    # # "The image is of width {width} and height {height}."
+    point_chat_prompt = "a) Revised prompt: I am an AI trained to chat with you about an image based on specific points (w, h) you provide, along with their visual descriptions. Please note that (0, 0) refers to the top-left corner of the image, w refers to the width, and h refers to the height. Here are the points and their descriptions you've given me: {points_with_caps}. Now, let's chat! Human: {chat_input} AI:"
     prev_visual_context = ""
     pos_points = [f"{points[i][0]}, {points[i][1]}" for i in range(len(points)) if labels[i] == 1]
     if len(captions):
@@ -114,9 +117,10 @@ def inference_seg_cap(image_input, point_prompt, language, sentiment, factuality
 
     out = model.inference(image_input, prompt, controls)
     state = state + [(None, "Image point: {}, Input label: {}".format(prompt["input_point"], prompt["input_label"]))]
-    for k, v in out['generated_captions'].items():
-
-
+    # for k, v in out['generated_captions'].items():
+    #     state = state + [(f'{k}: {v}', None)]
+    state = state + [("caption: {}".format(out['generated_captions']['raw_caption']), None)]
+    wiki = out['generated_captions'].get('wiki', "")
     click_state[2].append(out['generated_captions']['raw_caption'])
 
     text = out['generated_captions']['raw_caption']
@@ -127,12 +131,13 @@ def inference_seg_cap(image_input, point_prompt, language, sentiment, factuality
     origin_image_input = image_input
     image_input = create_bubble_frame(image_input, text, (evt.index[0], evt.index[1]))
 
-    yield state, state, click_state, chat_input, image_input
+    yield state, state, click_state, chat_input, image_input, wiki
     if not args.disable_gpt and hasattr(model, "text_refiner"):
         refined_caption = model.text_refiner.inference(query=text, controls=controls, context=out['context_captions'])
-        new_cap = 'Original: ' + text + '. Refined: ' + refined_caption['caption']
+        # new_cap = 'Original: ' + text + '. Refined: ' + refined_caption['caption']
+        new_cap = refined_caption['caption']
         refined_image_input = create_bubble_frame(origin_image_input, new_cap, (evt.index[0], evt.index[1]))
-        yield state, state, click_state, chat_input, refined_image_input
+        yield state, state, click_state, chat_input, refined_image_input, wiki
 
 
 def upload_callback(image_input, state):
@@ -195,28 +200,29 @@ with gr.Blocks(
         with gr.Column(scale=0.5):
             openai_api_key = gr.Textbox(
                 placeholder="Input your openAI API key and press Enter",
-                show_label=
+                show_label=False,
                 label = "OpenAI API Key",
                 lines=1,
                 type="password"
             )
            openai_api_key.submit(init_openai_api_key, inputs=[openai_api_key])
-
+            wiki_output = gr.Textbox(lines=6, label="Wiki")
+            chatbot = gr.Chatbot(label="Chat about Selected Object",).style(height=450,scale=0.5)
             chat_input = gr.Textbox(lines=1, label="Chat Input")
             with gr.Row():
                 clear_button_text = gr.Button(value="Clear Text", interactive=True)
                 submit_button_text = gr.Button(value="Submit", interactive=True, variant="primary")
     clear_button_clike.click(
-        lambda x: ([[], [], []], x),
+        lambda x: ([[], [], []], x, ""),
         [origin_image],
-        [click_state, image_input],
+        [click_state, image_input, wiki_output],
         queue=False,
         show_progress=False
     )
     clear_button_image.click(
-        lambda: (None, [], [], [[], [], []]),
+        lambda: (None, [], [], [[], [], []], ""),
         [],
-        [image_input, chatbot, state, click_state],
+        [image_input, chatbot, state, click_state, wiki_output],
         queue=False,
         show_progress=False
     )
@@ -228,9 +234,9 @@ with gr.Blocks(
         show_progress=False
     )
     image_input.clear(
-        lambda: (None, [], [], [[], [], []]),
+        lambda: (None, [], [], [[], [], []], ""),
         [],
-        [image_input, chatbot, state, click_state],
+        [image_input, chatbot, state, click_state, wiki_output],
         queue=False,
         show_progress=False
     )
@@ -255,9 +261,8 @@ with gr.Blocks(
             state,
             click_state
         ],
-
-
-        show_progress=False, queue=True)
+        outputs=[chatbot, state, click_state, chat_input, image_input, wiki_output],
+        show_progress=False, queue=True)
 
 iface.queue(concurrency_count=5, api_open=False, max_size=10)
-iface.launch(server_name="0.0.0.0", enable_queue=True)
+iface.launch(server_name="0.0.0.0", enable_queue=True)

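The reworked `inference_seg_cap` handler is now a generator: it yields once with the immediate raw caption and then again with the GPT-refined caption (plus the new `wiki` textbox value), so the UI updates progressively. The sketch below shows only that yield pattern with hypothetical placeholder captions and component names; it is not the app's exact code.

```python
import gradio as gr

def describe(evt: gr.SelectData):
    # First yield: immediate, unrefined result so the UI feels responsive.
    raw = f"raw caption for point {evt.index}"   # stand-in for the BLIP-2 caption
    yield raw
    # Second yield: slower, refined result overwrites the first one.
    refined = raw.upper()                        # stand-in for the GPT-refined caption
    yield refined

with gr.Blocks() as demo:
    img = gr.Image()
    out = gr.Textbox(label="Caption")
    img.select(describe, inputs=[], outputs=[out], queue=True)

demo.queue()
demo.launch()
```
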
app_huggingface.py
ADDED
@@ -0,0 +1,268 @@
+from io import BytesIO
+import string
+import gradio as gr
+import requests
+from caption_anything import CaptionAnything
+import torch
+import json
+import sys
+import argparse
+from caption_anything import parse_augment
+import numpy as np
+import PIL.ImageDraw as ImageDraw
+from image_editing_utils import create_bubble_frame
+import copy
+from tools import mask_painter
+from PIL import Image
+import os
+
+def download_checkpoint(url, folder, filename):
+    os.makedirs(folder, exist_ok=True)
+    filepath = os.path.join(folder, filename)
+
+    if not os.path.exists(filepath):
+        response = requests.get(url, stream=True)
+        with open(filepath, "wb") as f:
+            for chunk in response.iter_content(chunk_size=8192):
+                if chunk:
+                    f.write(chunk)
+
+    return filepath
+checkpoint_url = "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth"
+folder = "segmenter"
+filename = "sam_vit_h_4b8939.pth"
+
+download_checkpoint(checkpoint_url, folder, filename)
+
+
+title = """<h1 align="center">Caption-Anything</h1>"""
+description = """Gradio demo for Caption Anything, image to dense captioning generation with various language styles. To use it, simply upload your image, or click one of the examples to load them. Code: https://github.com/ttengwang/Caption-Anything
+"""
+
+examples = [
+    ["test_img/img2.jpg"],
+    ["test_img/img5.jpg"],
+    ["test_img/img12.jpg"],
+    ["test_img/img14.jpg"],
+]
+
+args = parse_augment()
+args.captioner = 'blip2'
+args.seg_crop_mode = 'wo_bg'
+args.regular_box = True
+# args.device = 'cuda:5'
+# args.disable_gpt = False
+# args.enable_reduce_tokens = True
+# args.port=20322
+model = CaptionAnything(args)
+
+def init_openai_api_key(api_key):
+    os.environ['OPENAI_API_KEY'] = api_key
+    model.init_refiner()
+
+
+def get_prompt(chat_input, click_state):
+    points = click_state[0]
+    labels = click_state[1]
+    inputs = json.loads(chat_input)
+    for input in inputs:
+        points.append(input[:2])
+        labels.append(input[2])
+
+    prompt = {
+        "prompt_type":["click"],
+        "input_point":points,
+        "input_label":labels,
+        "multimask_output":"True",
+    }
+    return prompt
+
+def chat_with_points(chat_input, click_state, state):
+    if not hasattr(model, "text_refiner"):
+        response = "Text refiner is not initilzed, please input openai api key."
+        state = state + [(chat_input, response)]
+        return state, state
+
+    points, labels, captions = click_state
+    # point_chat_prompt = "I want you act as a chat bot in terms of image. I will give you some points (w, h) in the image and tell you what happed on the point in natural language. Note that (0, 0) refers to the top-left corner of the image, w refers to the width and h refers the height. You should chat with me based on the fact in the image instead of imagination. Now I tell you the points with their visual description:\n{points_with_caps}\nNow begin chatting! Human: {chat_input}\nAI: "
+    # # "The image is of width {width} and height {height}."
+    point_chat_prompt = "a) Revised prompt: I am an AI trained to chat with you about an image based on specific points (w, h) you provide, along with their visual descriptions. Please note that (0, 0) refers to the top-left corner of the image, w refers to the width, and h refers to the height. Here are the points and their descriptions you've given me: {points_with_caps}. Now, let's chat! Human: {chat_input} AI:"
+    prev_visual_context = ""
+    pos_points = [f"{points[i][0]}, {points[i][1]}" for i in range(len(points)) if labels[i] == 1]
+    if len(captions):
+        prev_visual_context = ', '.join(pos_points) + captions[-1] + '\n'
+    else:
+        prev_visual_context = 'no point exists.'
+    chat_prompt = point_chat_prompt.format(**{"points_with_caps": prev_visual_context, "chat_input": chat_input})
+    response = model.text_refiner.llm(chat_prompt)
+    state = state + [(chat_input, response)]
+    return state, state
+
+def inference_seg_cap(image_input, point_prompt, language, sentiment, factuality, length, state, click_state, evt:gr.SelectData):
+
+    if point_prompt == 'Positive':
+        coordinate = "[[{}, {}, 1]]".format(str(evt.index[0]), str(evt.index[1]))
+    else:
+        coordinate = "[[{}, {}, 0]]".format(str(evt.index[0]), str(evt.index[1]))
+
+    controls = {'length': length,
+                'sentiment': sentiment,
+                'factuality': factuality,
+                'language': language}
+
+    # click_coordinate = "[[{}, {}, 1]]".format(str(evt.index[0]), str(evt.index[1]))
+    # chat_input = click_coordinate
+    prompt = get_prompt(coordinate, click_state)
+    print('prompt: ', prompt, 'controls: ', controls)
+
+    out = model.inference(image_input, prompt, controls)
+    state = state + [(None, "Image point: {}, Input label: {}".format(prompt["input_point"], prompt["input_label"]))]
+    # for k, v in out['generated_captions'].items():
+    #     state = state + [(f'{k}: {v}', None)]
+    state = state + [("caption: {}".format(out['generated_captions']['raw_caption']), None)]
+    wiki = out['generated_captions'].get('wiki', "")
+    click_state[2].append(out['generated_captions']['raw_caption'])
+
+    text = out['generated_captions']['raw_caption']
+    # draw = ImageDraw.Draw(image_input)
+    # draw.text((evt.index[0], evt.index[1]), text, textcolor=(0,0,255), text_size=120)
+    input_mask = np.array(Image.open(out['mask_save_path']).convert('P'))
+    image_input = mask_painter(np.array(image_input), input_mask)
+    origin_image_input = image_input
+    image_input = create_bubble_frame(image_input, text, (evt.index[0], evt.index[1]))
+
+    yield state, state, click_state, chat_input, image_input, wiki
+    if not args.disable_gpt and hasattr(model, "text_refiner"):
+        refined_caption = model.text_refiner.inference(query=text, controls=controls, context=out['context_captions'])
+        # new_cap = 'Original: ' + text + '. Refined: ' + refined_caption['caption']
+        new_cap = refined_caption['caption']
+        refined_image_input = create_bubble_frame(origin_image_input, new_cap, (evt.index[0], evt.index[1]))
+        yield state, state, click_state, chat_input, refined_image_input, wiki
+
+
+def upload_callback(image_input, state):
+    state = [] + [('Image size: ' + str(image_input.size), None)]
+    click_state = [[], [], []]
+    model.segmenter.image = None
+    model.segmenter.image_embedding = None
+    model.segmenter.set_image(image_input)
+    return state, image_input, click_state
+
+with gr.Blocks(
+    css='''
+    #image_upload{min-height:400px}
+    #image_upload [data-testid="image"], #image_upload [data-testid="image"] > div{min-height: 600px}
+    '''
+) as iface:
+    state = gr.State([])
+    click_state = gr.State([[],[],[]])
+    origin_image = gr.State(None)
+
+    gr.Markdown(title)
+    gr.Markdown(description)
+
+    with gr.Row():
+        with gr.Column(scale=1.0):
+            image_input = gr.Image(type="pil", interactive=True, elem_id="image_upload")
+            with gr.Row(scale=1.0):
+                point_prompt = gr.Radio(
+                    choices=["Positive", "Negative"],
+                    value="Positive",
+                    label="Point Prompt",
+                    interactive=True)
+                clear_button_clike = gr.Button(value="Clear Clicks", interactive=True)
+                clear_button_image = gr.Button(value="Clear Image", interactive=True)
+            with gr.Row(scale=1.0):
+                language = gr.Dropdown(['English', 'Chinese', 'French', "Spanish", "Arabic", "Portuguese", "Cantonese"], value="English", label="Language", interactive=True)
+
+                sentiment = gr.Radio(
+                    choices=["Positive", "Natural", "Negative"],
+                    value="Natural",
+                    label="Sentiment",
+                    interactive=True,
+                )
+            with gr.Row(scale=1.0):
+                factuality = gr.Radio(
+                    choices=["Factual", "Imagination"],
+                    value="Factual",
+                    label="Factuality",
+                    interactive=True,
+                )
+                length = gr.Slider(
+                    minimum=10,
+                    maximum=80,
+                    value=10,
+                    step=1,
+                    interactive=True,
+                    label="Length",
+                )
+
+        with gr.Column(scale=0.5):
+            openai_api_key = gr.Textbox(
+                placeholder="Input your openAI API key and press Enter",
+                show_label=False,
+                label = "OpenAI API Key",
+                lines=1,
+                type="password"
+            )
+            openai_api_key.submit(init_openai_api_key, inputs=[openai_api_key])
+            wiki_output = gr.Textbox(lines=6, label="Wiki")
+            chatbot = gr.Chatbot(label="Chat about Selected Object",).style(height=450,scale=0.5)
+            chat_input = gr.Textbox(lines=1, label="Chat Input")
+            with gr.Row():
+                clear_button_text = gr.Button(value="Clear Text", interactive=True)
+                submit_button_text = gr.Button(value="Submit", interactive=True, variant="primary")
+    clear_button_clike.click(
+        lambda x: ([[], [], []], x, ""),
+        [origin_image],
+        [click_state, image_input, wiki_output],
+        queue=False,
+        show_progress=False
+    )
+    clear_button_image.click(
+        lambda: (None, [], [], [[], [], []], ""),
+        [],
+        [image_input, chatbot, state, click_state, wiki_output],
+        queue=False,
+        show_progress=False
+    )
+    clear_button_text.click(
+        lambda: ([], [], [[], [], []]),
+        [],
+        [chatbot, state, click_state],
+        queue=False,
+        show_progress=False
+    )
+    image_input.clear(
+        lambda: (None, [], [], [[], [], []], ""),
+        [],
+        [image_input, chatbot, state, click_state, wiki_output],
+        queue=False,
+        show_progress=False
+    )
+
+    examples = gr.Examples(
+        examples=examples,
+        inputs=[image_input],
+    )
+
+    image_input.upload(upload_callback,[image_input, state], [state, origin_image, click_state])
+    chat_input.submit(chat_with_points, [chat_input, click_state, state], [chatbot, state])
+
+    # select coordinate
+    image_input.select(inference_seg_cap,
+        inputs=[
+            origin_image,
+            point_prompt,
+            language,
+            sentiment,
+            factuality,
+            length,
+            state,
+            click_state
+        ],
+        outputs=[chatbot, state, click_state, chat_input, image_input, wiki_output],
+        show_progress=False, queue=True)
+
+iface.queue(concurrency_count=1, api_open=False, max_size=10)
+iface.launch(server_name="0.0.0.0", enable_queue=True)

app_old.py
CHANGED
@@ -2,12 +2,12 @@ from io import BytesIO
 import string
 import gradio as gr
 import requests
-from
+from caption_anything import CaptionAnything
 import torch
 import json
 import sys
 import argparse
-from
+from caption_anything import parse_augment
 import os
 
 # download sam checkpoint if not downloaded
@@ -83,12 +83,12 @@ def get_select_coords(image_input, point_prompt, language, sentiment, factuality
     else:
         coordinate = "[[{}, {}, 0]]".format(str(evt.index[0]), str(evt.index[1]))
     return (coordinate,) + inference_seg_cap(image_input, coordinate, language, sentiment, factuality, length, state, click_state)
-
+
 def chat_with_points(chat_input, click_state, state):
     points, labels, captions = click_state
-    point_chat_prompt = "I want you act as a chat bot in terms of image. I will give you some points (w, h) in the image and tell you what happed on the point in natural language. Note that (0, 0) refers to the top-left corner of the image, w refers to the width and h refers the height. You should chat with me based on the fact in the image instead of imagination. Now I tell you the points with their visual description:\n{points_with_caps}\n. Now begin chatting! Human: {chat_input}\nAI: "
+    # point_chat_prompt = "I want you act as a chat bot in terms of image. I will give you some points (w, h) in the image and tell you what happed on the point in natural language. Note that (0, 0) refers to the top-left corner of the image, w refers to the width and h refers the height. You should chat with me based on the fact in the image instead of imagination. Now I tell you the points with their visual description:\n{points_with_caps}\n. Now begin chatting! Human: {chat_input}\nAI: "
     # "The image is of width {width} and height {height}."
-
+    point_chat_prompt = "a) Revised prompt: I am an AI trained to chat with you about an image based on specific points (w, h) you provide, along with their visual descriptions. Please note that (0, 0) refers to the top-left corner of the image, w refers to the width, and h refers to the height. Here are the points and their descriptions you've given me: {points_with_caps}. Now, let's chat! Human: {chat_input} AI:"
     prev_visual_context = ""
     pos_points = [f"{points[i][0]}, {points[i][1]}" for i in range(len(points)) if labels[i] == 1]
     prev_visual_context = ', '.join(pos_points) + captions[-1] + '\n'

caption_anything.py
ADDED
@@ -0,0 +1,114 @@
+from captioner import build_captioner, BaseCaptioner
+from segmenter import build_segmenter
+from text_refiner import build_text_refiner
+import os
+import argparse
+import pdb
+import time
+from PIL import Image
+
+class CaptionAnything():
+    def __init__(self, args):
+        self.args = args
+        self.captioner = build_captioner(args.captioner, args.device, args)
+        self.segmenter = build_segmenter(args.segmenter, args.device, args)
+        if not args.disable_gpt:
+            self.init_refiner()
+
+
+    def init_refiner(self):
+        if os.environ.get('OPENAI_API_KEY', None):
+            self.text_refiner = build_text_refiner(self.args.text_refiner, self.args.device, self.args)
+
+    def inference(self, image, prompt, controls, disable_gpt=False):
+        # segment with prompt
+        print("CA prompt: ", prompt, "CA controls",controls)
+        seg_mask = self.segmenter.inference(image, prompt)[0, ...]
+        mask_save_path = f'result/mask_{time.time()}.png'
+        if not os.path.exists(os.path.dirname(mask_save_path)):
+            os.makedirs(os.path.dirname(mask_save_path))
+        new_p = Image.fromarray(seg_mask.astype('int') * 255.)
+        if new_p.mode != 'RGB':
+            new_p = new_p.convert('RGB')
+        new_p.save(mask_save_path)
+        print('seg_mask path: ', mask_save_path)
+        print("seg_mask.shape: ", seg_mask.shape)
+        # captioning with mask
+        if self.args.enable_reduce_tokens:
+            caption, crop_save_path = self.captioner.inference_with_reduced_tokens(image, seg_mask, crop_mode=self.args.seg_crop_mode, filter=self.args.clip_filter, regular_box = self.args.regular_box)
+        else:
+            caption, crop_save_path = self.captioner.inference_seg(image, seg_mask, crop_mode=self.args.seg_crop_mode, filter=self.args.clip_filter, regular_box = self.args.regular_box)
+        # refining with TextRefiner
+        context_captions = []
+        if self.args.context_captions:
+            context_captions.append(self.captioner.inference(image))
+        if not disable_gpt and hasattr(self, "text_refiner"):
+            refined_caption = self.text_refiner.inference(query=caption, controls=controls, context=context_captions)
+        else:
+            refined_caption = {'raw_caption': caption}
+        out = {'generated_captions': refined_caption,
+               'crop_save_path': crop_save_path,
+               'mask_save_path': mask_save_path,
+               'context_captions': context_captions}
+        return out
+
+def parse_augment():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--captioner', type=str, default="blip")
+    parser.add_argument('--segmenter', type=str, default="base")
+    parser.add_argument('--text_refiner', type=str, default="base")
+    parser.add_argument('--segmenter_checkpoint', type=str, default="segmenter/sam_vit_h_4b8939.pth")
+    parser.add_argument('--seg_crop_mode', type=str, default="w_bg", choices=['wo_bg', 'w_bg'], help="whether to add or remove background of the image when captioning")
+    parser.add_argument('--clip_filter', action="store_true", help="use clip to filter bad captions")
+    parser.add_argument('--context_captions', action="store_true", help="use surrounding captions to enhance current caption (TODO)")
+    parser.add_argument('--regular_box', action="store_true", default = False, help="crop image with a regular box")
+    parser.add_argument('--device', type=str, default="cuda:0")
+    parser.add_argument('--port', type=int, default=6086, help="only useful when running gradio applications")
+    parser.add_argument('--debug', action="store_true")
+    parser.add_argument('--gradio_share', action="store_true")
+    parser.add_argument('--disable_gpt', action="store_true")
+    parser.add_argument('--enable_reduce_tokens', action="store_true", default=False)
+    parser.add_argument('--disable_reuse_features', action="store_true", default=False)
+    args = parser.parse_args()
+
+    if args.debug:
+        print(args)
+    return args
+
+if __name__ == "__main__":
+    args = parse_augment()
+    # image_path = 'test_img/img3.jpg'
+    image_path = 'test_img/img13.jpg'
+    prompts = [
+        {
+            "prompt_type":["click"],
+            "input_point":[[500, 300], [1000, 500]],
+            "input_label":[1, 0],
+            "multimask_output":"True",
+        },
+        {
+            "prompt_type":["click"],
+            "input_point":[[900, 800]],
+            "input_label":[1],
+            "multimask_output":"True",
+        }
+    ]
+    controls = {
+        "length": "30",
+        "sentiment": "positive",
+        # "imagination": "True",
+        "imagination": "False",
+        "language": "English",
+    }
+
+    model = CaptionAnything(args)
+    for prompt in prompts:
+        print('*'*30)
+        print('Image path: ', image_path)
+        image = Image.open(image_path)
+        print(image)
+        print('Visual controls (SAM prompt):\n', prompt)
+        print('Language controls:\n', controls)
+        out = model.inference(image_path, prompt, controls)
+
+
captioner/base_captioner.py
CHANGED
@@ -146,7 +146,8 @@ class BaseCaptioner:
         seg_mask = np.array(seg_mask) > 0
 
         if crop_mode=="wo_bg":
-            image = np.array(image) * seg_mask[:,:,np.newaxis]
+            image = np.array(image) * seg_mask[:,:,np.newaxis] + (1 - seg_mask[:,:,np.newaxis]) * 255
+            image = np.uint8(image)
         else:
             image = np.array(image)
 
@@ -168,7 +169,7 @@ class BaseCaptioner:
         seg_mask = np.array(seg_mask) > 0
 
         if crop_mode=="wo_bg":
-            image = np.array(image) * seg_mask[:,:,np.newaxis]
+            image = np.array(image) * seg_mask[:,:,np.newaxis] + (1- seg_mask[:,:,np.newaxis]) * 255
         else:
             image = np.array(image)
 
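The `wo_bg` crop mode now fills background pixels with white (255) instead of leaving them at zero. A small self-contained sketch of the same masking arithmetic on dummy data, for illustration only:

```python
import numpy as np

# Dummy 4x4 RGB image and a boolean mask selecting the top-left 2x2 block.
image = np.full((4, 4, 3), 100, dtype=np.uint8)
seg_mask = np.zeros((4, 4), dtype=bool)
seg_mask[:2, :2] = True

# Old behaviour: background pixels become 0 (black).
black_bg = np.array(image) * seg_mask[:, :, np.newaxis]

# New behaviour in this commit: background pixels become 255 (white).
white_bg = np.array(image) * seg_mask[:, :, np.newaxis] + (1 - seg_mask[:, :, np.newaxis]) * 255
white_bg = np.uint8(white_bg)

print(black_bg[3, 3], white_bg[3, 3])  # [0 0 0] [255 255 255]
```
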
env.sh
CHANGED
@@ -1,6 +1,6 @@
 conda create -n caption_anything python=3.8 -y
 source activate caption_anything
-pip install -r
+pip install -r requirements.txt
 cd segmenter
 wget https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth
 
image_editing_utils.py
CHANGED
@@ -17,7 +17,7 @@ def wrap_text(text, font, max_width):
         lines.append(current_line)
     return lines
 
-def create_bubble_frame(image, text, point, font_path='DejaVuSansCondensed-Bold.ttf', font_size_ratio=0.
+def create_bubble_frame(image, text, point, font_path='DejaVuSansCondensed-Bold.ttf', font_size_ratio=0.025):
     # Load the image
     if type(image) == np.ndarray:
         image = Image.fromarray(image)
@@ -27,7 +27,7 @@ def create_bubble_frame(image, text, point, font_path='DejaVuSansCondensed-Bold.
 
     # Calculate max_text_width and font_size based on image dimensions and total number of characters
     total_chars = len(text)
-    max_text_width = int(0.
+    max_text_width = int(0.4 * width)
     font_size = int(height * font_size_ratio)
 
     # Load the font
requirements.txt
CHANGED
@@ -16,3 +16,4 @@ matplotlib
 onnxruntime
 onnx
 https://gradio-builds.s3.amazonaws.com/3e68e5e882a6790ac5b457bd33f4edf9b695af90/gradio-3.24.1-py3-none-any.whl
+accelerate

tools.py
CHANGED
@@ -1,6 +1,7 @@
 import cv2
 import numpy as np
 from PIL import Image
+import copy
 
 
 def colormap(rgb=True):
@@ -145,6 +146,11 @@ def mask_painter(input_image, input_mask, background_alpha=0.7, background_blur_
     assert input_image.shape[:2] == input_mask.shape, 'different shape'
     assert background_blur_radius % 2 * contour_width % 2 > 0, 'background_blur_radius and contour_width must be ODD'
 
+    width, height = input_image.shape[0], input_image.shape[1]
+    res = 1024
+    ratio = min(1.0 * res / max(width, height), 1.0)
+    input_image = cv2.resize(input_image, (int(height*ratio), int(width*ratio)))
+    input_mask = cv2.resize(input_mask, (int(height*ratio), int(width*ratio)))
     # 0: background, 1: foreground
     input_mask[input_mask>0] = 255
 
@@ -157,7 +163,7 @@ def mask_painter(input_image, input_mask, background_alpha=0.7, background_blur_
     kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (contour_width, contour_width))
     contour_mask = cv2.dilate(contour_mask, kernel)
     painted_image = vis_add_mask(painted_image, 255-contour_mask, color_list[contour_color], contour_alpha, contour_width)
-
+    painted_image = cv2.resize(painted_image, (height, width))
     return painted_image
 
 
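`mask_painter` now works on a copy downscaled to at most 1024 px on the longer side and resizes the painted result back to the original size at the end, which bounds the cost of the blur and contour operations. A rough sketch of that resize bookkeeping, using a hypothetical helper and the same ratio formula as the diff:

```python
import cv2
import numpy as np

def bounded_resize(image: np.ndarray, mask: np.ndarray, res: int = 1024):
    """Downscale image and mask so the longer side is at most `res` pixels."""
    h, w = image.shape[:2]                      # the diff names shape[0]/shape[1] width/height; values are the same
    ratio = min(1.0 * res / max(h, w), 1.0)     # never upscale
    new_size = (int(w * ratio), int(h * ratio)) # cv2.resize expects (width, height)
    return cv2.resize(image, new_size), cv2.resize(mask, new_size), (w, h)

img = np.zeros((2048, 1536, 3), dtype=np.uint8)
msk = np.zeros((2048, 1536), dtype=np.uint8)
small_img, small_msk, orig = bounded_resize(img, msk)
print(small_img.shape)                          # (1024, 768, 3)
# ... paint on the small copies, then go back to the original resolution:
painted = cv2.resize(small_img, orig)           # orig == (1536, 2048) in (width, height) order
print(painted.shape)                            # (2048, 1536, 3)
```
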