Spaces:

nielsr
/

comparing-captioning-models

Running

App Files Files Community

nielsr HF staff

hysts HF staff committed on Oct 23, 2023

Commit

c3bc979

•

1 Parent(s): 6a931bc

Use gradio_client (#49)

Browse files

- Use gradio_client (3cb35e062f030ebbc9622f243e198c7a19408c03)

Co-authored-by: hysts <[email protected]>

Files changed (4) hide show

README.md +1 -1
app.py +166 -81
requirements.txt +1 -5
style.css +4 -0

README.md CHANGED Viewed

@@ -4,7 +4,7 @@ emoji: 🔥
 colorFrom: yellow
 colorTo: pink
 sdk: gradio
-sdk_version: 3.15.0
 app_file: app.py
 pinned: false
 ---

 colorFrom: yellow
 colorTo: pink
 sdk: gradio
+sdk_version: 3.50.2
 app_file: app.py
 pinned: false
 ---

app.py CHANGED Viewed

@@ -1,83 +1,168 @@
-import gradio as gr
-from transformers import AutoProcessor, AutoTokenizer, AutoImageProcessor, AutoModelForCausalLM, BlipForConditionalGeneration, Blip2ForConditionalGeneration, VisionEncoderDecoderModel, InstructBlipForConditionalGeneration
-import torch
-import open_clip
-from huggingface_hub import hf_hub_download
-device = "cuda" if torch.cuda.is_available() else "cpu"
-torch.hub.download_url_to_file('http://images.cocodataset.org/val2017/000000039769.jpg', 'cats.jpg')
-torch.hub.download_url_to_file('https://huggingface.co/datasets/nielsr/textcaps-sample/resolve/main/stop_sign.png', 'stop_sign.png')
-torch.hub.download_url_to_file('https://cdn.openai.com/dall-e-2/demos/text2im/astronaut/horse/photo/0.jpg', 'astronaut.jpg')
-git_processor_large_coco = AutoProcessor.from_pretrained("microsoft/git-large-coco")
-git_model_large_coco = AutoModelForCausalLM.from_pretrained("microsoft/git-large-coco").to(device)
-blip_processor_large = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
-blip_model_large = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to(device)
-blip2_processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-6.7b-coco")
-blip2_model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-6.7b-coco", device_map="auto", load_in_4bit=True, torch_dtype=torch.float16)
-instructblip_processor = AutoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")
-instructblip_model = InstructBlipForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b", device_map="auto", load_in_4bit=True, torch_dtype=torch.float16)
-def generate_caption(processor, model, image, tokenizer=None, use_float_16=False):
-    inputs = processor(images=image, return_tensors="pt").to(device)
-    if use_float_16:
-        inputs = inputs.to(torch.float16)
-    generated_ids = model.generate(pixel_values=inputs.pixel_values, num_beams=3, max_length=20, min_length=5)
-    if tokenizer is not None:
-        generated_caption = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
-    else:
-        generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-    return generated_caption
-def generate_caption_blip2(processor, model, image, replace_token=False):
-    prompt = "A photo of"
-    inputs = processor(images=image, text=prompt, return_tensors="pt").to(device=model.device, dtype=torch.float16)
-    generated_ids = model.generate(**inputs,
-                                   num_beams=5, max_length=50, min_length=1, top_p=0.9,
-                                   repetition_penalty=1.5, length_penalty=1.0, temperature=1)
-    if replace_token:
-        # TODO remove once https://github.com/huggingface/transformers/pull/24492 is merged
-        generated_ids[generated_ids == 0] = 2
-    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-def generate_captions(image):
-    caption_git_large_coco = generate_caption(git_processor_large_coco, git_model_large_coco, image)
-    caption_blip_large = generate_caption(blip_processor_large, blip_model_large, image)
-    caption_blip2 = generate_caption_blip2(blip2_processor, blip2_model, image).strip()
-    caption_instructblip = generate_caption_blip2(instructblip_processor, instructblip_model, image, replace_token=True)
-    return caption_git_large_coco, caption_blip_large, caption_blip2, caption_instructblip
-examples = [["cats.jpg"], ["stop_sign.png"], ["astronaut.jpg"]]
-outputs = [gr.outputs.Textbox(label="Caption generated by GIT-large fine-tuned on COCO"), gr.outputs.Textbox(label="Caption generated by BLIP-large"), gr.outputs.Textbox(label="Caption generated by BLIP-2 OPT 6.7b"), gr.outputs.Textbox(label="Caption generated by InstructBLIP"), ]
-title = "Interactive demo: comparing image captioning models"
-description = "Gradio Demo to compare GIT, BLIP, BLIP-2 and InstructBLIP, 4 state-of-the-art vision+language models. To use it, simply upload your image and click 'submit', or click one of the examples to load them. Read more at the links below."
-article = "<p style='text-align: center'><a href='https://huggingface.co/docs/transformers/main/model_doc/blip' target='_blank'>BLIP docs</a> | <a href='https://huggingface.co/docs/transformers/main/model_doc/git' target='_blank'>GIT docs</a></p>"
-interface = gr.Interface(fn=generate_captions,
-                         inputs=gr.inputs.Image(type="pil"),
-                         outputs=outputs,
-                         examples=examples,
-                         title=title,
-                         description=description,
-                         article=article,
-                         enable_queue=True)
-interface.launch(debug=True)

+from __future__ import annotations
+import os
+import gradio as gr
+import torch
+from gradio_client import Client
+# Markdown heading shown at the top of the page (passed to gr.Markdown in the UI below).
+DESCRIPTION = "# Comparing image captioning models"
+# Markdown bullet list linking to the Spaces that the generate_caption_* functions call;
+# it is rendered inside the accordion near the bottom of the UI.
+ORIGINAL_SPACE_INFO = """\
+- [GIT-large fine-tuned on COCO](https://huggingface.co/spaces/library-samples/image-captioning-with-git)
+- [BLIP-large](https://huggingface.co/spaces/library-samples/image-captioning-with-blip)
+- [BLIP-2 OPT 6.7B](https://huggingface.co/spaces/merve/BLIP2-with-transformers)
+- [BLIP-2 T5-XXL](https://huggingface.co/spaces/hysts/BLIP2-with-transformers)
+- [InstructBLIP](https://huggingface.co/spaces/library-samples/InstructBLIP)
+- [Fuyu-8B](https://huggingface.co/spaces/adept/fuyu-8b-demo)
+"""
+# Fetch the three example images when the module is loaded. The local file names
+# (cats.jpg, stop_sign.png, astronaut.jpg) are the ones listed in gr.Examples below.
+torch.hub.download_url_to_file("http://images.cocodataset.org/val2017/000000039769.jpg", "cats.jpg")
+torch.hub.download_url_to_file(
+    "https://huggingface.co/datasets/nielsr/textcaps-sample/resolve/main/stop_sign.png", "stop_sign.png"
+)
+torch.hub.download_url_to_file(
+    "https://cdn.openai.com/dall-e-2/demos/text2im/astronaut/horse/photo/0.jpg", "astronaut.jpg"
+)
+# Caption an image by calling the "/caption" endpoint of the
+# library-samples/image-captioning-with-git Space through gradio_client.
+# Returns whatever the remote endpoint returns, or "" if anything raises.
+def generate_caption_git(image_path: str) -> str:
+    try:
+        # A new Client is created on every call.
+        client = Client("library-samples/image-captioning-with-git")
+        return client.predict(image_path, api_name="/caption")
+    except Exception:
+        # Best-effort: any failure (connection or prediction) becomes a Gradio
+        # warning plus an empty caption, so the other models can still be shown.
+        gr.Warning("The GIT-large Space is currently unavailable. Please try again later.")
+        return ""
+# Caption an image by calling the "/caption" endpoint of the
+# library-samples/image-captioning-with-blip Space through gradio_client.
+# Returns whatever the remote endpoint returns, or "" if anything raises.
+def generate_caption_blip(image_path: str) -> str:
+    try:
+        client = Client("library-samples/image-captioning-with-blip")
+        # The second positional argument is presumably the text prompt the caption
+        # should continue from - confirm against the remote Space's API.
+        return client.predict(image_path, "A picture of", api_name="/caption")
+    except Exception:
+        # Best-effort: any failure becomes a Gradio warning plus an empty caption.
+        gr.Warning("The BLIP-large Space is currently unavailable. Please try again later.")
+        return ""
+# Caption an image by calling the "/caption" endpoint of the
+# merve/BLIP2-with-transformers Space (BLIP-2 OPT 6.7B) through gradio_client.
+# Returns whatever the remote endpoint returns, or "" if anything raises.
+def generate_caption_blip2_opt(image_path: str) -> str:
+    try:
+        client = Client("merve/BLIP2-with-transformers")
+        # Arguments are positional, so their order must match the remote endpoint's
+        # inputs; the trailing comments name each numeric value.
+        return client.predict(
+            image_path,
+            "Beam search",
+            1,  # temperature
+            1,  # length penalty
+            1.5,  # repetition penalty
+            api_name="/caption",
+        )
+    except Exception:
+        # Best-effort: any failure becomes a Gradio warning plus an empty caption.
+        gr.Warning("The BLIP2 OPT6.7B Space is currently unavailable. Please try again later.")
+        return ""
+# Caption an image by calling the "/caption" endpoint of the
+# hysts/BLIP2-with-transformers Space (BLIP-2 T5-XXL) through gradio_client.
+# Returns whatever the remote endpoint returns, or "" if anything raises.
+def generate_caption_blip2_t5xxl(image_path: str) -> str:
+    try:
+        client = Client("hysts/BLIP2-with-transformers")
+        # Arguments are positional, so their order must match the remote endpoint's
+        # inputs. This Space takes four more generation settings than the OPT one.
+        return client.predict(
+            image_path,
+            "Beam search",
+            1,  # temperature
+            1,  # length penalty
+            1.5,  # repetition penalty
+            50,  # max length
+            1,  # min length
+            5,  # number of beams
+            0.9,  # top p
+            api_name="/caption",
+        )
+    except Exception:
+        # Best-effort: any failure becomes a Gradio warning plus an empty caption.
+        gr.Warning("The BLIP2 T5-XXL Space is currently unavailable. Please try again later.")
+        return ""
+# Caption an image by calling the "/run" endpoint of the
+# library-samples/InstructBLIP Space through gradio_client, using the fixed
+# instruction "Describe the image.".
+# Returns whatever the remote endpoint returns, or "" if anything raises.
+def generate_caption_instructblip(image_path: str) -> str:
+    try:
+        client = Client("library-samples/InstructBLIP")
+        # Arguments are positional, so their order must match the remote endpoint's
+        # inputs. Note the endpoint name is "/run", unlike the "/caption" endpoints
+        # used for the other Spaces.
+        return client.predict(
+            image_path,
+            "Describe the image.",
+            "Beam search",
+            5,  # beam size
+            256,  # max length
+            1,  # min length
+            0.9,  # top p
+            1.5,  # repetition penalty
+            1.0,  # length penalty
+            1.0,  # temperature
+            api_name="/run",
+        )
+    except Exception:
+        # Best-effort: any failure becomes a Gradio warning plus an empty caption.
+        gr.Warning("The InstructBLIP Space is currently unavailable. Please try again later.")
+        return ""
+# Caption an image by calling the adept/fuyu-8b-demo Space through gradio_client
+# with the fixed prompt "Generate a coco style caption.".
+# Returns whatever the remote endpoint returns, or "" if anything raises.
+def generate_caption_fuyu(image_path: str) -> str:
+    try:
+        client = Client("adept/fuyu-8b-demo")
+        # This is the only call that selects the remote function by index
+        # (fn_index=3) rather than by api_name.
+        # NOTE(review): an index presumably breaks if the remote Space reorders its
+        # event handlers - confirm whether a named endpoint is available.
+        return client.predict(image_path, "Generate a coco style caption.", fn_index=3)
+    except Exception:
+        # Best-effort: any failure becomes a Gradio warning plus an empty caption.
+        gr.Warning("The Fuyu-8B Space is currently unavailable. Please try again later.")
+        return ""
+# Run all six captioning helpers on the same image, one after another, and return
+# their results as a 6-tuple. The tuple order (GIT, BLIP, BLIP-2 OPT, BLIP-2 T5-XXL,
+# InstructBLIP, Fuyu) matches the order of the `outputs` textbox list in the UI.
+# Each helper returns "" on failure, so one unavailable Space does not stop the rest.
+def generate_captions(image_path: str) -> tuple[str, str, str, str, str, str]:
+    return (
+        generate_caption_git(image_path),
+        generate_caption_blip(image_path),
+        generate_caption_blip2_opt(image_path),
+        generate_caption_blip2_t5xxl(image_path),
+        generate_caption_instructblip(image_path),
+        generate_caption_fuyu(image_path),
+    )
+# Build the UI: a title, then one row with two columns - the image input and the
+# "Caption" button in the first, one textbox per captioning model in the second.
+with gr.Blocks(css="style.css") as demo:
+    gr.Markdown(DESCRIPTION)
+    with gr.Row():
+        with gr.Column():
+            # type="filepath": the callbacks below take an `image_path: str` argument.
+            input_image = gr.Image(type="filepath")
+            run_button = gr.Button("Caption")
+        with gr.Column():
+            out_git = gr.Textbox(label="GIT-large fine-tuned on COCO")
+            out_blip = gr.Textbox(label="BLIP-large")
+            out_blip2_opt = gr.Textbox(label="BLIP-2 OPT 6.7B")
+            out_blip2_t5xxl = gr.Textbox(label="BLIP-2 T5-XXL")
+            out_instructblip = gr.Textbox(label="InstructBLIP")
+            out_fuyu = gr.Textbox(label="Fuyu-8B")
+    # Order matters: it must match the tuple returned by generate_captions.
+    outputs = [
+        out_git,
+        out_blip,
+        out_blip2_opt,
+        out_blip2_t5xxl,
+        out_instructblip,
+        out_fuyu,
+    ]
+    # Example images are the files downloaded at module load. Example caching is
+    # enabled only when the CACHE_EXAMPLES environment variable is exactly "1".
+    gr.Examples(
+        examples=[
+            "cats.jpg",
+            "stop_sign.png",
+            "astronaut.jpg",
+        ],
+        inputs=input_image,
+        outputs=outputs,
+        fn=generate_captions,
+        cache_examples=os.getenv("CACHE_EXAMPLES") == "1",
+    )
+    with gr.Accordion(label="The original Spaces can be found here:", open=False):
+        gr.Markdown(ORIGINAL_SPACE_INFO)
+    # Clicking the button runs every captioner on the current image; the handler is
+    # registered under the API name "caption".
+    run_button.click(
+        fn=generate_captions,
+        inputs=input_image,
+        outputs=outputs,
+        api_name="caption",
+    )
+# Launch only when executed as a script; the request queue is capped at max_size=20.
+if __name__ == "__main__":
+    demo.queue(max_size=20).launch()

requirements.txt CHANGED Viewed

@@ -1,6 +1,2 @@
-git+https://github.com/huggingface/transformers.git@main
 torch
-open_clip_torch
-accelerate
-bitsandbytes
-scipy


+gradio
 torch

style.css ADDED Viewed

	@@ -0,0 +1,4 @@

+h1 {
+  text-align: center;
+}