nielsr and hysts committed
Commit: c3bc979
Parent: 6a931bc

Use gradio_client (#49)


- Use gradio_client (3cb35e062f030ebbc9622f243e198c7a19408c03)


Co-authored-by: hysts <[email protected]>

Files changed (4)
  1. README.md +1 -1
  2. app.py +166 -81
  3. requirements.txt +1 -5
  4. style.css +4 -0
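
In short, app.py no longer loads GIT, BLIP, BLIP-2 and InstructBLIP locally with transformers; it delegates each caption request to an upstream Space through gradio_client. A minimal illustrative sketch of that pattern (the Space name and `api_name` are the ones used in the new app.py below; the helper function is not part of the commit):

```python
# Illustrative sketch of the gradio_client pattern adopted in this commit.
# The Space name and api_name match those used in the new app.py.
from gradio_client import Client


def caption_with_git(image_path: str) -> str:
    # The captioning model runs in its own upstream Space;
    # this helper only forwards the image path and returns the caption text.
    client = Client("library-samples/image-captioning-with-git")
    return client.predict(image_path, api_name="/caption")


if __name__ == "__main__":
    print(caption_with_git("cats.jpg"))
```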
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 🔥
 colorFrom: yellow
 colorTo: pink
 sdk: gradio
-sdk_version: 3.15.0
+sdk_version: 3.50.2
 app_file: app.py
 pinned: false
 ---
app.py CHANGED
@@ -1,83 +1,168 @@
-import gradio as gr
-from transformers import AutoProcessor, AutoTokenizer, AutoImageProcessor, AutoModelForCausalLM, BlipForConditionalGeneration, Blip2ForConditionalGeneration, VisionEncoderDecoderModel, InstructBlipForConditionalGeneration
-import torch
-import open_clip
-
-from huggingface_hub import hf_hub_download
-
-device = "cuda" if torch.cuda.is_available() else "cpu"
-
-torch.hub.download_url_to_file('http://images.cocodataset.org/val2017/000000039769.jpg', 'cats.jpg')
-torch.hub.download_url_to_file('https://huggingface.co/datasets/nielsr/textcaps-sample/resolve/main/stop_sign.png', 'stop_sign.png')
-torch.hub.download_url_to_file('https://cdn.openai.com/dall-e-2/demos/text2im/astronaut/horse/photo/0.jpg', 'astronaut.jpg')
-
-git_processor_large_coco = AutoProcessor.from_pretrained("microsoft/git-large-coco")
-git_model_large_coco = AutoModelForCausalLM.from_pretrained("microsoft/git-large-coco").to(device)
-
-blip_processor_large = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
-blip_model_large = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to(device)
-
-blip2_processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-6.7b-coco")
-blip2_model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-6.7b-coco", device_map="auto", load_in_4bit=True, torch_dtype=torch.float16)
-
-instructblip_processor = AutoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")
-instructblip_model = InstructBlipForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b", device_map="auto", load_in_4bit=True, torch_dtype=torch.float16)
-
-def generate_caption(processor, model, image, tokenizer=None, use_float_16=False):
-    inputs = processor(images=image, return_tensors="pt").to(device)
-
-    if use_float_16:
-        inputs = inputs.to(torch.float16)
-
-    generated_ids = model.generate(pixel_values=inputs.pixel_values, num_beams=3, max_length=20, min_length=5)
-
-    if tokenizer is not None:
-        generated_caption = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
-    else:
-        generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-
-    return generated_caption


-def generate_caption_blip2(processor, model, image, replace_token=False):
-    prompt = "A photo of"
-    inputs = processor(images=image, text=prompt, return_tensors="pt").to(device=model.device, dtype=torch.float16)
-
-    generated_ids = model.generate(**inputs,
-                                   num_beams=5, max_length=50, min_length=1, top_p=0.9,
-                                   repetition_penalty=1.5, length_penalty=1.0, temperature=1)
-    if replace_token:
-        # TODO remove once https://github.com/huggingface/transformers/pull/24492 is merged
-        generated_ids[generated_ids == 0] = 2
-
-    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-
-
-def generate_captions(image):
-    caption_git_large_coco = generate_caption(git_processor_large_coco, git_model_large_coco, image)
-
-    caption_blip_large = generate_caption(blip_processor_large, blip_model_large, image)
-
-    caption_blip2 = generate_caption_blip2(blip2_processor, blip2_model, image).strip()
-
-    caption_instructblip = generate_caption_blip2(instructblip_processor, instructblip_model, image, replace_token=True)
-
-    return caption_git_large_coco, caption_blip_large, caption_blip2, caption_instructblip
-
-
-examples = [["cats.jpg"], ["stop_sign.png"], ["astronaut.jpg"]]
-outputs = [gr.outputs.Textbox(label="Caption generated by GIT-large fine-tuned on COCO"), gr.outputs.Textbox(label="Caption generated by BLIP-large"), gr.outputs.Textbox(label="Caption generated by BLIP-2 OPT 6.7b"), gr.outputs.Textbox(label="Caption generated by InstructBLIP"), ]
-
-title = "Interactive demo: comparing image captioning models"
-description = "Gradio Demo to compare GIT, BLIP, BLIP-2 and InstructBLIP, 4 state-of-the-art vision+language models. To use it, simply upload your image and click 'submit', or click one of the examples to load them. Read more at the links below."
-article = "<p style='text-align: center'><a href='https://huggingface.co/docs/transformers/main/model_doc/blip' target='_blank'>BLIP docs</a> | <a href='https://huggingface.co/docs/transformers/main/model_doc/git' target='_blank'>GIT docs</a></p>"
-
-interface = gr.Interface(fn=generate_captions,
-                         inputs=gr.inputs.Image(type="pil"),
-                         outputs=outputs,
-                         examples=examples,
-                         title=title,
-                         description=description,
-                         article=article,
-                         enable_queue=True)
-interface.launch(debug=True)
+from __future__ import annotations

+import os

+import gradio as gr
+import torch
+from gradio_client import Client
+
+DESCRIPTION = "# Comparing image captioning models"
+ORIGINAL_SPACE_INFO = """\
+- [GIT-large fine-tuned on COCO](https://huggingface.co/spaces/library-samples/image-captioning-with-git)
+- [BLIP-large](https://huggingface.co/spaces/library-samples/image-captioning-with-blip)
+- [BLIP-2 OPT 6.7B](https://huggingface.co/spaces/merve/BLIP2-with-transformers)
+- [BLIP-2 T5-XXL](https://huggingface.co/spaces/hysts/BLIP2-with-transformers)
+- [InstructBLIP](https://huggingface.co/spaces/library-samples/InstructBLIP)
+- [Fuyu-8B](https://huggingface.co/spaces/adept/fuyu-8b-demo)
+"""
+
+torch.hub.download_url_to_file("http://images.cocodataset.org/val2017/000000039769.jpg", "cats.jpg")
+torch.hub.download_url_to_file(
+    "https://huggingface.co/datasets/nielsr/textcaps-sample/resolve/main/stop_sign.png", "stop_sign.png"
+)
+torch.hub.download_url_to_file(
+    "https://cdn.openai.com/dall-e-2/demos/text2im/astronaut/horse/photo/0.jpg", "astronaut.jpg"
+)
+
+
+def generate_caption_git(image_path: str) -> str:
+    try:
+        client = Client("library-samples/image-captioning-with-git")
+        return client.predict(image_path, api_name="/caption")
+    except Exception:
+        gr.Warning("The GIT-large Space is currently unavailable. Please try again later.")
+        return ""
+
+
+def generate_caption_blip(image_path: str) -> str:
+    try:
+        client = Client("library-samples/image-captioning-with-blip")
+        return client.predict(image_path, "A picture of", api_name="/caption")
+    except Exception:
+        gr.Warning("The BLIP-large Space is currently unavailable. Please try again later.")
+        return ""
+
+
+def generate_caption_blip2_opt(image_path: str) -> str:
+    try:
+        client = Client("merve/BLIP2-with-transformers")
+        return client.predict(
+            image_path,
+            "Beam search",
+            1,  # temperature
+            1,  # length penalty
+            1.5,  # repetition penalty
+            api_name="/caption",
+        )
+    except Exception:
+        gr.Warning("The BLIP2 OPT6.7B Space is currently unavailable. Please try again later.")
+        return ""
+
+
+def generate_caption_blip2_t5xxl(image_path: str) -> str:
+    try:
+        client = Client("hysts/BLIP2-with-transformers")
+        return client.predict(
+            image_path,
+            "Beam search",
+            1,  # temperature
+            1,  # length penalty
+            1.5,  # repetition penalty
+            50,  # max length
+            1,  # min length
+            5,  # number of beams
+            0.9,  # top p
+            api_name="/caption",
+        )
+    except Exception:
+        gr.Warning("The BLIP2 T5-XXL Space is currently unavailable. Please try again later.")
+        return ""
+
+
+def generate_caption_instructblip(image_path: str) -> str:
+    try:
+        client = Client("library-samples/InstructBLIP")
+        return client.predict(
+            image_path,
+            "Describe the image.",
+            "Beam search",
+            5,  # beam size
+            256,  # max length
+            1,  # min length
+            0.9,  # top p
+            1.5,  # repetition penalty
+            1.0,  # length penalty
+            1.0,  # temperature
+            api_name="/run",
+        )
+    except Exception:
+        gr.Warning("The InstructBLIP Space is currently unavailable. Please try again later.")
+        return ""
+
+
+def generate_caption_fuyu(image_path: str) -> str:
+    try:
+        client = Client("adept/fuyu-8b-demo")
+        return client.predict(image_path, "Generate a coco style caption.", fn_index=3)
+    except Exception:
+        gr.Warning("The Fuyu-8B Space is currently unavailable. Please try again later.")
+        return ""
+
+
+def generate_captions(image_path: str) -> tuple[str, str, str, str, str, str]:
+    return (
+        generate_caption_git(image_path),
+        generate_caption_blip(image_path),
+        generate_caption_blip2_opt(image_path),
+        generate_caption_blip2_t5xxl(image_path),
+        generate_caption_instructblip(image_path),
+        generate_caption_fuyu(image_path),
+    )
+
+
+with gr.Blocks(css="style.css") as demo:
+    gr.Markdown(DESCRIPTION)
+    with gr.Row():
+        with gr.Column():
+            input_image = gr.Image(type="filepath")
+            run_button = gr.Button("Caption")
+        with gr.Column():
+            out_git = gr.Textbox(label="GIT-large fine-tuned on COCO")
+            out_blip = gr.Textbox(label="BLIP-large")
+            out_blip2_opt = gr.Textbox(label="BLIP-2 OPT 6.7B")
+            out_blip2_t5xxl = gr.Textbox(label="BLIP-2 T5-XXL")
+            out_instructblip = gr.Textbox(label="InstructBLIP")
+            out_fuyu = gr.Textbox(label="Fuyu-8B")
+
+    outputs = [
+        out_git,
+        out_blip,
+        out_blip2_opt,
+        out_blip2_t5xxl,
+        out_instructblip,
+        out_fuyu,
+    ]
+    gr.Examples(
+        examples=[
+            "cats.jpg",
+            "stop_sign.png",
+            "astronaut.jpg",
+        ],
+        inputs=input_image,
+        outputs=outputs,
+        fn=generate_captions,
+        cache_examples=os.getenv("CACHE_EXAMPLES") == "1",
+    )
+
+    with gr.Accordion(label="The original Spaces can be found here:", open=False):
+        gr.Markdown(ORIGINAL_SPACE_INFO)
+
+    run_button.click(
+        fn=generate_captions,
+        inputs=input_image,
+        outputs=outputs,
+        api_name="caption",
+    )
+
+if __name__ == "__main__":
+    demo.queue(max_size=20).launch()
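
Because `run_button.click(...)` registers `api_name="caption"`, the refactored demo can itself be queried programmatically with gradio_client. A hedged usage sketch; `OWNER/SPACE_NAME` is a placeholder for wherever this Space is hosted and is not named in the diff:

```python
# Hypothetical client for the endpoint registered above via api_name="caption".
# "OWNER/SPACE_NAME" is a placeholder Space id, not part of this commit.
from gradio_client import Client

client = Client("OWNER/SPACE_NAME")
captions = client.predict("cats.jpg", api_name="/caption")  # path to any local image
# The endpoint has six Textbox outputs, so `captions` should be a tuple of six strings.
print(captions)
```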
requirements.txt CHANGED
@@ -1,6 +1,2 @@
-git+https://github.com/huggingface/transformers.git@main
+gradio
 torch
-open_clip_torch
-accelerate
-bitsandbytes
-scipy
style.css ADDED
@@ -0,0 +1,4 @@
+h1 {
+  text-align: center;
+}
+