Spaces:

AIBoy1993
/

segment_anything_webui

Build error

App Files Files Community

AIBoy1993 committed on Apr 14, 2023

Commit

ae97c0d

•

1 Parent(s): 6cc1ca0

Upload 2 files

Browse files

Files changed (2) hide show

app.py +104 -21
inference.py +69 -37

app.py CHANGED Viewed

@@ -1,8 +1,35 @@
 import os
 import gradio as gr
 from inference import run_inference
 with gr.Blocks() as demo:
@@ -19,7 +46,7 @@ with gr.Blocks() as demo:
             # select device
             device = gr.Dropdown(["cpu", "cuda"], value='cpu', label="Select Device")
-    # parameters
     with gr.Accordion(label='Parameters', open=False):
         with gr.Row():
             points_per_side = gr.Number(value=32, label="points_per_side", precision=0,
@@ -45,11 +72,21 @@ with gr.Blocks() as demo:
                                         info='''The box IoU cutoff used by non-maximal suppression to filter duplicate
                                         masks between different crops.''')
-    # Show image
     with gr.Tab(label='Image'):
         with gr.Row().style(equal_height=True):
             with gr.Column():
                 input_image = gr.Image(type="numpy")
                 text = gr.Textbox(label='Text prompt(optional)', info=
                     'If you type words, the OWL-ViT model will be used to detect the objects in the image, '
                     'and the boxes will be feed into SAM model to predict mask. Please use English.',
@@ -57,28 +94,26 @@ with gr.Blocks() as demo:
                 owl_vit_threshold = gr.Slider(value=0.1, minimum=0, maximum=1.0, step=0.01, label="OWL ViT Object Detection threshold",
                                             info='''A small threshold will generate more objects, but may causing OOM.
                                             A big threshold may not detect objects, resulting in an error ''')
                 button = gr.Button("Auto!")
             with gr.Tab(label='Image+Mask'):
                 output_image = gr.Image(type='numpy')
             with gr.Tab(label='Mask'):
                 output_mask = gr.Image(type='numpy')
-        gr.Examples(
-            examples=[os.path.join(os.path.dirname(__file__), "./images/53960-scaled.jpg"),
-                      os.path.join(os.path.dirname(__file__), "./images/2388455-scaled.jpg"),
-                      os.path.join(os.path.dirname(__file__), "./images/1.jpg"),
-                      os.path.join(os.path.dirname(__file__), "./images/2.jpg"),
-                      os.path.join(os.path.dirname(__file__), "./images/3.jpg"),
-                      os.path.join(os.path.dirname(__file__), "./images/4.jpg"),
-                      os.path.join(os.path.dirname(__file__), "./images/5.jpg"),
-                      os.path.join(os.path.dirname(__file__), "./images/6.jpg"),
-                      os.path.join(os.path.dirname(__file__), "./images/7.jpg"),
-                      os.path.join(os.path.dirname(__file__), "./images/8.jpg"),
-                      ],
-            inputs=input_image,
-            outputs=output_image,
         )
-    # Show video
     with gr.Tab(label='Video'):
         with gr.Row().style(equal_height=True):
             with gr.Column():
@@ -90,17 +125,65 @@ with gr.Blocks() as demo:
         **Note:** processing video will take a long time, please upload a short video.
         ''')
         gr.Examples(
-            examples=[os.path.join(os.path.dirname(__file__), "./images/video1.mp4"),
-                      os.path.join(os.path.dirname(__file__), "./images/video2.mp4")
-                      ],
             inputs=input_video,
             outputs=output_video
         )
     # button image
     button.click(run_inference, inputs=[device, model_type, points_per_side, pred_iou_thresh, stability_score_thresh,
                                     min_mask_region_area, stability_score_offset, box_nms_thresh, crop_n_layers,
-                                    crop_nms_thresh, owl_vit_threshold, input_image, text],
                  outputs=[output_image, output_mask])
     # button video
     button_video.click(run_inference, inputs=[device, model_type, points_per_side, pred_iou_thresh, stability_score_thresh,

 import os
+import cv2
+import numpy as np
 import gradio as gr
 from inference import run_inference
+# points color and marker
+colors = [(255, 0, 0), (0, 255, 0)]
+markers = [1, 5]
+# image examples
+# in each list, the first element is image path,
+# the second is id (used for original_image State),
+# the third is an empty list (used for selected_points State)
+image_examples = [
+    [os.path.join(os.path.dirname(__file__), "./images/53960-scaled.jpg"), 0, []],
+    [os.path.join(os.path.dirname(__file__), "./images/2388455-scaled.jpg"), 1, []],
+    [os.path.join(os.path.dirname(__file__), "./images/1.jpg"),2,[]],
+    [os.path.join(os.path.dirname(__file__), "./images/2.jpg"),3,[]],
+    [os.path.join(os.path.dirname(__file__), "./images/3.jpg"),4,[]],
+    [os.path.join(os.path.dirname(__file__), "./images/4.jpg"),5,[]],
+    [os.path.join(os.path.dirname(__file__), "./images/5.jpg"),6,[]],
+    [os.path.join(os.path.dirname(__file__), "./images/6.jpg"),7,[]],
+    [os.path.join(os.path.dirname(__file__), "./images/7.jpg"),8,[]],
+    [os.path.join(os.path.dirname(__file__), "./images/8.jpg"),9,[]]
+]
+# video examples
+video_examples = [
+    os.path.join(os.path.dirname(__file__), "./images/video1.mp4"),
+    os.path.join(os.path.dirname(__file__), "./images/video2.mp4")
+]
 with gr.Blocks() as demo:
             # select device
             device = gr.Dropdown(["cpu", "cuda"], value='cpu', label="Select Device")
+    # SAM parameters
     with gr.Accordion(label='Parameters', open=False):
         with gr.Row():
             points_per_side = gr.Number(value=32, label="points_per_side", precision=0,
                                         info='''The box IoU cutoff used by non-maximal suppression to filter duplicate
                                         masks between different crops.''')
+    # Segment image
     with gr.Tab(label='Image'):
         with gr.Row().style(equal_height=True):
             with gr.Column():
+                # input image
+                original_image = gr.State(value=None)   # store original image without points, default None
                 input_image = gr.Image(type="numpy")
+                # point prompt
+                with gr.Column():
+                    selected_points = gr.State([])      # store points
+                    with gr.Row():
+                        gr.Markdown('You can click on the image to select points prompt. Default: foreground_point.')
+                        undo_button = gr.Button('Undo point')
+                    radio = gr.Radio(['foreground_point', 'background_point'], label='point labels')
+                # text prompt to generate box prompt
                 text = gr.Textbox(label='Text prompt(optional)', info=
                     'If you type words, the OWL-ViT model will be used to detect the objects in the image, '
                     'and the boxes will be feed into SAM model to predict mask. Please use English.',
                 owl_vit_threshold = gr.Slider(value=0.1, minimum=0, maximum=1.0, step=0.01, label="OWL ViT Object Detection threshold",
                                             info='''A small threshold will generate more objects, but may causing OOM.
                                             A big threshold may not detect objects, resulting in an error ''')
+                # run button
                 button = gr.Button("Auto!")
+            # show the image with mask
             with gr.Tab(label='Image+Mask'):
                 output_image = gr.Image(type='numpy')
+            # show only mask
             with gr.Tab(label='Mask'):
                 output_mask = gr.Image(type='numpy')
+        def process_example(img, ori_img, sel_p):
+            return ori_img, []
+        example = gr.Examples(
+            examples=image_examples,
+            inputs=[input_image, original_image, selected_points],
+            outputs=[original_image, selected_points],
+	        fn=process_example,
+	        run_on_click=True
         )
+    # Segment video
     with gr.Tab(label='Video'):
         with gr.Row().style(equal_height=True):
             with gr.Column():
         **Note:** processing video will take a long time, please upload a short video.
         ''')
         gr.Examples(
+            examples=video_examples,
             inputs=input_video,
             outputs=output_video
         )
+    # once the user uploads an image, the original image is stored in `original_image`
+    def store_img(img):
+        return img, []  # when new image is uploaded, `selected_points` should be empty
+    input_image.upload(
+        store_img,
+        [input_image],
+        [original_image, selected_points]
+    )
+    # the user clicks the image to select points, and the points are drawn on the image
+    def get_point(img, sel_pix, point_type, evt: gr.SelectData):
+        if point_type == 'foreground_point':
+            sel_pix.append((evt.index, 1))   # append the foreground_point
+        elif point_type == 'background_point':
+            sel_pix.append((evt.index, 0))    # append the background_point
+        else:
+            sel_pix.append((evt.index, 1))    # default foreground_point
+        # draw points
+        for point, label in sel_pix:
+            cv2.drawMarker(img, point, colors[label], markerType=markers[label], markerSize=20, thickness=5)
+        if img[..., 0][0, 0] == img[..., 2][0, 0]:  # BGR to RGB
+            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+        return img if isinstance(img, np.ndarray) else np.array(img)
+    input_image.select(
+        get_point,
+        [input_image, selected_points, radio],
+        [input_image],
+    )
+    # undo the selected point
+    def undo_points(orig_img, sel_pix):
+        if isinstance(orig_img, int):   # if orig_img is int, the image if select from examples
+            temp = cv2.imread(image_examples[orig_img][0])
+            temp = cv2.cvtColor(temp, cv2.COLOR_BGR2RGB)
+        else:
+            temp = orig_img.copy()
+        # draw points
+        if len(sel_pix) != 0:
+            sel_pix.pop()
+            for point, label in sel_pix:
+                cv2.drawMarker(temp, point, colors[label], markerType=markers[label], markerSize=20, thickness=5)
+        if temp[..., 0][0, 0] == temp[..., 2][0, 0]:  # BGR to RGB
+            temp = cv2.cvtColor(temp, cv2.COLOR_BGR2RGB)
+        return temp if isinstance(temp, np.ndarray) else np.array(temp)
+    undo_button.click(
+        undo_points,
+        [original_image, selected_points],
+        [input_image]
+    )
     # button image
     button.click(run_inference, inputs=[device, model_type, points_per_side, pred_iou_thresh, stability_score_thresh,
                                     min_mask_region_area, stability_score_offset, box_nms_thresh, crop_n_layers,
+                                    crop_nms_thresh, owl_vit_threshold, original_image, text, selected_points],
                  outputs=[output_image, output_mask])
     # button video
     button_video.click(run_inference, inputs=[device, model_type, points_per_side, pred_iou_thresh, stability_score_thresh,

inference.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import cv2
 import torch
 import numpy as np
@@ -13,6 +14,19 @@ models = {
 	'vit_h': './checkpoints/sam_vit_h_4b8939.pth'
 }
 def plot_boxes(img, boxes):
 	img_pil = Image.fromarray(np.uint8(img * 255)).convert('RGB')
@@ -82,44 +96,55 @@ def generator_inference(device, model_type, points_per_side, pred_iou_thresh, st
 		return 'output.mp4'
-def predictor_inference(device, model_type, input_x, input_text, owl_vit_threshold=0.1):
 	# sam model
 	sam = sam_model_registry[model_type](checkpoint=models[model_type]).to(device)
 	predictor = SamPredictor(sam)
 	predictor.set_image(input_x)  # Process the image to produce an image embedding
-	# split input text
-	input_text = [input_text.split(',')]
-	# OWL-ViT model
-	# processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")
-	# owlvit_model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32").to(device)
-	processor = OwlViTProcessor.from_pretrained('./checkpoints/models--google--owlvit-base-patch32')
-	owlvit_model = OwlViTForObjectDetection.from_pretrained("./checkpoints/models--google--owlvit-base-patch32").to(device)
-	# get outputs
-	input_text = processor(text=input_text, images=input_x, return_tensors="pt").to(device)
-	outputs = owlvit_model(**input_text)
-	target_size = torch.Tensor([input_x.shape[:2]]).to(device)
-	results = processor.post_process_object_detection(outputs=outputs, target_sizes=target_size,
-	                                                  threshold=owl_vit_threshold)
-	# get the box with best score
-	scores = torch.sigmoid(outputs.logits)
-	# best_scores, best_idxs = torch.topk(scores, k=1, dim=1)
-	# best_idxs = best_idxs.squeeze(1).tolist()
-	i = 0  # Retrieve predictions for the first image for the corresponding text queries
-	boxes_tensor = results[i]["boxes"]  # [best_idxs]
-	print(boxes_tensor.size())
-	boxes = boxes_tensor.cpu().detach().numpy()
-	transformed_boxes = predictor.transform.apply_boxes_torch(torch.Tensor(boxes).to(device),
-	                                                          input_x.shape[:2])  # apply transform to original boxes
 	# predict segmentation according to the boxes
 	masks, scores, logits = predictor.predict_torch(
-		point_coords=None,
-		point_labels=None,
 		boxes=transformed_boxes,  # only one box
 		multimask_output=False,
 	)
@@ -130,11 +155,13 @@ def predictor_inference(device, model_type, input_x, input_text, owl_vit_thresho
 		for i in range(3):
 			mask_all[ann[0] == True, i] = color_mask[i]
 	img = input_x / 255 * 0.3 + mask_all * 0.7
-	img = plot_boxes(img, boxes_tensor)  # image + mask + boxes
 	# free the memory
-	owlvit_model.cpu()
-	del owlvit_model
 	del input_text
 	gc.collect()
 	torch.cuda.empty_cache()
@@ -144,11 +171,16 @@ def predictor_inference(device, model_type, input_x, input_text, owl_vit_thresho
 def run_inference(device, model_type, points_per_side, pred_iou_thresh, stability_score_thresh, min_mask_region_area,
                   stability_score_offset, box_nms_thresh, crop_n_layers, crop_nms_thresh, owl_vit_threshold, input_x,
-                  input_text):
-	print('prompt text: ', input_text)
-	if input_text != '' and not isinstance(input_x, str):  # user input text
 		print('use predictor_inference')
-		return predictor_inference(device, model_type, input_x, input_text, owl_vit_threshold)
 	else:
 		print('use generator_inference')
 		return generator_inference(device, model_type, points_per_side, pred_iou_thresh, stability_score_thresh,

+import os
 import cv2
 import torch
 import numpy as np
 	'vit_h': './checkpoints/sam_vit_h_4b8939.pth'
 }
+image_examples = [
+    [os.path.join(os.path.dirname(__file__), "./images/53960-scaled.jpg"), 0, []],
+    [os.path.join(os.path.dirname(__file__), "./images/2388455-scaled.jpg"), 1, []],
+    [os.path.join(os.path.dirname(__file__), "./images/1.jpg"),2,[]],
+    [os.path.join(os.path.dirname(__file__), "./images/2.jpg"),3,[]],
+    [os.path.join(os.path.dirname(__file__), "./images/3.jpg"),4,[]],
+    [os.path.join(os.path.dirname(__file__), "./images/4.jpg"),5,[]],
+    [os.path.join(os.path.dirname(__file__), "./images/5.jpg"),6,[]],
+    [os.path.join(os.path.dirname(__file__), "./images/6.jpg"),7,[]],
+    [os.path.join(os.path.dirname(__file__), "./images/7.jpg"),8,[]],
+    [os.path.join(os.path.dirname(__file__), "./images/8.jpg"),9,[]]
+]
 def plot_boxes(img, boxes):
 	img_pil = Image.fromarray(np.uint8(img * 255)).convert('RGB')
 		return 'output.mp4'
+def predictor_inference(device, model_type, input_x, input_text, selected_points, owl_vit_threshold=0.1):
 	# sam model
 	sam = sam_model_registry[model_type](checkpoint=models[model_type]).to(device)
 	predictor = SamPredictor(sam)
 	predictor.set_image(input_x)  # Process the image to produce an image embedding
+	if input_text != '':
+		# split input text
+		input_text = [input_text.split(',')]
+		print(input_text)
+		# OWL-ViT model
+		processor = OwlViTProcessor.from_pretrained('./checkpoints/models--google--owlvit-base-patch32')
+		owlvit_model = OwlViTForObjectDetection.from_pretrained("./checkpoints/models--google--owlvit-base-patch32").to(device)
+		# get outputs
+		input_text = processor(text=input_text, images=input_x, return_tensors="pt").to(device)
+		outputs = owlvit_model(**input_text)
+		target_size = torch.Tensor([input_x.shape[:2]]).to(device)
+		results = processor.post_process_object_detection(outputs=outputs, target_sizes=target_size,
+		                                                  threshold=owl_vit_threshold)
+		# get the box with best score
+		scores = torch.sigmoid(outputs.logits)
+		# best_scores, best_idxs = torch.topk(scores, k=1, dim=1)
+		# best_idxs = best_idxs.squeeze(1).tolist()
+		i = 0  # Retrieve predictions for the first image for the corresponding text queries
+		boxes_tensor = results[i]["boxes"]  # [best_idxs]
+		boxes = boxes_tensor.cpu().detach().numpy()
+		# boxes = boxes[np.newaxis, :, :]
+		transformed_boxes = predictor.transform.apply_boxes_torch(torch.Tensor(boxes).to(device),
+		                                                          input_x.shape[:2])  # apply transform to original boxes
+		# transformed_boxes = transformed_boxes.unsqueeze(0)
+		print(transformed_boxes.size(), boxes.shape)
+	else:
+		transformed_boxes = None
+	# points
+	if len(selected_points) != 0:
+		points = torch.Tensor([p for p, _ in selected_points]).to(device).unsqueeze(1)
+		labels = torch.Tensor([int(l) for _, l in selected_points]).to(device).unsqueeze(1)
+		transformed_points = predictor.transform.apply_coords_torch(points, input_x.shape[:2])
+		print(points.size(), transformed_points.size(), labels.size(), input_x.shape, points)
+	else:
+		transformed_points, labels = None, None
 	# predict segmentation according to the boxes
 	masks, scores, logits = predictor.predict_torch(
+		point_coords=transformed_points,
+		point_labels=labels,
 		boxes=transformed_boxes,  # only one box
 		multimask_output=False,
 	)
 		for i in range(3):
 			mask_all[ann[0] == True, i] = color_mask[i]
 	img = input_x / 255 * 0.3 + mask_all * 0.7
+	if input_text != '':
+		img = plot_boxes(img, boxes_tensor)  # image + mask + boxes
 	# free the memory
+	if input_text != '':
+		owlvit_model.cpu()
+		del owlvit_model
 	del input_text
 	gc.collect()
 	torch.cuda.empty_cache()
 def run_inference(device, model_type, points_per_side, pred_iou_thresh, stability_score_thresh, min_mask_region_area,
                   stability_score_offset, box_nms_thresh, crop_n_layers, crop_nms_thresh, owl_vit_threshold, input_x,
+                  input_text, selected_points):
+	# if input_x is int, the image is selected from examples
+	if isinstance(input_x, int):
+		input_x = cv2.imread(image_examples[input_x][0])
+		input_x = cv2.cvtColor(input_x, cv2.COLOR_BGR2RGB)
+	if (input_text != '' and not isinstance(input_x, str)) or len(selected_points) != 0:  # user input text or points
 		print('use predictor_inference')
+		print('prompt text: ', input_text)
+		print('prompt points length: ', len(selected_points))
+		return predictor_inference(device, model_type, input_x, input_text, selected_points, owl_vit_threshold)
 	else:
 		print('use generator_inference')
 		return generator_inference(device, model_type, points_per_side, pred_iou_thresh, stability_score_thresh,