AIBoy1993 committed
Commit ba0d063
1 parent: 7301e8a

Upload 4 files

Files changed (4)

1. README.md +18 -16
2. app.py +15 -74
3. inference.py +156 -0
4. requirements.txt +1 -2
README.md CHANGED
@@ -1,25 +1,18 @@
----
-title: Segment Anything
-emoji: 🚀
-colorFrom: gray
-colorTo: pink
-sdk: gradio
-sdk_version: 3.24.1
-app_file: app.py
-pinned: false
----
-
 # Segment Anything WebUI
 
-This project is based on **[Segment Anything Model](https://segment-anything.com/) ** by Meta. The UI is based on [Gradio](https://gradio.app/).
+This project is based on **[Segment Anything Model](https://segment-anything.com/)** by Meta. The UI is based on [Gradio](https://gradio.app/).
 
 - Try the demo on HF: [AIBoy1993/segment_anything_webui](https://huggingface.co/spaces/AIBoy1993/segment_anything_webui)
+- [GitHub](https://github.com/5663015/segment_anything_webui)
 
 ![](./images/20230408023615.png)
 
 ## Change Logs
 
-- [2023-4-11] Support video segmentation.
+- [2023-4-11]
+  - Support video segmentation. A short video can be automatically segmented by SAM.
+  - Support text prompt segmentation using the [OWL-ViT](https://huggingface.co/docs/transformers/v4.27.2/en/model_doc/owlvit#overview) (Vision Transformer for Open-World Localization) model.
+
 
 ## **Usage**
@@ -45,16 +38,25 @@ git clone https://github.com/5663015/segment_anything_webui.git
 
 - `vit_b`: [ViT-B SAM model](https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth)
 
+- Under `checkpoints`, make a new folder named `models--google--owlvit-base-patch32`, and put the downloaded [OWL-ViT weights](https://huggingface.co/google/owlvit-base-patch32) files into it.
 - Run:
 
 ```
 python app.py
 ```
 
-**Note:** Default model is `vit_b`,the demo can run on CPU. Default device is `cuda`。
+**Note:** Default model is `vit_b`, the demo can run on CPU. Default device is `cpu`.
 
 ## TODO
 
-- Add segmentation prompt (point and box)
-- Add text prompt
+- [x] Video segmentation
+
+- [x] Add text prompt
+
+- [ ] Add segmentation prompt (point and box)
+
+## Reference
+
+- Thanks to the wonderful work of [Segment Anything](https://segment-anything.com/) and [OWL-ViT](https://arxiv.org/abs/2205.06230).
+- Some video processing code references [kadirnar/segment-anything-video](https://github.com/kadirnar/segment-anything-video), and some OWL-ViT code references [ngthanhtin/owlvit_segment_anything](https://github.com/ngthanhtin/owlvit_segment_anything).
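The checkpoint layout described in the README hunk above can also be prepared with a short script. This is a minimal sketch, not part of the commit: it assumes `huggingface_hub` is installed (it is not listed in `requirements.txt`) and that its `snapshot_download(..., local_dir=...)` option is available to place the OWL-ViT files directly inside the folder name that `inference.py` expects.

```
import os
import urllib.request

# Assumed extra dependency for fetching the OWL-ViT files; not in requirements.txt.
from huggingface_hub import snapshot_download

CKPT_DIR = "./checkpoints"
os.makedirs(CKPT_DIR, exist_ok=True)

# SAM ViT-B weights (the default model used by the demo).
sam_url = "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth"
sam_path = os.path.join(CKPT_DIR, "sam_vit_b_01ec64.pth")
if not os.path.exists(sam_path):
    urllib.request.urlretrieve(sam_url, sam_path)

# OWL-ViT weights, downloaded flat into the folder inference.py points at.
snapshot_download(
    "google/owlvit-base-patch32",
    local_dir=os.path.join(CKPT_DIR, "models--google--owlvit-base-patch32"),
)
```

Alternatively, re-enabling the commented-out `from_pretrained("google/owlvit-base-patch32")` lines in `inference.py` would download the OWL-ViT weights into the default Hugging Face cache on first run instead of using the local folder.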
 
 
app.py CHANGED
@@ -1,73 +1,8 @@
 import os
-import cv2
-import sys
-import numpy as np
 import gradio as gr
-from PIL import Image
-import matplotlib.pyplot as plt
-from segment_anything import sam_model_registry, SamAutomaticMaskGenerator
+from inference import run_inference
 
 
-models = {
-    'vit_b': './checkpoints/sam_vit_b_01ec64.pth',
-    'vit_l': './checkpoints/sam_vit_l_0b3195.pth',
-    'vit_h': './checkpoints/sam_vit_h_4b8939.pth'
-}
-
-
-def segment_one(img, mask_generator, seed=None):
-    if seed is not None:
-        np.random.seed(seed)
-    masks = mask_generator.generate(img)
-    sorted_anns = sorted(masks, key=(lambda x: x['area']), reverse=True)
-    mask_all = np.ones((img.shape[0], img.shape[1], 3))
-    for ann in sorted_anns:
-        m = ann['segmentation']
-        color_mask = np.random.random((1, 3)).tolist()[0]
-        for i in range(3):
-            mask_all[m == True, i] = color_mask[i]
-    result = img / 255 * 0.3 + mask_all * 0.7
-    return result, mask_all
-
-
-def inference(device, model_type, points_per_side, pred_iou_thresh, stability_score_thresh, min_mask_region_area,
-              stability_score_offset, box_nms_thresh, crop_n_layers, crop_nms_thresh, input_x, progress=gr.Progress()):
-    # sam model
-    sam = sam_model_registry[model_type](checkpoint=models[model_type]).to(device)
-    mask_generator = SamAutomaticMaskGenerator(
-        sam,
-        points_per_side=points_per_side,
-        pred_iou_thresh=pred_iou_thresh,
-        stability_score_thresh=stability_score_thresh,
-        stability_score_offset=stability_score_offset,
-        box_nms_thresh=box_nms_thresh,
-        crop_n_layers=crop_n_layers,
-        crop_nms_thresh=crop_nms_thresh,
-        crop_overlap_ratio=512 / 1500,
-        crop_n_points_downscale_factor=1,
-        point_grids=None,
-        min_mask_region_area=min_mask_region_area,
-        output_mode='binary_mask'
-    )
-
-    # input is image, type: numpy
-    if type(input_x) == np.ndarray:
-        result, mask_all = segment_one(input_x, mask_generator)
-        return result, mask_all
-    elif isinstance(input_x, str):  # input is video, type: path (str)
-        cap = cv2.VideoCapture(input_x)  # read video
-        frames_num = cap.get(cv2.CAP_PROP_FRAME_COUNT)
-        W, H = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-        fps = int(cap.get(cv2.CAP_PROP_FPS))
-        out = cv2.VideoWriter("output.mp4", cv2.VideoWriter_fourcc('x', '2', '6', '4'), fps, (W, H), isColor=True)
-        for _ in progress.tqdm(range(int(frames_num)), desc='Processing video ({} frames, size {}x{})'.format(int(frames_num), W, H)):
-            ret, frame = cap.read()  # read a frame
-            result, mask_all = segment_one(frame, mask_generator, seed=2023)
-            result = (result * 255).astype(np.uint8)
-            out.write(result)
-        out.release()
-        cap.release()
-        return 'output.mp4'
 
 
 with gr.Blocks() as demo:
@@ -82,9 +17,9 @@ with gr.Blocks() as demo:
             # select model
             model_type = gr.Dropdown(["vit_b", "vit_l", "vit_h"], value='vit_b', label="Select Model")
             # select device
-            device = gr.Dropdown(["cpu"], value='cpu', label="Select Device")
+            device = gr.Dropdown(["cpu", "cuda"], value='cpu', label="Select Device")
 
-            # 参数
+            # parameters
             with gr.Accordion(label='Parameters', open=False):
                 with gr.Row():
                     points_per_side = gr.Number(value=32, label="points_per_side", precision=0,
@@ -115,8 +50,14 @@ with gr.Blocks() as demo:
    with gr.Row().style(equal_height=True):
        with gr.Column():
            input_image = gr.Image(type="numpy")
-            with gr.Row():
-                button = gr.Button("Auto!")
+            text = gr.Textbox(label='Text prompt (optional)', info=
+                              'If you type words, the OWL-ViT model will be used to detect the objects in the image, '
+                              'and the boxes will be fed into the SAM model to predict masks. Please use English.',
+                              placeholder='Multiple words are separated by commas')
+            owl_vit_threshold = gr.Slider(value=0.1, minimum=0, maximum=1.0, step=0.01, label="OWL ViT Object Detection threshold",
+                                          info='''A small threshold will generate more objects, but may cause OOM.
+                                          A large threshold may detect no objects, resulting in an error.''')
+            button = gr.Button("Auto!")
        with gr.Tab(label='Image+Mask'):
            output_image = gr.Image(type='numpy')
        with gr.Tab(label='Mask'):
@@ -157,14 +98,14 @@ with gr.Blocks() as demo:
    )
 
    # button image
-    button.click(inference, inputs=[device, model_type, points_per_side, pred_iou_thresh, stability_score_thresh,
+    button.click(run_inference, inputs=[device, model_type, points_per_side, pred_iou_thresh, stability_score_thresh,
                                     min_mask_region_area, stability_score_offset, box_nms_thresh, crop_n_layers,
-                                    crop_nms_thresh, input_image],
+                                    crop_nms_thresh, owl_vit_threshold, input_image, text],
                  outputs=[output_image, output_mask])
    # button video
-    button_video.click(inference, inputs=[device, model_type, points_per_side, pred_iou_thresh, stability_score_thresh,
+    button_video.click(run_inference, inputs=[device, model_type, points_per_side, pred_iou_thresh, stability_score_thresh,
                                           min_mask_region_area, stability_score_offset, box_nms_thresh, crop_n_layers,
-                                          crop_nms_thresh, input_video],
+                                          crop_nms_thresh, owl_vit_threshold, input_video, text],
                        outputs=[output_video])
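The text-prompt path wired up in app.py (type words, OWL-ViT proposes boxes, SAM turns the boxes into masks) can also be driven without the UI. Below is a minimal sketch of calling `run_inference` directly; the prompt string is made up for illustration, and the SAM generator thresholds are assumed to follow `SamAutomaticMaskGenerator`'s usual defaults — they are in any case ignored on the text-prompt branch, which only uses `device`, `model_type`, `owl_vit_threshold`, the image, and the text.

```
import cv2
from inference import run_inference

# Load an RGB numpy array, the same form gr.Image(type="numpy") passes in.
image = cv2.cvtColor(cv2.imread("./images/20230408023615.png"), cv2.COLOR_BGR2RGB)

result, mask = run_inference(
    device="cpu",
    model_type="vit_b",
    points_per_side=32,
    pred_iou_thresh=0.88,        # assumed SAM defaults; unused on the text-prompt branch
    stability_score_thresh=0.95,
    min_mask_region_area=0,
    stability_score_offset=1.0,
    box_nms_thresh=0.7,
    crop_n_layers=0,
    crop_nms_thresh=0.7,
    owl_vit_threshold=0.1,
    input_x=image,
    input_text="dog",            # non-empty text routes to predictor_inference (OWL-ViT + SAM)
)

# On the text-prompt branch the first return value is a PIL image with the detected
# boxes drawn on top of the mask overlay; the second is the mask layer.
result.save("result.png")
```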
 
inference.py ADDED
@@ -0,0 +1,156 @@
+import cv2
+import torch
+import numpy as np
+import gradio as gr
+from PIL import Image, ImageDraw
+from segment_anything import sam_model_registry, SamAutomaticMaskGenerator, SamPredictor
+from transformers import OwlViTProcessor, OwlViTForObjectDetection
+import gc
+
+models = {
+    'vit_b': './checkpoints/sam_vit_b_01ec64.pth',
+    'vit_l': './checkpoints/sam_vit_l_0b3195.pth',
+    'vit_h': './checkpoints/sam_vit_h_4b8939.pth'
+}
+
+
+def plot_boxes(img, boxes):
+    img_pil = Image.fromarray(np.uint8(img * 255)).convert('RGB')
+    draw = ImageDraw.Draw(img_pil)
+    for box in boxes:
+        color = tuple(np.random.randint(0, 255, size=3).tolist())
+        x0, y0, x1, y1 = box
+        x0, y0, x1, y1 = int(x0), int(y0), int(x1), int(y1)
+        draw.rectangle([x0, y0, x1, y1], outline=color, width=6)
+    return img_pil
+
+
+def segment_one(img, mask_generator, seed=None):
+    if seed is not None:
+        np.random.seed(seed)
+    masks = mask_generator.generate(img)
+    sorted_anns = sorted(masks, key=(lambda x: x['area']), reverse=True)
+    mask_all = np.ones((img.shape[0], img.shape[1], 3))
+    for ann in sorted_anns:
+        m = ann['segmentation']
+        color_mask = np.random.random((1, 3)).tolist()[0]
+        for i in range(3):
+            mask_all[m == True, i] = color_mask[i]
+    result = img / 255 * 0.3 + mask_all * 0.7
+    return result, mask_all
+
+
+def generator_inference(device, model_type, points_per_side, pred_iou_thresh, stability_score_thresh,
+                        min_mask_region_area, stability_score_offset, box_nms_thresh, crop_n_layers, crop_nms_thresh,
+                        input_x, progress=gr.Progress()):
+    # sam model
+    sam = sam_model_registry[model_type](checkpoint=models[model_type]).to(device)
+    mask_generator = SamAutomaticMaskGenerator(
+        sam,
+        points_per_side=points_per_side,
+        pred_iou_thresh=pred_iou_thresh,
+        stability_score_thresh=stability_score_thresh,
+        stability_score_offset=stability_score_offset,
+        box_nms_thresh=box_nms_thresh,
+        crop_n_layers=crop_n_layers,
+        crop_nms_thresh=crop_nms_thresh,
+        crop_overlap_ratio=512 / 1500,
+        crop_n_points_downscale_factor=1,
+        point_grids=None,
+        min_mask_region_area=min_mask_region_area,
+        output_mode='binary_mask'
+    )
+
+    # input is image, type: numpy
+    if type(input_x) == np.ndarray:
+        result, mask_all = segment_one(input_x, mask_generator)
+        return result, mask_all
+    elif isinstance(input_x, str):  # input is video, type: path (str)
+        cap = cv2.VideoCapture(input_x)  # read video
+        frames_num = cap.get(cv2.CAP_PROP_FRAME_COUNT)
+        W, H = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+        fps = int(cap.get(cv2.CAP_PROP_FPS))
+        out = cv2.VideoWriter("output.mp4", cv2.VideoWriter_fourcc('x', '2', '6', '4'), fps, (W, H), isColor=True)
+        for _ in progress.tqdm(range(int(frames_num)),
+                               desc='Processing video ({} frames, size {}x{})'.format(int(frames_num), W, H)):
+            ret, frame = cap.read()  # read a frame
+            result, mask_all = segment_one(frame, mask_generator, seed=2023)
+            result = (result * 255).astype(np.uint8)
+            out.write(result)
+        out.release()
+        cap.release()
+        return 'output.mp4'
+
+
+def predictor_inference(device, model_type, input_x, input_text, owl_vit_threshold=0.1):
+    # sam model
+    sam = sam_model_registry[model_type](checkpoint=models[model_type]).to(device)
+    predictor = SamPredictor(sam)
+    predictor.set_image(input_x)  # Process the image to produce an image embedding
+
+    # split input text
+    input_text = [input_text.split(',')]
+
+    # OWL-ViT model
+    # processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")
+    # owlvit_model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32").to(device)
+    processor = OwlViTProcessor.from_pretrained('./checkpoints/models--google--owlvit-base-patch32')
+    owlvit_model = OwlViTForObjectDetection.from_pretrained("./checkpoints/models--google--owlvit-base-patch32").to(device)
+
+    # get outputs
+    input_text = processor(text=input_text, images=input_x, return_tensors="pt").to(device)
+    outputs = owlvit_model(**input_text)
+    target_size = torch.Tensor([input_x.shape[:2]]).to(device)
+    results = processor.post_process_object_detection(outputs=outputs, target_sizes=target_size,
+                                                      threshold=owl_vit_threshold)
+
+    # get the box with best score
+    scores = torch.sigmoid(outputs.logits)
+    # best_scores, best_idxs = torch.topk(scores, k=1, dim=1)
+    # best_idxs = best_idxs.squeeze(1).tolist()
+
+    i = 0  # Retrieve predictions for the first image for the corresponding text queries
+    boxes_tensor = results[i]["boxes"]  # [best_idxs]
+    print(boxes_tensor.size())
+    boxes = boxes_tensor.cpu().detach().numpy()
+    transformed_boxes = predictor.transform.apply_boxes_torch(torch.Tensor(boxes).to(device),
+                                                              input_x.shape[:2])  # apply transform to original boxes
+
+    # predict segmentation according to the boxes
+    masks, scores, logits = predictor.predict_torch(
+        point_coords=None,
+        point_labels=None,
+        boxes=transformed_boxes,  # boxes detected by OWL-ViT
+        multimask_output=False,
+    )
+    masks = masks.cpu().detach().numpy()
+    mask_all = np.ones((input_x.shape[0], input_x.shape[1], 3))
+    for ann in masks:
+        color_mask = np.random.random((1, 3)).tolist()[0]
+        for i in range(3):
+            mask_all[ann[0] == True, i] = color_mask[i]
+    img = input_x / 255 * 0.3 + mask_all * 0.7
+    img = plot_boxes(img, boxes_tensor)  # image + mask + boxes
+
+    # free the memory
+    owlvit_model.cpu()
+    del owlvit_model
+    del input_text
+    gc.collect()
+    torch.cuda.empty_cache()
+
+    return img, mask_all
+
+
+def run_inference(device, model_type, points_per_side, pred_iou_thresh, stability_score_thresh, min_mask_region_area,
+                  stability_score_offset, box_nms_thresh, crop_n_layers, crop_nms_thresh, owl_vit_threshold, input_x,
+                  input_text):
+    print('prompt text: ', input_text)
+    if input_text != '' and not isinstance(input_x, str):  # user input text
+        print('use predictor_inference')
+        return predictor_inference(device, model_type, input_x, input_text, owl_vit_threshold)
+    else:
+        print('use generator_inference')
+        return generator_inference(device, model_type, points_per_side, pred_iou_thresh, stability_score_thresh,
+                                   min_mask_region_area, stability_score_offset, box_nms_thresh, crop_n_layers,
+                                   crop_nms_thresh, input_x)
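The random-colour overlay (`mask_all[m == True, i] = color_mask[i]` followed by `img / 255 * 0.3 + mask_all * 0.7`) is written out twice in this file, once in `segment_one` and once in `predictor_inference`. One way to keep that blend in a single place is a small helper like the sketch below; this is a refactoring suggestion, not code from the commit.

```
import numpy as np


def overlay_masks(image, masks, alpha=0.7, seed=None):
    """Alpha-blend randomly coloured binary masks onto an RGB image.

    image: HxWx3 uint8 array; masks: iterable of HxW boolean arrays.
    Returns (blended, mask_layer) as float arrays, mirroring what
    segment_one and predictor_inference compute inline.
    """
    rng = np.random.default_rng(seed)
    mask_layer = np.ones((image.shape[0], image.shape[1], 3))
    for m in masks:
        mask_layer[m] = rng.random(3)  # one random colour per mask
    blended = image / 255 * (1 - alpha) + mask_layer * alpha
    return blended, mask_layer
```

With such a helper, `segment_one` could call `overlay_masks(img, (ann['segmentation'] for ann in sorted_anns), seed=seed)` and `predictor_inference` could pass `(m[0] for m in masks)`, so the 0.3/0.7 weighting lives in one spot.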
requirements.txt CHANGED
@@ -3,5 +3,4 @@ numpy==1.21.5
 opencv_python==4.6.0.66
 Pillow==9.5.0
 segment_anything==1.0
-torch
-torchvision
+transformers==4.27.4