Spaces:

martintomov
/

InsectModelZoo

Running

App Files Files Community

Martin Tomov committed on Jun 22

Commit

3657d52

•

1 Parent(s): cd7cf5e

HF IDEA-Research/grounding-dino-base

Browse files

Files changed (1) hide show

gsl_utils.py +20 -55

gsl_utils.py CHANGED Viewed

@@ -1,57 +1,32 @@
-# GSL
 import os
 import torch
 import numpy as np
 from PIL import Image, ImageChops, ImageEnhance
 import cv2
 from simple_lama_inpainting import SimpleLama
-from segment_anything import build_sam, SamPredictor
-from GroundingDINO.groundingdino.util import box_ops
-from GroundingDINO.groundingdino.util.slconfig import SLConfig
-from GroundingDINO.groundingdino.util.utils import clean_state_dict
-from GroundingDINO.groundingdino.util.inference import annotate, load_image, predict
 from huggingface_hub import hf_hub_download
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-def load_model_hf(repo_id, filename, ckpt_config_filename, device='cpu'):
-    cache_config_file = hf_hub_download(repo_id=repo_id, filename=ckpt_config_filename)
-    args = SLConfig.fromfile(cache_config_file)
-    args.device = device
-    model = build_model(args)
-    cache_file = hf_hub_download(repo_id=repo_id, filename=filename)
-    checkpoint = torch.load(cache_file, map_location=device)
-    model.load_state_dict(clean_state_dict(checkpoint['model']), strict=False)
-    model.eval()
     return model
-groundingdino_model = load_model_hf(
-    repo_id="ShilongLiu/GroundingDINO",
-    filename="groundingdino_swinb_cogcoor.pth",
-    ckpt_config_filename="GroundingDINO_SwinB.cfg.py",
-    device=device
-)
-sam_predictor = SamPredictor(build_sam(checkpoint='sam_vit_h_4b8939.pth').to(device))
 simple_lama = SimpleLama()
 def detect(image, model, text_prompt='insect . flower . cloud', box_threshold=0.15, text_threshold=0.15):
-    boxes, logits, phrases = predict(
-        image=image,
-        model=model,
-        caption=text_prompt,
-        box_threshold=box_threshold,
-        text_threshold=text_threshold
-    )
-    annotated_frame = annotate(image_source=image, boxes=boxes, logits=logits, phrases=phrases)
-    annotated_frame = annotated_frame[..., ::-1]  # BGR to RGB
-    return annotated_frame, boxes, phrases
 def segment(image, sam_model, boxes):
     sam_model.set_image(image)
     H, W, _ = image.shape
-    boxes_xyxy = box_ops.box_cxcywh_to_xyxy(boxes) * torch.Tensor([W, H, W, H])
     transformed_boxes = sam_model.transform.apply_boxes_torch(boxes_xyxy.to(device), image.shape[:2])
     masks, _, _ = sam_model.predict_torch(
         point_coords=None,
@@ -81,18 +56,12 @@ def dilate_mask(mask, dilate_factor=15):
     )
     return mask
-def gsl_process_image(local_image_path):
-    # Load image
-    image_source, image = load_image(local_image_path)
-    # Detect insects
-    annotated_frame, detected_boxes, phrases = detect(image, model=groundingdino_model)
-    indices = [i for i, s in enumerate(phrases) if 'insect' in s]
-    # Segment insects
-    segmented_frame_masks = segment(image_source, sam_predictor, detected_boxes[indices])
-    # Combine masks
     final_mask = None
     for i in range(len(segmented_frame_masks) - 1):
         if final_mask is None:
@@ -100,23 +69,19 @@ def gsl_process_image(local_image_path):
         else:
             final_mask = np.bitwise_or(final_mask, segmented_frame_masks[i + 1][0].cpu())
-    # Draw mask
-    annotated_frame_with_mask = draw_mask(final_mask, image_source)
-    # Dilate mask
     mask = final_mask.numpy()
     mask = mask.astype(np.uint8) * 255
     mask = dilate_mask(mask)
     dilated_image_mask_pil = Image.fromarray(mask)
-    # Inpainting
-    result = simple_lama(image_source, dilated_image_mask_pil)
-    # Difference and composite
-    diff = ImageChops.difference(result, Image.fromarray(image_source))
     threshold = 7
     diff2 = diff.convert('L').point(lambda p: 255 if p > threshold else 0).convert('1')
-    img3 = Image.new('RGB', Image.fromarray(image_source).size, (255, 236, 10))
-    diff3 = Image.composite(Image.fromarray(image_source), img3, diff2)
     return diff3

 import os
 import torch
 import numpy as np
 from PIL import Image, ImageChops, ImageEnhance
 import cv2
 from simple_lama_inpainting import SimpleLama
+from transformers import pipeline
 from huggingface_hub import hf_hub_download
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+def load_groundingdino_model(device='cpu'):
+    model = pipeline(model="IDEA-Research/grounding-dino-base", task="zero-shot-object-detection", device=device)
     return model
+groundingdino_model = load_groundingdino_model(device=device)
+sam_predictor = None
 simple_lama = SimpleLama()
 def detect(image, model, text_prompt='insect . flower . cloud', box_threshold=0.15, text_threshold=0.15):
+    labels = [label if label.endswith('.') else label + '.' for label in text_prompt.split('.')]
+    results = model(image, candidate_labels=labels, threshold=box_threshold)
+    return results
 def segment(image, sam_model, boxes):
     sam_model.set_image(image)
     H, W, _ = image.shape
+    boxes_xyxy = torch.Tensor(boxes) * torch.Tensor([W, H, W, H])
     transformed_boxes = sam_model.transform.apply_boxes_torch(boxes_xyxy.to(device), image.shape[:2])
     masks, _, _ = sam_model.predict_torch(
         point_coords=None,
     )
     return mask
+def gsl_process_image(image):
+    image_source = Image.fromarray(image)
+    detected_boxes = detect(image_source, groundingdino_model)
+    boxes = [[d['box']['xmin'], d['box']['ymin'], d['box']['xmax'], d['box']['ymax']] for d in detected_boxes]
+    segmented_frame_masks = segment(image, sam_predictor, boxes)
     final_mask = None
     for i in range(len(segmented_frame_masks) - 1):
         if final_mask is None:
         else:
             final_mask = np.bitwise_or(final_mask, segmented_frame_masks[i + 1][0].cpu())
+    annotated_frame_with_mask = draw_mask(final_mask, image)
     mask = final_mask.numpy()
     mask = mask.astype(np.uint8) * 255
     mask = dilate_mask(mask)
     dilated_image_mask_pil = Image.fromarray(mask)
+    result = simple_lama(image, dilated_image_mask_pil)
+    diff = ImageChops.difference(result, Image.fromarray(image))
     threshold = 7
     diff2 = diff.convert('L').point(lambda p: 255 if p > threshold else 0).convert('1')
+    img3 = Image.new('RGB', Image.fromarray(image).size, (255, 236, 10))
+    diff3 = Image.composite(Image.fromarray(image), img3, diff2)
     return diff3