File size: 4,999 Bytes
2cd560a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 |
import cv2
import numpy as np
import supervision as sv
from typing import List
from PIL import Image
import torch
from groundingdino.util.inference import Model
from segment_anything import sam_model_registry, SamPredictor
# Tag2Text
# from ram.models import tag2text_caption
from ram.models import ram
# from ram import inference_tag2text
from ram import inference_ram
import torchvision
import torchvision.transforms as TS
# Hyper-Params
SOURCE_IMAGE_PATH = "./assets/demo9.jpg"
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
GROUNDING_DINO_CONFIG_PATH = "GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py"
GROUNDING_DINO_CHECKPOINT_PATH = "./groundingdino_swint_ogc.pth"
SAM_ENCODER_VERSION = "vit_h"
SAM_CHECKPOINT_PATH = "./sam_vit_h_4b8939.pth"
TAG2TEXT_CHECKPOINT_PATH = "./tag2text_swin_14m.pth"
RAM_CHECKPOINT_PATH = "./ram_swin_large_14m.pth"
TAG2TEXT_THRESHOLD = 0.64
BOX_THRESHOLD = 0.2
TEXT_THRESHOLD = 0.2
IOU_THRESHOLD = 0.5
# Building GroundingDINO inference model
grounding_dino_model = Model(model_config_path=GROUNDING_DINO_CONFIG_PATH, model_checkpoint_path=GROUNDING_DINO_CHECKPOINT_PATH)
# Building SAM Model and SAM Predictor
sam = sam_model_registry[SAM_ENCODER_VERSION](checkpoint=SAM_CHECKPOINT_PATH)
sam_predictor = SamPredictor(sam)
# Tag2Text
# initialize Tag2Text
normalize = TS.Normalize(
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]
)
transform = TS.Compose(
[
TS.Resize((384, 384)),
TS.ToTensor(),
normalize
]
)
DELETE_TAG_INDEX = [] # filter out attributes and action which are difficult to be grounded
for idx in range(3012, 3429):
DELETE_TAG_INDEX.append(idx)
# tag2text_model = tag2text_caption(
# pretrained=TAG2TEXT_CHECKPOINT_PATH,
# image_size=384,
# vit='swin_b',
# delete_tag_index=DELETE_TAG_INDEX
# )
# # threshold for tagging
# # we reduce the threshold to obtain more tags
# tag2text_model.threshold = TAG2TEXT_THRESHOLD
# tag2text_model.eval()
# tag2text_model = tag2text_model.to(DEVICE)
ram_model = ram(pretrained=RAM_CHECKPOINT_PATH,
image_size=384,
vit='swin_l')
ram_model.eval()
ram_model = ram_model.to(DEVICE)
# load image
image = cv2.imread(SOURCE_IMAGE_PATH) # bgr
image_pillow = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB)) # rgb
image_pillow = image_pillow.resize((384, 384))
image_pillow = transform(image_pillow).unsqueeze(0).to(DEVICE)
specified_tags='None'
# res = inference_tag2text(image_pillow , tag2text_model, specified_tags)
res = inference_ram(image_pillow , ram_model)
# Currently ", " is better for detecting single tags
# while ". " is a little worse in some case
AUTOMATIC_CLASSES=res[0].split(" | ")
print(f"Tags: {res[0].replace(' |', ',')}")
# detect objects
detections = grounding_dino_model.predict_with_classes(
image=image,
classes=AUTOMATIC_CLASSES,
box_threshold=BOX_THRESHOLD,
text_threshold=BOX_THRESHOLD
)
# NMS post process
print(f"Before NMS: {len(detections.xyxy)} boxes")
nms_idx = torchvision.ops.nms(
torch.from_numpy(detections.xyxy),
torch.from_numpy(detections.confidence),
IOU_THRESHOLD
).numpy().tolist()
detections.xyxy = detections.xyxy[nms_idx]
detections.confidence = detections.confidence[nms_idx]
detections.class_id = detections.class_id[nms_idx]
print(f"After NMS: {len(detections.xyxy)} boxes")
# annotate image with detections
box_annotator = sv.BoxAnnotator()
labels = [
f"{AUTOMATIC_CLASSES[class_id]} {confidence:0.2f}"
for _, _, confidence, class_id, _
in detections]
annotated_frame = box_annotator.annotate(scene=image.copy(), detections=detections, labels=labels)
# save the annotated grounding dino image
cv2.imwrite("groundingdino_auto_annotated_image.jpg", annotated_frame)
# Prompting SAM with detected boxes
def segment(sam_predictor: SamPredictor, image: np.ndarray, xyxy: np.ndarray) -> np.ndarray:
sam_predictor.set_image(image)
result_masks = []
for box in xyxy:
masks, scores, logits = sam_predictor.predict(
box=box,
multimask_output=True
)
index = np.argmax(scores)
result_masks.append(masks[index])
return np.array(result_masks)
# convert detections to masks
detections.mask = segment(
sam_predictor=sam_predictor,
image=cv2.cvtColor(image, cv2.COLOR_BGR2RGB),
xyxy=detections.xyxy
)
# annotate image with detections
box_annotator = sv.BoxAnnotator()
mask_annotator = sv.MaskAnnotator()
labels = [
f"{AUTOMATIC_CLASSES[class_id]} {confidence:0.2f}"
for _, _, confidence, class_id, _
in detections]
annotated_image = mask_annotator.annotate(scene=image.copy(), detections=detections)
annotated_image = box_annotator.annotate(scene=annotated_image, detections=detections, labels=labels)
# save the annotated grounded-sam image
cv2.imwrite("ram_grounded_sam_auto_annotated_image.jpg", annotated_image) |