Spaces:

BAAI
/

tokenize-anything

Sleeping

App Files Community

PhyscalX committed on Dec 18, 2023

Commit

ae507fe

•

1 Parent(s): 7c01d17

Use a safer process for submission

Browse files

Files changed (1) hide show

app.py +10 -9

app.py CHANGED Viewed

@@ -44,9 +44,9 @@ class Predictor(object):
     def __init__(self, model, kwargs):
         self.model = model
         self.kwargs = kwargs
-        self.batch_size = kwargs.get("batch_size", 256)
         self.model.concept_projector.reset_weights(kwargs["concept_weights"])
-        self.model.text_decoder.reset_cache(max_batch_size=self.batch_size)
     def preprocess_images(self, imgs):
         """Preprocess the inference images."""
@@ -85,21 +85,21 @@ class Predictor(object):
         mask_index = np.arange(rank_scores.shape[0]), rank_scores.argmax(1)
         iou_scores = outputs["iou_pred"][mask_index].cpu().numpy().reshape(batch_shape)
         # Upscale masks to the original image resolution.
-        mask_pred = outputs["mask_pred"][mask_index][:, None]
         mask_pred = self.model.upscale_masks(mask_pred, im_batch.shape[1:-1])
         mask_pred = mask_pred.view(batch_shape + mask_pred.shape[2:])
         # Predict concepts.
         concepts, scores = self.model.predict_concept(outputs["sem_embeds"][mask_index])
         concepts, scores = [x.reshape(batch_shape) for x in (concepts, scores)]
         # Generate captions.
-        sem_tokens = outputs["sem_tokens"][mask_index][:, None, :]
         captions = self.model.generate_text(sem_tokens).reshape(batch_shape)
         # Postprocess results.
         results = []
         for i in range(batch_shape[0]):
             pred_h, pred_w = im_info[i, :2].astype("int")
             masks = mask_pred[i : i + 1, :, :pred_h, :pred_w]
-            masks = self.model.upscale_masks(masks, imgs[i].shape[:2])[0]
             results.append(
                 {
                     "scores": np.stack([iou_scores[i], scores[i]], axis=-1),
@@ -165,7 +165,8 @@ def build_gradio_app(queues, command):
         return click_img, draw_img, anno_img
     def on_submit_btn(click_img, mask_img, prompt, multipoint):
-        if prompt == 0:
             img, points = click_img["image"], click_img["points"]
             points = np.array(points).reshape((-1, 2, 3))
             if multipoint == 1:
@@ -175,7 +176,7 @@ def build_gradio_app(queues, command):
                 poly = points[np.where(points[:, 2] <= 1)[0]][None, :, :]
                 points = [lt, rb, poly] if len(lt) > 0 else [poly, np.array([[[0, 0, 4]]])]
                 points = np.concatenate(points, axis=1)
-        elif prompt == 1:
             img, points = mask_img["background"], []
             for layer in mask_img["layers"]:
                 ys, xs = np.nonzero(layer[:, :, 0])
@@ -189,8 +190,8 @@ def build_gradio_app(queues, command):
                 points = np.concatenate([points, pad_points], axis=1)
         img = img[:, :, (2, 1, 0)] if img is not None else img
         img = np.zeros((480, 640, 3), dtype="uint8") if img is None else img
-        points = (np.array([[[0, 0, 4]]]) if len(points) == 0 else points).astype("float32")
-        inputs = {"img": img, "points": points}
         with command.output_index.get_lock():
             command.output_index.value += 1
             img_id = command.output_index.value

     def __init__(self, model, kwargs):
         self.model = model
         self.kwargs = kwargs
+        self.prompt_size = kwargs.get("prompt_size", 256)
         self.model.concept_projector.reset_weights(kwargs["concept_weights"])
+        self.model.text_decoder.reset_cache(max_batch_size=self.prompt_size)
     def preprocess_images(self, imgs):
         """Preprocess the inference images."""
         mask_index = np.arange(rank_scores.shape[0]), rank_scores.argmax(1)
         iou_scores = outputs["iou_pred"][mask_index].cpu().numpy().reshape(batch_shape)
         # Upscale masks to the original image resolution.
+        mask_pred = outputs["mask_pred"][mask_index].unsqueeze_(1)
         mask_pred = self.model.upscale_masks(mask_pred, im_batch.shape[1:-1])
         mask_pred = mask_pred.view(batch_shape + mask_pred.shape[2:])
         # Predict concepts.
         concepts, scores = self.model.predict_concept(outputs["sem_embeds"][mask_index])
         concepts, scores = [x.reshape(batch_shape) for x in (concepts, scores)]
         # Generate captions.
+        sem_tokens = outputs["sem_tokens"][mask_index].unsqueeze_(1)
         captions = self.model.generate_text(sem_tokens).reshape(batch_shape)
         # Postprocess results.
         results = []
         for i in range(batch_shape[0]):
             pred_h, pred_w = im_info[i, :2].astype("int")
             masks = mask_pred[i : i + 1, :, :pred_h, :pred_w]
+            masks = self.model.upscale_masks(masks, imgs[i].shape[:2]).flatten(0, 1)
             results.append(
                 {
                     "scores": np.stack([iou_scores[i], scores[i]], axis=-1),
         return click_img, draw_img, anno_img
     def on_submit_btn(click_img, mask_img, prompt, multipoint):
+        img, points = None, np.array([[[0, 0, 4]]])
+        if prompt == 0 and click_img is not None:
             img, points = click_img["image"], click_img["points"]
             points = np.array(points).reshape((-1, 2, 3))
             if multipoint == 1:
                 poly = points[np.where(points[:, 2] <= 1)[0]][None, :, :]
                 points = [lt, rb, poly] if len(lt) > 0 else [poly, np.array([[[0, 0, 4]]])]
                 points = np.concatenate(points, axis=1)
+        elif prompt == 1 and mask_img is not None:
             img, points = mask_img["background"], []
             for layer in mask_img["layers"]:
                 ys, xs = np.nonzero(layer[:, :, 0])
                 points = np.concatenate([points, pad_points], axis=1)
         img = img[:, :, (2, 1, 0)] if img is not None else img
         img = np.zeros((480, 640, 3), dtype="uint8") if img is None else img
+        points = np.array([[[0, 0, 4]]]) if (len(points) == 0 or points.size == 0) else points
+        inputs = {"img": img, "points": points.astype("float32")}
         with command.output_index.get_lock():
             command.output_index.value += 1
             img_id = command.output_index.value