rxavier committed
Commit 1139c3b
1 Parent(s): 81502c7

Update off_topic.py

Files changed (1): off_topic.py (+64, -35)
off_topic.py CHANGED
@@ -13,12 +13,16 @@ import imagehash
 from transformers import CLIPModel, CLIPProcessor
 from PIL import Image
 
+import nest_asyncio
+nest_asyncio.apply()
+
 
 class OffTopicDetector:
-    def __init__(self, model_id: str, device: Optional[str] = None):
+    def __init__(self, model_id: str, device: Optional[str] = None, image_size: str = "E"):
         self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
         self.processor = CLIPProcessor.from_pretrained(model_id)
         self.model = CLIPModel.from_pretrained(model_id).to(self.device)
+        self.image_size = image_size
 
     def predict_probas(self, images: List[PIL.Image.Image], domain: str,
                        valid_templates: Optional[List[str]] = None,
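
The nest_asyncio.apply() call added above is what lets get_images keep using asyncio.run() for the picture downloads in environments that already have an event loop running, such as a Jupyter notebook; without the patch, the nested call raises a RuntimeError. A minimal sketch of the pattern (the coroutine and URL below are illustrative, not from this file):

import asyncio

import nest_asyncio

nest_asyncio.apply()  # allow asyncio.run() inside an already-running event loop


async def fetch(url: str) -> str:
    await asyncio.sleep(0)  # stand-in for an async HTTP request
    return url

# Without nest_asyncio, calling asyncio.run() from a running loop (e.g. a Jupyter cell)
# fails with "asyncio.run() cannot be called from a running event loop".
print(asyncio.run(fetch("https://example.com/image.jpg")))
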
@@ -35,48 +39,36 @@ class OffTopicDetector:
         print(f"Valid classes: {valid_classes}", f"Invalid classes: {invalid_classes}", sep="\n")
         n_classes = len(classes)
 
+        if self.device == "cuda":
+            torch.cuda.synchronize()
         start = time.time()
         inputs = self.processor(text=classes, images=images, return_tensors="pt", padding=True).to(self.device)
         if self.device == "cpu" and autocast is True:
-            print("Disabling autocast due to device='cpu'.")
             autocast = False
         with torch.autocast(self.device, enabled=autocast):
             with torch.no_grad():
                 outputs = self.model(**inputs)
                 probas = outputs.logits_per_image.softmax(dim=1).cpu().numpy()  # we can take the softmax to get the label probabilities
+        if self.device == "cuda":
+            torch.cuda.synchronize()
         end = time.time()
         duration = end - start
-        print(f"Device: {self.device}",
-              f"Response time: {duration}s",
-              f"Response time per image: {round(duration/len(images), 2) * 1000}ms",
+        print(f"Model time: {round(duration, 2)} s",
+              f"Model time per image: {round(duration/len(images) * 1000, 0)} ms",
               sep="\n")
         valid_probas = probas[:, 0:n_valid].sum(axis=1, keepdims=True)
         invalid_probas = probas[:, n_valid:n_classes].sum(axis=1, keepdims=True)
         return probas, valid_probas, invalid_probas
 
-    def show(self, images: List[PIL.Image.Image], valid_probas: np.ndarray, n_cols: int = 3, title: Optional[str] = None, threshold: Optional[float] = None):
-        if threshold is not None:
-            prediction = self.apply_threshold(valid_probas, threshold)
-            title_scores = [f"Valid: {pred.squeeze()}" for pred in prediction]
-        else:
-            prediction = np.round(valid_probas[:, 0], 2)
-            title_scores = [f"Valid: {pred:.2f}" for pred in prediction]
-        n_images = len(images)
-        n_rows = int(np.ceil(n_images / n_cols))
-        fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 16))
-        for i, ax in enumerate(axes.ravel()):
-            ax.axis("off")
-            try:
-                ax.imshow(images[i])
-                ax.set_title(title_scores[i])
-            except IndexError:
-                continue
-        if title:
-            fig.suptitle(title)
-        fig.tight_layout()
-        return
+    def predict_probas_url(self, img_urls: List[str], domain: str,
+                           valid_templates: Optional[List[str]] = None,
+                           invalid_classes: Optional[List[str]] = None,
+                           autocast: bool = True):
+        images = self.get_images(img_urls)
+        dedup_images = self._filter_dups(images)
+        return self.predict_probas(images, domain, valid_templates, invalid_classes, autocast)
 
-    def predict_item_probas(self, url_or_id: str,
+    def predict_probas_item(self, url_or_id: str,
                             valid_templates: Optional[List[str]] = None,
                             invalid_classes: Optional[List[str]] = None):
         images, domain = self.get_item_data(url_or_id)
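
In the hunk above, the model call is now bracketed by torch.cuda.synchronize() on both sides of the timers, so the printed "Model time" measures completed GPU work rather than just the asynchronous kernel launches. A minimal sketch of that timing pattern, assuming an arbitrary module and input batch (both hypothetical names):

import time
from typing import Tuple

import torch


def timed_forward(model: torch.nn.Module, batch: torch.Tensor) -> Tuple[torch.Tensor, float]:
    """Time a forward pass, forcing pending CUDA work to finish before reading the clock."""
    if batch.is_cuda:
        torch.cuda.synchronize()
    start = time.time()
    with torch.no_grad():
        out = model(batch)
    if batch.is_cuda:
        torch.cuda.synchronize()
    return out, time.time() - start

One detail worth noting: as written, the new predict_probas_url builds dedup_images but still passes images to predict_probas, so the deduplicated list is presumably what was meant to be forwarded.
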
@@ -92,24 +84,38 @@ class OffTopicDetector:
             item_id = "".join(url_or_id.split("/")[3].split("-")[:2])
         else:
             item_id = re.sub("-", "", url_or_id)
+        start = time.time()
         response = httpx.get(f"https://api.mercadolibre.com/items/{item_id}").json()
         domain = re.sub("_", " ", response["domain_id"].split("-")[-1]).lower()
         img_urls = [x["url"] for x in response["pictures"]]
+        img_urls = [x.replace("-O.jpg", f"-{self.image_size}.jpg") for x in img_urls]
+        end = time.time()
+        duration = end - start
+        print(f"Items API time: {round(duration * 1000, 0)} ms")
         images = self.get_images(img_urls)
-        hashes = {}
-        for img in images:
-            hashes.update({str(imagehash.average_hash(img)): img})
-        dedup_hashes = list(dict.fromkeys(hashes))
-        dedup_images = [img for hash, img in hashes.items() if hash in dedup_hashes]
+        dedup_images = self._filter_dups(images)
         return dedup_images, domain
 
+    def _filter_dups(self, images: List):
+        if len(images) > 1:
+            hashes = {}
+            for img in images:
+                hashes.update({str(imagehash.average_hash(img)): img})
+            dedup_hashes = list(dict.fromkeys(hashes))
+            dedup_images = [img for hash, img in hashes.items() if hash in dedup_hashes]
+        else:
+            dedup_images = images
+        if (diff := len(images) - len(dedup_images)) > 0:
+            print(f"Filtered {diff} images out of {len(images)} due to matching hashes.")
+        return dedup_images
+
     def get_images(self, urls: List[str]):
         start = time.time()
         images = asyncio.run(self._gather_download_tasks(urls))
         end = time.time()
         duration = end - start
-        print(f"Download time: {duration}s",
-              f"Download time per image: {round(duration/len(urls), 2) * 1000}ms",
+        print(f"Download time: {round(duration, 2)} s",
+              f"Download time per image: {round(duration/len(urls) * 1000, 0)} ms",
              sep="\n")
        return asyncio.run(self._gather_download_tasks(urls))
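
The new _filter_dups helper above removes near-duplicate listing photos by comparing imagehash.average_hash values and keeping a single image per hash. The same idea in a standalone sketch (the function name is ours, not from the file); as in the commit, a later duplicate simply replaces an earlier one with the same hash:

from typing import List

import imagehash
from PIL import Image


def filter_duplicates(images: List[Image.Image]) -> List[Image.Image]:
    """Keep one image per average-hash value."""
    by_hash = {}
    for img in images:
        by_hash[str(imagehash.average_hash(img))] = img  # same hash -> overwritten, i.e. deduplicated
    return list(by_hash.values())
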
@@ -139,4 +145,27 @@ class OffTopicDetector:
         if save_images:
             with open(re.sub("D_NQ_NP_", "", img_url.split("/")[-1]) , "wb") as f:
                 f.write(img.content)
-        return images, domain
+        return images, domain
+
+    def show(self, images: List[PIL.Image.Image], valid_probas: np.ndarray, n_cols: int = 3,
+             title: Optional[str] = None, threshold: Optional[float] = None):
+        if threshold is not None:
+            prediction = self.apply_threshold(valid_probas, threshold)
+            title_scores = [f"Valid: {pred.squeeze()}" for pred in prediction]
+        else:
+            prediction = np.round(valid_probas[:, 0], 2)
+            title_scores = [f"Valid: {pred:.2f}" for pred in prediction]
+        n_images = len(images)
+        n_rows = int(np.ceil(n_images / n_cols))
+        fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 16))
+        for i, ax in enumerate(axes.ravel()):
+            ax.axis("off")
+            try:
+                ax.imshow(images[i])
+                ax.set_title(title_scores[i])
+            except IndexError:
+                continue
+        if title:
+            fig.suptitle(title)
+        fig.tight_layout()
+        return
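
The show method added above titles each tile either with the rounded valid probability or, when a threshold is passed, with whatever apply_threshold returns. That helper is not part of this diff; assuming it is a plain cutoff on the valid-probability column, it would look roughly like this hypothetical sketch:

import numpy as np


def apply_threshold(valid_probas: np.ndarray, threshold: float) -> np.ndarray:
    # Hypothetical: the real apply_threshold lives elsewhere in the repository.
    return valid_probas >= threshold
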
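
Putting the renamed entry points together, a typical call sequence might look like the sketch below. The model id and item id are placeholders, and it assumes predict_probas_item forwards to predict_probas and returns the same (probas, valid_probas, invalid_probas) triple:

from off_topic import OffTopicDetector

# Placeholder model id and item id, not taken from this commit.
detector = OffTopicDetector("openai/clip-vit-base-patch32", image_size="E")

# Fetch a listing's pictures (resized via image_size, deduplicated) and classify them.
probas, valid_probas, invalid_probas = detector.predict_probas_item("MLA123456789")

# Or classify known picture URLs directly against an explicit domain string.
# probas, valid_probas, invalid_probas = detector.predict_probas_url(img_urls, domain="sneakers")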