OWL-ViT

Runtime error

App Files Files Community

kellyxiaowei committed on Jun 26, 2023

Commit

dc8253a

•

1 Parent(s): 88274ef

Update app.py

Browse files

Files changed (1) hide show

app.py +33 -30

app.py CHANGED Viewed

@@ -2,75 +2,78 @@ import torch
 import cv2
 import gradio as gr
 import numpy as np
-from transformers import OwlViTProcessor, OwlViTForObjectDetection
 import requests
-# Use the GPU if one is available, otherwise fall back to the CPU
 if torch.cuda.is_available():
     device = torch.device("cuda")
 else:
     device = torch.device("cpu")
-# Load the OWL-ViT model from the pretrained checkpoint "google/owlvit-large-patch14" and move it to the selected device
 model = OwlViTForObjectDetection.from_pretrained("google/owlvit-large-patch14").to(device)
 model.eval()
-# Load the processor from the same pretrained checkpoint
 processor = OwlViTProcessor.from_pretrained("google/owlvit-large-patch14")
-# Define a function that handles the image URL, the text queries and the score threshold
 def query_image(img_url, text_queries, score_threshold):
-    # Fetch the image from the URL with the requests library
-    response = requests.get(img_url)
-    response.raise_for_status()
-    arr = np.asarray(bytearray(response.content), dtype=np.uint8)
-    img = cv2.imdecode(arr, -1)  # -1 loads the image as-is
-    text_queries = text_queries.split(",")  # split the text into individual queries
     target_sizes = torch.Tensor([img.shape[:2]])
-    inputs = processor(text=text_queries, images=img, return_tensors="pt").to(device)  # build the model inputs with the processor
     with torch.no_grad():
-        outputs = model(**inputs)  # get the model outputs
-    # Move the outputs to the CPU
     outputs.logits = outputs.logits.cpu()
     outputs.pred_boxes = outputs.pred_boxes.cpu()
-    # Post-process the outputs with the processor
     results = processor.post_process(outputs=outputs, target_sizes=target_sizes)
     boxes, scores, labels = results[0]["boxes"], results[0]["scores"], results[0]["labels"]
     font = cv2.FONT_HERSHEY_SIMPLEX
-    # Draw the bounding boxes on the image and add the labels
     for box, score, label in zip(boxes, scores, labels):
         box = [int(i) for i in box.tolist()]
         if score >= score_threshold:
             img = cv2.rectangle(img, box[:2], box[2:], (255,0,0), 5)
-            y = box[3] - 10 if box[3] + 25 > 768 else box[3] + 25
             img = cv2.putText(
                 img, text_queries[label], (box[0], y), font, 1, (255,0,0), 2, cv2.LINE_AA
             )
     return img
 description = """
-Gradio demo for OWL-ViT.
-You can use OWL-ViT to query images with text descriptions of any object.
-To use it, simply provide an image URL and enter comma separated text descriptions of objects you want to query the image for.
-You can also use the score threshold slider to set a threshold to filter out low probability predictions.
-"""
-# Create a Gradio interface
 demo = gr.Interface(
     query_image,
-    inputs=["text", "text", gr.Slider(0, 1, value=0.1)],  # inputs changed to accept a URL instead of an image
     outputs="image",
     title="Zero-Shot Object Detection with OWL-ViT",
     description=description,
-    examples=[],  # set to an empty list
 )
-demo.launch()

 import cv2
 import gradio as gr
 import numpy as np
 import requests
+from PIL import Image
+from io import BytesIO
+from transformers import OwlViTProcessor, OwlViTForObjectDetection
+# Use GPU if available
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 # Model and processor are loaded from the same pretrained checkpoint.
 _CHECKPOINT = "google/owlvit-large-patch14"
 model = OwlViTForObjectDetection.from_pretrained(_CHECKPOINT)
 # Move the weights to the selected device and switch to evaluation mode.
 model = model.to(device).eval()
 processor = OwlViTProcessor.from_pretrained(_CHECKPOINT)
 def query_image(img_url, text_queries, score_threshold):
     """Detect the queried objects in the image at ``img_url`` and draw them.

     Args:
         img_url: URL of the image to download.
         text_queries: Comma separated object descriptions, e.g. ``"cat, dog"``.
         score_threshold: Detections scoring below this value are not drawn.

     Returns:
         The downloaded image as an RGB ``numpy`` array with a rectangle and
         the matching query text drawn for every detection that was kept.

     Raises:
         ValueError: If ``text_queries`` contains no non-blank query.
         requests.HTTPError: If the server answers with an error status.
     """
     # Users type "cat, dog": strip the blanks and drop empty entries so the
     # drawn labels are clean and the processor never receives an empty list.
     queries = [query.strip() for query in text_queries.split(",") if query.strip()]
     if not queries:
         raise ValueError("Please enter at least one text query.")
     # The timeout keeps a stalled server from blocking the request forever;
     # raise_for_status() stops error pages from reaching the image decoder.
     response = requests.get(img_url, timeout=30)
     response.raise_for_status()
     # Force 3-channel RGB: the source may be RGBA, palette or grayscale, while
     # the code below draws with 3-component colours on an H x W x 3 array.
     with Image.open(BytesIO(response.content)) as pil_img:
         img = np.array(pil_img.convert("RGB"))
     height = img.shape[0]
     target_sizes = torch.Tensor([img.shape[:2]])
     inputs = processor(text=queries, images=img, return_tensors="pt").to(device)
     with torch.no_grad():
         outputs = model(**inputs)
     # Post-processing is done on CPU tensors.
     outputs.logits = outputs.logits.cpu()
     outputs.pred_boxes = outputs.pred_boxes.cpu()
     results = processor.post_process(outputs=outputs, target_sizes=target_sizes)
     boxes, scores, labels = results[0]["boxes"], results[0]["scores"], results[0]["labels"]
     font = cv2.FONT_HERSHEY_SIMPLEX
     # Convert to plain Python numbers once so labels can index ``queries``.
     for box, score, label in zip(boxes.tolist(), scores.tolist(), labels.tolist()):
         if score < score_threshold:
             continue
         left, top, right, bottom = (int(value) for value in box)
         img = cv2.rectangle(img, (left, top), (right, bottom), (255, 0, 0), 5)
         # Put the label under the box unless that would run past the bottom
         # edge of this image; in that case move it just inside the box.
         y = bottom - 10 if bottom + 25 > height else bottom + 25
         img = cv2.putText(
             img, queries[label], (left, y), font, 1, (255, 0, 0), 2, cv2.LINE_AA
         )
     return img
 # Text shown under the title of the demo page.
 description = """
 Gradio demo for <a href="https://huggingface.co/docs/transformers/main/en/model_doc/owlvit">OWL-ViT</a>,
 introduced in <a href="https://arxiv.org/abs/2205.06230">Simple Open-Vocabulary Object Detection
 with Vision Transformers</a>.
 \n\nYou can use OWL-ViT to query images with text descriptions of any object.
 To use it, simply input the URL of an image and enter comma separated text descriptions of objects you want to query the image for. You
 can also use the score threshold slider to set a threshold to filter out low probability predictions.
 \n\nOWL-ViT is trained on text templates,
 hence you can get better predictions by querying the image with text templates used in training the original model: *"photo of a star-spangled banner"*,
 *"image of a shoe"*. Refer to the <a href="https://arxiv.org/abs/2103.00020">CLIP</a> paper to see the full list of text templates used to augment the training data.
 \n\n<a href="https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/zeroshot_object_detection_with_owlvit.ipynb">Colab demo</a>
 """
 # Inputs map positionally onto query_image(img_url, text_queries,
 # score_threshold): two text boxes and a slider from 0 to 1 starting at 0.1.
 demo = gr.Interface(
     fn=query_image,
     inputs=["text", "text", gr.Slider(minimum=0, maximum=1, value=0.1)],
     outputs="image",
     examples=[],
     title="Zero-Shot Object Detection with OWL-ViT",
     description=description,
 )
 demo.launch()