# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
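"""Client for a two-stage OCR pipeline served by Triton Inference Server.

A text detection model (the `feature_fusion/*` output names suggest EAST)
finds rotated word boxes in the input image, and a text recognition model
reads each cropped box via a CTC-style greedy decode.
"""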
import math

import cv2
import numpy as np
import tritonclient.http as httpclient

SAVE_INTERMEDIATE_IMAGES = False


def detection_preprocessing(image: cv2.Mat) -> np.ndarray:
    inpWidth = 640
    inpHeight = 480

    # Pre-process image: resize to 640x480, subtract the per-channel
    # ImageNet means, and swap R/B channels (swapRB=True)
    blob = cv2.dnn.blobFromImage(
        image, 1.0, (inpWidth, inpHeight), (123.68, 116.78, 103.94), True, False
    )
    # blobFromImage returns NCHW; transpose to the NHWC layout the detector expects
    blob = np.transpose(blob, (0, 2, 3, 1))
    return blob
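
# Illustrative shapes (not part of the pipeline): for any input image,
#   blob = detection_preprocessing(cv2.imread("some_image.jpg"))
# yields a float32 array of shape (1, 480, 640, 3) -- NHWC, resized to the
# 640x480 input the detection model expects.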


def detection_postprocessing(scores, geometry, preprocessed_image):
    def fourPointsTransform(frame, vertices):
        # Warp the rotated quad given by `vertices` into an upright
        # 100x32 patch, the input size the recognition model expects
        vertices = np.asarray(vertices)
        outputSize = (100, 32)
        targetVertices = np.array(
            [
                [0, outputSize[1] - 1],
                [0, 0],
                [outputSize[0] - 1, 0],
                [outputSize[0] - 1, outputSize[1] - 1],
            ],
            dtype="float32",
        )

        rotationMatrix = cv2.getPerspectiveTransform(vertices, targetVertices)
        result = cv2.warpPerspective(frame, rotationMatrix, outputSize)
        return result

    def decodeBoundingBoxes(scores, geometry, scoreThresh=0.5):
        detections = []
        confidences = []

        ############ CHECK DIMENSIONS AND SHAPES OF geometry AND scores ############
        assert len(scores.shape) == 4, "Incorrect dimensions of scores"
        assert len(geometry.shape) == 4, "Incorrect dimensions of geometry"
        assert scores.shape[0] == 1, "Invalid dimensions of scores"
        assert geometry.shape[0] == 1, "Invalid dimensions of geometry"
        assert scores.shape[1] == 1, "Invalid dimensions of scores"
        assert geometry.shape[1] == 5, "Invalid dimensions of geometry"
        assert (
            scores.shape[2] == geometry.shape[2]
        ), "Invalid dimensions of scores and geometry"
        assert (
            scores.shape[3] == geometry.shape[3]
        ), "Invalid dimensions of scores and geometry"
        height = scores.shape[2]
        width = scores.shape[3]
        for y in range(0, height):
            # Extract data from scores
            scoresData = scores[0][0][y]
            x0_data = geometry[0][0][y]
            x1_data = geometry[0][1][y]
            x2_data = geometry[0][2][y]
            x3_data = geometry[0][3][y]
            anglesData = geometry[0][4][y]
            for x in range(0, width):
                score = scoresData[x]

                # If score is lower than threshold score, move to next x
                if score < scoreThresh:
                    continue

                # Calculate offset; the score map is at 1/4 of the input resolution
                offsetX = x * 4.0
                offsetY = y * 4.0
                angle = anglesData[x]

                # Calculate cos and sin of angle
                cosA = math.cos(angle)
                sinA = math.sin(angle)
                h = x0_data[x] + x2_data[x]
                w = x1_data[x] + x3_data[x]

                # Calculate offset
                offset = [
                    offsetX + cosA * x1_data[x] + sinA * x2_data[x],
                    offsetY - sinA * x1_data[x] + cosA * x2_data[x],
                ]

                # Find points for rectangle
                p1 = (-sinA * h + offset[0], -cosA * h + offset[1])
                p3 = (-cosA * w + offset[0], sinA * w + offset[1])
                center = (0.5 * (p1[0] + p3[0]), 0.5 * (p1[1] + p3[1]))
                detections.append((center, (w, h), -1 * angle * 180.0 / math.pi))
                confidences.append(float(score))

        # Return detections and confidences
        return [detections, confidences]
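
    # A note on the decode above, assuming the detector is EAST (as the
    # output tensor names suggest): geometry channels 0-3 hold per-cell
    # distances to the top/right/bottom/left edges of the text box and
    # channel 4 holds the rotation angle, so h = top + bottom and
    # w = right + left recover the box size at each score-map cell.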

    # The detector emits NHWC; convert back to NCHW for decoding
    scores = scores.transpose(0, 3, 1, 2)
    geometry = geometry.transpose(0, 3, 1, 2)
    frame = np.squeeze(preprocessed_image, axis=0)
    # Reverse the R/B swap from preprocessing so OpenCV sees BGR again
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    [boxes, confidences] = decodeBoundingBoxes(scores, geometry)
    # Suppress overlapping rotated boxes, keeping the highest-scoring ones
    indices = cv2.dnn.NMSBoxesRotated(boxes, confidences, 0.5, 0.4)

    cropped_list = []
    if SAVE_INTERMEDIATE_IMAGES:
        cv2.imwrite("frame.png", frame)
    count = 0
    for i in indices:
        count += 1
        # Get 4 corners of the rotated rect and warp the crop upright
        vertices = cv2.boxPoints(boxes[i])
        cropped = fourPointsTransform(frame, vertices)
        if SAVE_INTERMEDIATE_IMAGES:
            cv2.imwrite(str(count) + ".png", cropped)
        # Grayscale, then scale pixel values from [0, 255] to [-1, 1]
        cropped = np.expand_dims(cv2.cvtColor(cropped, cv2.COLOR_BGR2GRAY), axis=0)
        cropped_list.append(((cropped / 255.0) - 0.5) * 2)

    cropped_arr = np.stack(cropped_list, axis=0)
    # Only keep the first image, since the models don't currently allow batching.
    # See Part 2 for enabling batch sizes > 1.
    return cropped_arr[None, 0]
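
# Shape walk-through for detection_postprocessing (illustrative): each kept
# crop is (1, 32, 100) after expand_dims, `cropped_arr` stacks to
# (N, 1, 32, 100), and `cropped_arr[None, 0]` returns (1, 1, 32, 100) -- a
# single-image batch for the recognition model.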


def recognition_postprocessing(scores: np.ndarray) -> str:
    text = ""
    alphabet = "0123456789abcdefghijklmnopqrstuvwxyz"
    # Swap the first two axes so the loop below walks over timesteps;
    # class 0 is the CTC blank, rendered here as "-"
    scores = np.transpose(scores, (1, 0, 2))
    for i in range(scores.shape[0]):
        c = np.argmax(scores[i][0])
        if c != 0:
            text += alphabet[c - 1]
        else:
            text += "-"

    # Adjacent duplicate letters as well as blank symbols must be removed
    # to get the final output
    char_list = []
    for i, char in enumerate(text):
        if char != "-" and (not (i > 0 and char == text[i - 1])):
            char_list.append(char)
    return "".join(char_list)
if __name__ == "__main__":
# Setting up client
client = httpclient.InferenceServerClient(url="localhost:8000")
# Read image and create input object
raw_image = cv2.imread("./img1.jpg")
preprocessed_image = detection_preprocessing(raw_image)
detection_input = httpclient.InferInput(
"input_images:0", preprocessed_image.shape, datatype="FP32"
)
detection_input.set_data_from_numpy(preprocessed_image, binary_data=True)
# Query the server
detection_response = client.infer(
model_name="text_detection", inputs=[detection_input]
)
# Process responses from detection model
scores = detection_response.as_numpy("feature_fusion/Conv_7/Sigmoid:0")
geometry = detection_response.as_numpy("feature_fusion/concat_3:0")
cropped_images = detection_postprocessing(scores, geometry, preprocessed_image)
# Create input object for recognition model
recognition_input = httpclient.InferInput(
"input.1", cropped_images.shape, datatype="FP32"
)
recognition_input.set_data_from_numpy(cropped_images, binary_data=True)
# Query the server
recognition_response = client.infer(
model_name="text_recognition", inputs=[recognition_input]
)
# Process response from recognition model
final_text = recognition_postprocessing(recognition_response.as_numpy("308"))
print(final_text)
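
# To try this client (a sketch; package and file names are assumptions):
# start a Triton server exposing HTTP on port 8000 with the text_detection
# and text_recognition models loaded, place an image at ./img1.jpg, then:
#   pip install "tritonclient[http]" opencv-python numpy
#   python client.py  # or whatever this file is saved as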