"""Fuzzy text-matching helpers and an OCR-based Hong Kong ID card check."""

try:
    from cnocr import CnOcr
except ImportError:
    # The string helpers below are pure Python; keep them importable even
    # when the OCR package is missing. check_hkid() reports the problem.
    CnOcr = None

# Card title compared against each OCR'd text line (spaces/case are ignored
# by string_similarity).
_HKID_TITLE = 'HONGKONGPERMANENTIDENTITYCARD'
# A line counts as the card title when its similarity score exceeds this.
_HKID_MIN_SIMILARITY = 60


def string_similarity(s1: str, s2: str) -> float:
    """Return a 0-100 similarity score based on Levenshtein distance.

    Both strings have spaces removed and are lower-cased before comparison.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        ``100.0`` when the normalized strings are equal; otherwise
        ``(1 - distance / max(len(s1), len(s2))) * 100`` rounded to one
        decimal place, where the lengths are those of the normalized strings.
    """
    s1 = s1.replace(' ', '').lower()
    s2 = s2.replace(' ', '').lower()

    # Equal strings (including two empty ones) short-circuit here, which
    # also guarantees the divisor below is never zero.
    if s1 == s2:
        return 100.0

    len1 = len(s1)
    len2 = len(s2)

    # Only the previous row of the edit-distance table is needed to compute
    # the next one, so keep two rows instead of the full matrix.
    previous = list(range(len2 + 1))
    for i, ch1 in enumerate(s1, start=1):
        current = [i]
        for j, ch2 in enumerate(s2, start=1):
            cost = 0 if ch1 == ch2 else 1
            current.append(min(
                previous[j] + 1,         # deletion
                current[j - 1] + 1,      # insertion
                previous[j - 1] + cost,  # substitution
            ))
        previous = current

    distance = previous[len2]
    similarity = (1 - distance / max(len1, len2)) * 100
    return round(similarity, 1)


def is_good_subsequence(s1: str, s2: str, min_ratio: float = 0.5,
                        max_extra_len: int = 10) -> bool:
    """Check whether enough of ``s2`` appears, in order, inside ``s1``.

    Comparison is case-insensitive. Characters of ``s2`` are matched greedily
    from left to right against ``s1``; a character of ``s2`` that cannot be
    found consumes the rest of ``s1``, so later characters cannot match.

    Args:
        s1: Text to search in.
        s2: Text whose characters are looked up in order.
        min_ratio: Minimum fraction of ``s2``'s characters that must match.
        max_extra_len: ``s2`` may be at most this many characters longer
            than ``s1``; otherwise the result is ``False`` immediately.

    Returns:
        ``True`` when the number of matched characters is at least
        ``min_ratio * len(s2)`` (so an empty ``s2`` always matches).
    """
    s1 = s1.lower()
    s2 = s2.lower()
    # Measure AFTER lower-casing: lower() can change the length of some
    # Unicode strings (e.g. 'İ' becomes two code points), and a stale length
    # would stop the scan before the real end of s1.
    len_s1 = len(s1)
    len_s2 = len(s2)

    if len_s2 > len_s1 + max_extra_len:
        return False

    match_count = 0
    s1_index = 0
    for char in s2:
        # Resume the search just past the previously matched position.
        while s1_index < len_s1:
            found = s1[s1_index] == char
            s1_index += 1
            if found:
                match_count += 1
                break

    return match_count >= min_ratio * len_s2


def check_hkid(path) -> bool:
    """Return ``True`` if OCR finds the HK permanent ID card title in an image.

    Args:
        path: Image input handed unchanged to ``CnOcr.ocr``.

    Returns:
        ``True`` when any recognized text line scores above
        ``_HKID_MIN_SIMILARITY`` against ``_HKID_TITLE``; ``False`` otherwise.

    Raises:
        ImportError: If the ``cnocr`` package is not installed.
    """
    if CnOcr is None:
        raise ImportError("cnocr is required for check_hkid()")

    # NOTE(review): the recognizer is constructed on every call; consider
    # reusing one instance if this is called repeatedly.
    ocr = CnOcr(rec_model_name='en_PP-OCRv3')
    # Alternative recognition model:
    # ocr = CnOcr(rec_model_name='densenet_lite_136-fc')
    results = ocr.ocr(path)
    return any(
        string_similarity(_HKID_TITLE, item['text']) > _HKID_MIN_SIMILARITY
        for item in results
    )


# Example usage:
# print(check_hkid('image/hkid.jpg'))