# hf-similarity-check / check_hkid_validity.py
# Mitul Mohammad Abdullah Al Mukit - "update" (9312707)
from cnocr import CnOcr
def string_similarity(s1, s2):  # Levenshtein distance algorithm
    """Return the similarity of two strings as a percentage (0.0-100.0).

    Spaces are removed and both strings are lowercased before comparison.
    The score is ``(1 - edit_distance / longer_length) * 100`` rounded to
    one decimal place, where the edit distance counts single-character
    insertions, deletions and substitutions.
    """
    a = s1.replace(' ', '').lower()
    b = s2.replace(' ', '').lower()

    # Identical strings (including two empty ones) are a perfect match;
    # returning here also keeps the division below away from a zero length.
    if a == b:
        return 100.0

    # Only the previous row of the distance table is needed to build the
    # next one. Row 0 is the cost of turning "" into each prefix of b.
    previous = list(range(len(b) + 1))
    for i, ch_a in enumerate(a, start=1):
        current = [i]  # cost of turning a[:i] into ""
        for j, ch_b in enumerate(b, start=1):
            deletion = previous[j] + 1
            insertion = current[j - 1] + 1
            substitution = previous[j - 1] + (0 if ch_a == ch_b else 1)
            current.append(min(deletion, insertion, substitution))
        previous = current

    distance = previous[-1]
    similarity = (1 - distance / max(len(a), len(b))) * 100
    return round(similarity, 1)
def is_good_subsequence(s1, s2):
    """Check whether enough of ``s2`` appears, in order, inside ``s1``.

    Both strings are compared case-insensitively. Characters of ``s2`` are
    matched greedily from left to right against ``s1``; each match must
    occur after the previous one.

    Args:
        s1: The string that is searched.
        s2: The string whose characters are looked for in ``s1``.

    Returns:
        ``False`` if ``s2`` is more than 10 characters longer than ``s1``.
        Otherwise ``True`` when at least 50% of the characters of ``s2``
        were matched, ``False`` if fewer were.
    """
    s1 = s1.lower()
    s2 = s2.lower()
    # Measure after lowercasing: lower() can change the length of a string
    # (e.g. U+0130 becomes two code points), and both the scan and the
    # threshold must use the lengths of the text actually compared.
    len_s1 = len(s1)
    len_s2 = len(s2)

    if len_s2 > len_s1 + 10:
        return False

    match_count = 0
    search_from = 0
    for char in s2:
        # Look for the character in s1 at or after the last matched position.
        found_at = s1.find(char, search_from)
        if found_at == -1:
            # The rest of s1 has been scanned without a hit, so no later
            # character of s2 can match either.
            break
        match_count += 1
        search_from = found_at + 1

    # At least half of s2 must have been matched.
    return match_count >= (0.5 * len_s2)
def check_hkid(path):
    """Return True if OCR finds the HKID card title in the image at ``path``.

    Runs CnOcr (``en_PP-OCRv3`` recognition model) on ``path`` and compares
    the ``'text'`` of each result against 'HONGKONGPERMANENTIDENTITYCARD'
    using ``string_similarity``. A score above 60 on any result counts as
    a hit; otherwise False is returned.
    """
    # A new recognizer is constructed on every call.
    # Alternative recognition model left disabled by the original author:
    # 'densenet_lite_136-fc'
    recognizer = CnOcr(rec_model_name='en_PP-OCRv3')
    results = recognizer.ocr(path)
    # any() stops at the first result that clears the threshold.
    return any(
        string_similarity('HONGKONGPERMANENTIDENTITYCARD', item['text']) > 60
        for item in results
    )
# print(check_hkid('image/hkid.jpg'))