Spaces:

OneFi
/

hf-similarity-check

Sleeping

File size: 1,998 Bytes
from cnocr import CnOcr

def string_similarity(s1, s2):
    """Return a 0-100 similarity score based on Levenshtein edit distance.

    Both strings are compared case-insensitively with space characters
    removed. The score is ``(1 - distance / longer_length) * 100`` rounded to
    one decimal place; strings that are equal after normalisation score
    exactly ``100.0``.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        A float percentage; ``100.0`` means identical after normalisation.
    """
    left = s1.replace(' ', '').lower()
    right = s2.replace(' ', '').lower()

    # Equal strings (including two empty ones) short-circuit here, which also
    # guarantees the divisor below is never zero.
    if left == right:
        return 100.0

    # Rolling-row dynamic programme: `previous[k]` is the edit distance
    # between the already-consumed prefix of `left` and the first k
    # characters of `right`.
    previous = list(range(len(right) + 1))
    for row, left_char in enumerate(left, start=1):
        current = [row]
        for col, right_char in enumerate(right, start=1):
            substitution_cost = 0 if left_char == right_char else 1
            current.append(min(
                previous[col] + 1,                      # deletion
                current[col - 1] + 1,                   # insertion
                previous[col - 1] + substitution_cost,  # substitution
            ))
        previous = current

    distance = previous[-1]
    longest = max(len(left), len(right))
    return round((1 - distance / longest) * 100, 1)

def is_good_subsequence(s1, s2):
    """Report whether at least half of ``s2`` occurs, in order, within ``s1``.

    The comparison is case-insensitive. Characters of ``s2`` are matched
    greedily, left to right, against ``s1``; a character that cannot be found
    in the remaining part of ``s1`` is skipped and does not stop later
    characters from matching.

    Args:
        s1: The string searched in.
        s2: The candidate string whose characters are looked up in order.

    Returns:
        ``False`` when ``s2`` is more than 10 characters longer than ``s1``.
        Otherwise ``True`` when the number of matched characters is at least
        50% of ``len(s2)`` (an empty ``s2`` therefore yields ``True``).
    """
    s1 = s1.lower()
    s2 = s2.lower()

    # Measure after lowercasing: case mapping can change a string's length.
    len_s1 = len(s1)
    len_s2 = len(s2)

    if len_s2 > len_s1 + 10:
        return False

    match_count = 0
    search_from = 0

    for char in s2:
        # Look for the character at or after the position following the
        # previous match.
        found_at = s1.find(char, search_from)
        if found_at == -1:
            # Skip it and keep the search position, so one stray character
            # does not consume the rest of s1 and block all later matches.
            continue
        match_count += 1
        search_from = found_at + 1

    # Accept when at least 50% of s2's characters were matched in order.
    return match_count >= (0.5 * len_s2)

def check_hkid(path):
    """Return True if OCR output for ``path`` contains the HKID card title.

    ``path`` is handed straight to ``CnOcr.ocr`` (presumably an image file
    path - confirm against callers). Each returned entry's ``'text'`` field is
    scored against the expected title with ``string_similarity``; any score
    above 60 counts as a hit.

    Args:
        path: Input forwarded to ``CnOcr.ocr``.

    Returns:
        ``True`` as soon as one recognised text entry scores above 60,
        otherwise ``False``.
    """
    expected_title = 'HONGKONGPERMANENTIDENTITYCARD'

    # Alternative recognition model name kept for reference:
    # 'densenet_lite_136-fc'.
    recognizer = CnOcr(rec_model_name='en_PP-OCRv3')
    detections = recognizer.ocr(path)

    # any() stops at the first entry that clears the threshold.
    return any(
        string_similarity(expected_title, entry['text']) > 60
        for entry in detections
    )

# print(check_hkid('image/hkid.jpg'))