Spaces:

OneFi
/

hf-similarity-check

Sleeping

File size: 6,354 Bytes

1f72938

import re

def check_integer(string):
    """Report whether the string contains at least one digit character.

    Despite the name, the string does not have to be entirely numeric: a
    single character for which ``isdigit()`` is true is enough.

    Returns:
        True if any character is a digit, otherwise False (which includes
        the empty string).
    """
    # The whole-string test is only a fast path; the per-character scan
    # alone already yields the same answer.
    return string.isdigit() or any(symbol.isdigit() for symbol in string)

def check_alpha(string):
    """Report whether the string consists only of ASCII letters and spaces.

    Only 'a'-'z', 'A'-'Z' and the space character are accepted; digits,
    punctuation and non-ASCII letters make the result False.

    Returns:
        True if every character is allowed (vacuously True for the empty
        string), otherwise False.
    """
    return all(
        symbol == ' ' or 'a' <= symbol <= 'z' or 'A' <= symbol <= 'Z'
        for symbol in string
    )

def is_chinese_name(text):
    """Heuristically decide whether text looks like a name line.

    The text qualifies when it is at most 40 characters long and its first
    eight characters contain at least three ASCII upper-case letters, at
    least two ASCII lower-case letters and at least one space
    (e.g. 'TAMKing Man').
    NOTE(review): presumably this targets OCR output of romanised names
    with an upper-case surname - confirm against the caller.

    Returns:
        True if the heuristic matches, otherwise False.
    """
    if len(text) > 40:
        return False

    # The counts can only grow as a prefix gets longer, so if any prefix of
    # length 1..8 meets the thresholds the 8-character prefix does too.
    # Checking that single prefix is therefore equivalent to checking all.
    head = text[:8]
    upper_count = sum(1 for symbol in head if 'A' <= symbol <= 'Z')
    lower_count = sum(1 for symbol in head if 'a' <= symbol <= 'z')
    space_count = sum(1 for symbol in head if symbol == ' ')

    return upper_count >= 3 and lower_count >= 2 and space_count >= 1

def seperate_name(text):
    """Normalise a name by removing every space and lower-casing it.

    Despite the function name, no splitting into words is performed: the
    result is one contiguous lower-case string.

    Returns:
        The text without spaces, in lower case.
    """
    return text.replace(' ', '').lower()

def validate_hkid(hkid):
    """Check an HKID string against its check digit.

    Parentheses are ignored, so 'A123456(3)' and 'A1234563' are treated the
    same. The remaining text must be one or two upper-case letters, six
    digits and a check character ('0'-'9' or 'A').

    Each of the nine positions (an absent first letter counts as a
    position) is multiplied by a weight from 9 down to 1; the ID is valid
    when the weighted sum is divisible by 11.

    Returns:
        True if the shape matches and the checksum holds, otherwise False.
    """
    compact = hkid.replace('(', '').replace(')', '')

    parsed = re.match('^([A-Z])?([A-Z])([0-9]{6})([0-9A])$', compact)
    if parsed is None:
        return False

    # '0'-'9' map to 0-9, 'A'-'Z' to 10-35, and a missing first letter
    # (None) to 36.
    symbol_value = {
        symbol: value
        for value, symbol in enumerate('0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    }
    symbol_value[None] = 36

    first_letter, second_letter, digits, check_char = parsed.groups()
    symbols = [first_letter, second_letter, *digits, check_char]

    weighted_sum = 0
    for position, symbol in enumerate(symbols):
        weighted_sum += symbol_value[symbol] * (9 - position)

    return weighted_sum % 11 == 0

def format_HKID(hkid):
    """Return an HKID in the form '<letters><six digits>(<check>)'.

    Any parentheses already present are discarded first, so both
    'A1234563' and 'A123456(3)' yield 'A123456(3)'. One-letter and
    two-letter prefixes are both supported. The check digit itself is not
    verified here.

    Args:
        hkid: One or two upper-case letters, six digits and a check
            character ('0'-'9' or 'A'), with or without parentheses.

    Returns:
        The formatted HKID string.

    Raises:
        ValueError: If the text does not have the HKID shape.
    """
    compact = hkid.replace('(', '').replace(')', '')

    match = re.match(r'^([A-Z])?([A-Z])([0-9]{6})([0-9A])$', compact)
    if match is None:
        raise ValueError('hkid is not a well-formed HKID')

    # Build the result from the captured groups rather than from character
    # positions, so the bracket placement does not depend on whether the
    # optional first letter is present.
    first_letter, second_letter, digits, check_char = match.groups()
    return f"{first_letter or ''}{second_letter}{digits}({check_char})"

def format_issuedate(issuedate):
    """Strip every '(', ')' and 'C' character from an issue-date string.

    Returns:
        The input with all occurrences of those three characters removed;
        everything else is kept in order.
    """
    unwanted = str.maketrans('', '', '()C')
    return issuedate.translate(unwanted)

def is_string_integer(string):
    """Report whether ``int(string)`` succeeds.

    This accepts everything ``int()`` accepts, not just bare digits: a
    leading sign and surrounding whitespace are allowed as well.

    Returns:
        True if the conversion works, False if it raises ValueError.
    """
    try:
        int(string)
    except ValueError:
        return False
    else:
        return True

def check_issuedate(text):
    """Report whether text looks like an issue date of the form 'dd-dd'.

    The date may be wrapped as '(dd-dd)'; a leading 'C' is tolerated in
    place of the opening bracket.
    NOTE(review): the 'C' is presumably an OCR misread of '(' and the two
    fields presumably month and year - confirm with the caller.

    Args:
        text: Candidate issue-date string.

    Returns:
        True if, after removing the wrapper characters, the text is exactly
        two ASCII digits, a hyphen and two ASCII digits; otherwise False.
    """
    # The bare form is 5 characters and the fully wrapped form is 7.
    if len(text) < 5 or len(text) > 7:
        return False

    if text.startswith('('):
        text = text.replace('(', '')
    elif text.startswith('C'):
        text = text.replace('C', '')
    if text.endswith(')'):
        text = text.replace(')', '')

    # Require digits explicitly: int() would also accept signs and
    # whitespace, letting strings such as '+1-23' through.
    return re.fullmatch(r'[0-9]{2}-[0-9]{2}', text) is not None

def print_info(name, valid_hkid, hkid, issuedate):
    """Print the name, HKID (with its validity flag) and issue date.

    Three lines are written to standard output, in that order. Nothing is
    returned.
    """
    report_lines = (
        f'Name: {name}',
        f'HKID: {hkid} and validity: {valid_hkid}',
        f'Date of issue: {issuedate}',
    )
    print(*report_lines, sep='\n')

def is_comma_present(string):
    """Report whether the given string contains a comma."""
    has_comma = ',' in string
    return has_comma

def longest_common_subsequence(s1, s2):
    """Return one longest common subsequence of s1 and s2 as a string.

    A dynamic-programming table of subsequence lengths is filled first and
    then walked backwards from the bottom-right corner to recover the
    characters. When several subsequences share the maximum length, the
    walk moves left in s2 on ties, which determines which one is returned.

    Returns:
        The recovered subsequence ('' when there is no common character).
    """
    rows, cols = len(s1), len(s2)

    # table[r][c] holds the LCS length of s1[:r] and s2[:c]; row 0 and
    # column 0 stay at zero.
    table = [[0 for _ in range(cols + 1)] for _ in range(rows + 1)]
    for r, left in enumerate(s1, start=1):
        above = table[r - 1]
        current = table[r]
        for c, right in enumerate(s2, start=1):
            if left == right:
                current[c] = above[c - 1] + 1
            else:
                current[c] = max(above[c], current[c - 1])

    # Walk back through the table, collecting matches last-to-first.
    picked = []
    r, c = rows, cols
    while r and c:
        if s1[r - 1] == s2[c - 1]:
            picked.append(s1[r - 1])
            r -= 1
            c -= 1
        elif table[r - 1][c] > table[r][c - 1]:
            r -= 1
        else:
            c -= 1

    return ''.join(reversed(picked))

def combine_info(info1, info2):
    """Merge two extraction results for the same card into one record.

    Each input is indexed as [name, validity flag, HKID, issue date]; the
    validity flag is compared against the strings 'True' and 'False'.
    Both inputs are printed to standard output before merging.

    Merging rules:
      * name: taken as-is when both agree; when one is empty the other is
        used; otherwise the longest common subsequence of the two.
      * HKID: when exactly one side is flagged 'True' (and the other
        'False') that side's flag and HKID are used; when both are 'True'
        and the HKIDs are equal that HKID is used; in every other case the
        pair 'False', 'Suspicous HKID' is recorded.
      * issue date: kept when both agree, else 'Unmatched issuedate'.

    Returns:
        A four-element list [name, validity flag, HKID, issue date].
    """
    print(info1)
    print(info2)

    combined_info = []

    # Name
    if info1[0] == info2[0]:
        name = info1[0]
    elif info1[0] == '':
        name = info2[0]
    elif info2[0] == '':
        name = info1[0]
    else:
        name = longest_common_subsequence(info1[0], info2[0])
    combined_info.append(name)

    # Validity flag + HKID. Two 'True' results that disagree on the HKID
    # fall through to the suspicious branch, so the record always has
    # four elements.
    flags = (info1[1], info2[1])
    if flags == ('True', 'False'):
        combined_info += [info1[1], info1[2]]
    elif flags == ('False', 'True'):
        combined_info += [info2[1], info2[2]]
    elif flags == ('True', 'True') and info1[2] == info2[2]:
        combined_info += [info1[1], info1[2]]
    else:
        # The spelling of this marker is kept as-is; other code may compare
        # against it.
        combined_info += ['False', 'Suspicous HKID']

    # Issue date
    if info1[3] == info2[3]:
        combined_info.append(info1[3])
    else:
        combined_info.append('Unmatched issuedate')

    return combined_info



# info1 = ['', 'True', 'Z683365(5)', '06-96']
# info2 = ['lok wing', 'False', 'Z68336505)', '06-96']
# info = combine_info(info1, info2)
# print_info(*info)


# text = 'TAMKing Man'
# if is_comma_present(text):
#             text = text.replace(',', '')
#             if not check_integer(text):
#                 if check_alpha(text) and is_chinese_name(text):
#                     name = seperate_name(text)