|
import pandas as pd |
|
import re |
|
|
|
|
|
def filter_rows_containing_all_keywords(df, keywords):
    """Return the rows of ``df`` that contain every keyword.

    A keyword counts as present in a row when it occurs as a literal
    substring of at least one of the tag columns ``copyright``,
    ``character``, ``artist``, ``meta`` or ``general``.  Columns that hold
    neither ``object`` nor pandas string data are ignored, and missing
    values never match.

    Args:
        df: DataFrame providing the five tag columns.
        keywords: Iterable of keyword strings; all of them must be found.

    Returns:
        The subset of ``df`` whose rows contain all keywords.  With no
        keywords every row is kept.

    Raises:
        KeyError: If ``keywords`` is non-empty and a tag column is missing.
    """
    tag_columns = ('copyright', 'character', 'artist', 'meta', 'general')

    final_mask = pd.Series(True, index=df.index, dtype=bool)
    for keyword in keywords:
        keyword_mask = pd.Series(False, index=df.index, dtype=bool)
        for name in tag_columns:
            column = df[name]
            # Only text columns support ``.str``.  Accept the dedicated
            # string dtype as well as ``object`` so string-typed frames are
            # searched instead of being silently skipped.
            is_text = (
                isinstance(column.dtype, pd.StringDtype)
                or column.dtype == 'object'
            )
            if not is_text:
                continue
            # regex=False matches the keyword literally, so tags containing
            # characters such as "(" or "." need no escaping.
            hits = column.str.contains(keyword, na=False, regex=False)
            keyword_mask |= hits.astype(bool)
        # A row survives only if every keyword was found somewhere in it.
        final_mask &= keyword_mask

    return df[final_mask]
|
|
|
def filter_rows_not_containing_all_keywords(df, keywords):
    """Return the rows of ``df`` that contain none of the keywords.

    A row is dropped as soon as any single keyword occurs as a literal
    substring of one of the tag columns ``copyright``, ``character``,
    ``artist``, ``meta`` or ``general``.  Columns that hold neither
    ``object`` nor pandas string data are ignored, and missing values never
    match.

    Args:
        df: DataFrame providing the five tag columns.
        keywords: Iterable of keyword strings to exclude.

    Returns:
        The subset of ``df`` whose rows contain no keyword at all.  With no
        keywords every row is kept.

    Raises:
        KeyError: If ``keywords`` is non-empty and a tag column is missing.
    """
    tag_columns = ('copyright', 'character', 'artist', 'meta', 'general')

    final_mask = pd.Series(True, index=df.index, dtype=bool)
    for keyword in keywords:
        keyword_mask = pd.Series(False, index=df.index, dtype=bool)
        for name in tag_columns:
            column = df[name]
            # Only text columns support ``.str``.  Accept the dedicated
            # string dtype as well as ``object`` so string-typed frames are
            # searched instead of being silently skipped.
            is_text = (
                isinstance(column.dtype, pd.StringDtype)
                or column.dtype == 'object'
            )
            if not is_text:
                continue
            # regex=False matches the keyword literally, so tags containing
            # characters such as "(" or "." need no escaping.
            hits = column.str.contains(keyword, na=False, regex=False)
            keyword_mask |= hits.astype(bool)
        # Remove every row in which this keyword was found.
        final_mask &= ~keyword_mask

    return df[final_mask]
|
|
|
def process_asterisk_group(df, asterisk_group):
    """Keep the rows of ``df`` that hold every ``*``-prefixed tag.

    For each row a search text is built as a leading space, the five tag
    columns (``copyright``, ``character``, ``artist``, ``meta``,
    ``general``) converted to strings and joined with ``', '``, and a
    trailing comma.  Each requested tag has its leading ``*`` characters
    removed and a comma appended; a row is kept only if its search text
    contains every resulting needle literally.  Because the needle ends with
    a comma, ``*solo`` does not match the tag ``solo focus``.

    The input frame is never modified; the search text lives in a local
    Series only.

    Args:
        df: DataFrame providing the five tag columns.
        asterisk_group: Iterable of tags, each optionally prefixed with
            ``*``.

    Returns:
        A new DataFrame with the matching rows (all rows when
        ``asterisk_group`` is empty).

    Raises:
        KeyError: If a tag column is missing from ``df``.
    """
    tag_columns = ['copyright', 'character', 'artist', 'meta', 'general']

    # str() mirrors a string cast of each cell, so missing values become the
    # text "nan"/"None" rather than breaking the join.
    search_strings = pd.Series(
        [
            ' ' + ', '.join(map(str, row)) + ','
            for row in df[tag_columns].itertuples(index=False, name=None)
        ],
        index=df.index,
        dtype=object,
    )

    keep_mask = pd.Series(True, index=df.index, dtype=bool)
    for item in asterisk_group:
        # NOTE(review): only the end of the tag is anchored (by the comma),
        # so "*girl" also matches "1girl" — confirm this is intended.
        needle = item.lstrip('*') + ','
        hits = search_strings.str.contains(needle, na=False, regex=False)
        keep_mask &= hits.astype(bool)

    return df[keep_mask]
|
|
|
def process_perfect_negative_group(df, perfect_negative_group):
    """Drop the rows of ``df`` that hold any ``~``-prefixed tag.

    For each row a search text is built as a leading space, the five tag
    columns (``copyright``, ``character``, ``artist``, ``meta``,
    ``general``) converted to strings and joined with ``', '``, and a
    trailing comma.  Each requested tag has its leading ``~`` characters
    removed and a comma appended; a row is removed if its search text
    contains any resulting needle literally.  Because the needle ends with a
    comma, ``~solo`` does not remove rows tagged ``solo focus``.

    The input frame is never modified; the search text lives in a local
    Series only.

    Args:
        df: DataFrame providing the five tag columns.
        perfect_negative_group: Iterable of tags, each optionally prefixed
            with ``~``.

    Returns:
        A new DataFrame without the matching rows (all rows when
        ``perfect_negative_group`` is empty).

    Raises:
        KeyError: If a tag column is missing from ``df``.
    """
    tag_columns = ['copyright', 'character', 'artist', 'meta', 'general']

    # str() mirrors a string cast of each cell, so missing values become the
    # text "nan"/"None" rather than breaking the join.
    search_strings = pd.Series(
        [
            ' ' + ', '.join(map(str, row)) + ','
            for row in df[tag_columns].itertuples(index=False, name=None)
        ],
        index=df.index,
        dtype=object,
    )

    keep_mask = pd.Series(True, index=df.index, dtype=bool)
    for item in perfect_negative_group:
        # NOTE(review): only the end of the tag is anchored (by the comma),
        # so "~girl" also removes rows tagged "1girl" — confirm intent.
        needle = item.lstrip('~') + ','
        hits = search_strings.str.contains(needle, na=False, regex=False)
        keep_mask &= ~hits.astype(bool)

    return df[keep_mask]
|
|
|
def extract_and_split(search_request):
    """Separate ``{...}`` groups from the comma-separated terms of a request.

    Every span from an opening ``{`` to the next ``}`` after it is cut out
    of the request and collected, braces included.  What is left is split on
    commas, stripped, and blank entries are discarded.  Scanning stops at
    the first ``{`` that has no closing brace; that text stays in the
    remainder.  Nested braces are not interpreted.

    Args:
        search_request: Raw request string.

    Returns:
        A tuple ``(curly_brace_group, split_requests)``: the list of brace
        groups in order of appearance and the list of remaining terms.
    """
    curly_brace_group = []
    while True:
        start_index = search_request.find('{')
        if start_index == -1:
            break
        # Search for the closing brace *after* the opening one.  A stray
        # "}" earlier in the string would otherwise produce an empty slice
        # that removes nothing and makes the loop spin forever.
        end_index = search_request.find('}', start_index)
        if end_index == -1:
            break
        curly_brace_group.append(search_request[start_index:end_index + 1])
        search_request = (
            search_request[:start_index] + search_request[end_index + 1:]
        )

    stripped_items = (item.strip() for item in search_request.split(','))
    split_requests = [item for item in stripped_items if item]
    return curly_brace_group, split_requests
|
|
|
def _curly_group_mask(df, curly_brace_content):
    """Build the row mask for one ``{a, b | c}`` group.

    The text between the braces is split on ``|`` into alternatives and each
    alternative on ``,`` into keywords.  A row matches an alternative when
    every keyword occurs literally in at least one text tag column, and
    matches the group when it matches any alternative.

    Returns:
        A boolean Series aligned with ``df``, or ``None`` when the group
        holds no keyword at all (for example ``{}``).
    """
    tag_columns = ('copyright', 'character', 'artist', 'meta', 'general')
    text_columns = [
        df[name] for name in tag_columns
        if isinstance(df[name].dtype, pd.StringDtype)
        or df[name].dtype == 'object'
    ]

    group_mask = None
    for alternative in curly_brace_content[1:-1].split('|'):
        keywords = [
            item.strip() for item in alternative.split(',') if item.strip()
        ]
        if not keywords:
            # A blank alternative such as the tail of "{a|}" would match
            # everything; ignore it instead.
            continue
        alternative_mask = pd.Series(True, index=df.index, dtype=bool)
        for keyword in keywords:
            keyword_mask = pd.Series(False, index=df.index, dtype=bool)
            for column in text_columns:
                hits = column.str.contains(keyword, na=False, regex=False)
                keyword_mask |= hits.astype(bool)
            alternative_mask &= keyword_mask
        if group_mask is None:
            group_mask = alternative_mask
        else:
            group_mask = group_mask | alternative_mask
    return group_mask


def search(df, search_request, exclude_request, E=None, N=None, S=None, G=None):
    """Filter ``df`` by rating flags, required terms and excluded terms.

    ``search_request`` is a comma-separated list of terms:

    * plain terms are passed to ``filter_rows_containing_all_keywords``;
    * ``*``-prefixed terms are passed to ``process_asterisk_group``;
    * ``{a, b | c}`` groups keep rows matching any ``|``-separated
      alternative, where an alternative matches when all of its
      comma-separated keywords occur in some tag column.

    ``exclude_request`` is a comma-separated list of terms: ``~``-prefixed
    terms are passed to ``process_perfect_negative_group``, the rest to
    ``filter_rows_not_containing_all_keywords``.  Blank terms in either
    request are ignored, and ``None`` is treated as an empty request.

    Args:
        df: DataFrame with the tag columns (and ``rating`` when a rating
            flag is 0).
        search_request: Terms a row must satisfy.
        exclude_request: Terms a row must not satisfy.
        E: When equal to 0, rows whose ``rating`` is ``'e'`` are removed.
        N: When equal to 0, rows whose ``rating`` is ``'q'`` are removed.
        S: When equal to 0, rows whose ``rating`` is ``'s'`` are removed.
        G: When equal to 0, rows whose ``rating`` is ``'g'`` are removed.

    Returns:
        The filtered DataFrame, or ``None`` as soon as any filtering step
        leaves no rows (including when ``df`` is empty to begin with).
    """
    rating_flags = (('e', E), ('q', N), ('s', S), ('g', G))
    excluded_ratings = [code for code, flag in rating_flags if flag == 0]
    if excluded_ratings:
        df = df[~df['rating'].isin(excluded_ratings)]
    if len(df) == 0:
        return None

    curly_brace_group, split_requests = extract_and_split(search_request or '')
    asterisk_group = [item for item in split_requests if item.startswith('*')]
    normal_group = [
        item for item in split_requests if not item.startswith('*')
    ]

    # Drop blank terms up front: an empty keyword is a substring of every
    # string, so a stray trailing comma would otherwise exclude every row.
    exclude_terms = [
        item.strip()
        for item in (exclude_request or '').split(',')
        if item.strip()
    ]
    perfect_negative_group = [
        item for item in exclude_terms if item.startswith('~')
    ]
    negative_group = [
        item for item in exclude_terms if not item.startswith('~')
    ]

    if normal_group:
        df = filter_rows_containing_all_keywords(df, normal_group)
        if len(df) == 0:
            return None

    # Brace groups are applied even when the request has no other terms.
    for group in curly_brace_group:
        group_mask = _curly_group_mask(df, group)
        if group_mask is None:
            continue
        df = df[group_mask]
        if len(df) == 0:
            return None

    if asterisk_group:
        df = process_asterisk_group(df, asterisk_group)
        if len(df) == 0:
            return None

    if negative_group:
        df = filter_rows_not_containing_all_keywords(df, negative_group)
        if len(df) == 0:
            return None

    if perfect_negative_group:
        df = process_perfect_negative_group(df, perfect_negative_group)
        if len(df) == 0:
            return None

    return df
|
|