baqu2213's picture
Upload 4 files
52593ff
raw
history blame
No virus
9.68 kB
import pandas as pd
import re
# Function that keeps only the rows containing every keyword
def filter_rows_containing_all_keywords(df, keywords):
    """Return the rows of ``df`` that contain every keyword in ``keywords``.

    A row contains a keyword when the keyword occurs as a literal,
    case-sensitive substring of at least one of the columns ``copyright``,
    ``character``, ``artist``, ``meta`` or ``general``.

    Args:
        df: DataFrame that has the five tag columns listed above.
        keywords: Iterable of keyword strings; an empty iterable keeps
            every row.

    Returns:
        ``df`` indexed by the combined boolean mask; the input frame is not
        modified.

    Raises:
        KeyError: If one of the tag columns is missing from ``df``.
    """
    tag_columns = ['copyright', 'character', 'artist', 'meta', 'general']

    # Only text columns can be searched through the ``.str`` accessor.
    # Accept both object columns and pandas string-dtype columns; checking
    # ``dtype == 'object'`` alone would silently skip the latter.
    searchable = []
    for column in tag_columns:
        values = df[column]
        if pd.api.types.is_object_dtype(values) or pd.api.types.is_string_dtype(values):
            searchable.append(values)

    final_mask = pd.Series(True, index=df.index, dtype=bool)
    for keyword in keywords:
        # True where the keyword was found in any searchable column.
        keyword_mask = pd.Series(False, index=df.index, dtype=bool)
        for values in searchable:
            # regex=False performs a literal substring test, so keywords
            # containing characters such as '(' or '+' need no escaping.
            hits = values.str.contains(keyword, na=False, regex=False)
            keyword_mask = keyword_mask | hits.astype(bool)
        final_mask = final_mask & keyword_mask
    return df[final_mask]
def filter_rows_not_containing_all_keywords(df, keywords):
    """Return the rows of ``df`` that contain none of ``keywords``.

    Despite the function name, a row is dropped as soon as it contains ANY
    one of the keywords. A row contains a keyword when the keyword occurs
    as a literal, case-sensitive substring of at least one of the columns
    ``copyright``, ``character``, ``artist``, ``meta`` or ``general``.

    Args:
        df: DataFrame that has the five tag columns listed above.
        keywords: Iterable of keyword strings to exclude; an empty iterable
            keeps every row.

    Returns:
        ``df`` indexed by the combined boolean mask; the input frame is not
        modified.

    Raises:
        KeyError: If one of the tag columns is missing from ``df``.
    """
    tag_columns = ['copyright', 'character', 'artist', 'meta', 'general']

    # Only text columns can be searched through the ``.str`` accessor.
    # Accept both object columns and pandas string-dtype columns; checking
    # ``dtype == 'object'`` alone would silently skip the latter, and then
    # nothing would ever be excluded.
    searchable = []
    for column in tag_columns:
        values = df[column]
        if pd.api.types.is_object_dtype(values) or pd.api.types.is_string_dtype(values):
            searchable.append(values)

    final_mask = pd.Series(True, index=df.index, dtype=bool)
    for keyword in keywords:
        # True where the keyword was found in any searchable column.
        keyword_mask = pd.Series(False, index=df.index, dtype=bool)
        for values in searchable:
            # regex=False performs a literal substring test, so keywords
            # containing characters such as '(' or '+' need no escaping.
            hits = values.str.contains(keyword, na=False, regex=False)
            keyword_mask = keyword_mask | hits.astype(bool)
        # Invert the per-keyword mask: rows that matched must be removed.
        final_mask = final_mask & ~keyword_mask
    return df[final_mask]
def process_asterisk_group(df, asterisk_group):
    """Keep only the rows that match every ``*``-prefixed keyword.

    For each row the five tag columns (``copyright``, ``character``,
    ``artist``, ``meta``, ``general``) are converted to strings and joined
    into one search string of the form ``' a, b, c,'``. Each keyword has its
    leading ``*`` characters stripped and a ``,`` appended, and a row matches
    when that text occurs literally in the search string. The match is
    therefore anchored only at the END of a tag (by the comma): ``*girl``
    also matches a tag that ends in ``girl`` such as ``1girl``.

    Args:
        df: DataFrame that has the five tag columns listed above.
        asterisk_group: Iterable of keyword strings, each normally starting
            with ``*``.

    Returns:
        A new DataFrame with the matching rows. The input frame is left
        untouched; no temporary column is written to it.

    Raises:
        KeyError: If one of the tag columns is missing from ``df``.
    """
    tag_columns = ['copyright', 'character', 'artist', 'meta', 'general']

    # Strip the leading '*' and add ',' so the keyword must end a tag.
    needles = [keyword.lstrip('*') + ',' for keyword in asterisk_group]

    # Build the per-row search text as a standalone Series instead of a
    # temporary column, so the caller's DataFrame is never modified.
    haystack = pd.Series(
        [
            ' ' + ', '.join(map(str, row)) + ','
            for row in df[tag_columns].itertuples(index=False, name=None)
        ],
        index=df.index,
        dtype=object,
    )

    keep = pd.Series(True, index=df.index, dtype=bool)
    for needle in needles:
        # Literal substring test; no regex escaping required.
        hits = haystack.str.contains(needle, na=False, regex=False)
        keep = keep & hits.astype(bool)
    return df[keep]
def process_perfect_negative_group(df, perfect_negative_group):
    """Drop every row that matches any ``~``-prefixed keyword.

    For each row the five tag columns (``copyright``, ``character``,
    ``artist``, ``meta``, ``general``) are converted to strings and joined
    into one search string of the form ``' a, b, c,'``. Each keyword has its
    leading ``~`` characters stripped and a ``,`` appended, and a row is
    removed when that text occurs literally in the search string. The match
    is therefore anchored only at the END of a tag (by the comma): ``~girl``
    also removes rows with a tag that ends in ``girl`` such as ``1girl``.

    Args:
        df: DataFrame that has the five tag columns listed above.
        perfect_negative_group: Iterable of keyword strings, each normally
            starting with ``~``.

    Returns:
        A new DataFrame without the matching rows. The input frame is left
        untouched; no temporary column is written to it.

    Raises:
        KeyError: If one of the tag columns is missing from ``df``.
    """
    tag_columns = ['copyright', 'character', 'artist', 'meta', 'general']

    # Strip the leading '~' and add ',' so the keyword must end a tag.
    needles = [keyword.lstrip('~') + ',' for keyword in perfect_negative_group]

    # Build the per-row search text as a standalone Series instead of a
    # temporary column, so the caller's DataFrame is never modified.
    haystack = pd.Series(
        [
            ' ' + ', '.join(map(str, row)) + ','
            for row in df[tag_columns].itertuples(index=False, name=None)
        ],
        index=df.index,
        dtype=object,
    )

    # Start by keeping everything, then knock out rows hit by any keyword.
    combined_mask = pd.Series(True, index=df.index, dtype=bool)
    for needle in needles:
        # Literal substring test; no regex escaping required.
        hits = haystack.str.contains(needle, na=False, regex=False)
        combined_mask = combined_mask & ~hits.astype(bool)
    return df[combined_mask]
def extract_and_split(search_request):
    """Split a search request into ``{...}`` groups and plain keywords.

    Every ``{...}`` span (from a ``{`` to the first ``}`` that follows it) is
    cut out of the request and collected, braces included. What remains is
    split on commas; each piece is stripped and empty pieces are discarded.
    An opening ``{`` with no ``}`` after it is not treated as a group and
    stays in the comma-split keywords.

    Args:
        search_request: Raw request string, for example
            ``'solo, 1girl, {hololive, animal ears|nijisanji, loli}'``.

    Returns:
        A tuple ``(curly_brace_group, split_requests)`` of two lists of
        strings.
    """
    curly_brace_group = []
    while True:
        start_index = search_request.find('{')
        if start_index == -1:
            break
        # Look for the closing brace AFTER the opening one. Searching from
        # the start of the string would pick up a stray '}' that precedes
        # the '{', yield an empty slice, and loop forever.
        end_index = search_request.find('}', start_index)
        if end_index == -1:
            break
        curly_brace_group.append(search_request[start_index:end_index + 1])
        search_request = search_request[:start_index] + search_request[end_index + 1:]

    split_requests = [item.strip() for item in search_request.split(',') if item.strip()]
    return curly_brace_group, split_requests
def _any_tag_column_contains(df, keyword):
    """Return a boolean mask of the rows whose tag columns contain ``keyword``."""
    mask = pd.Series(False, index=df.index, dtype=bool)
    for column in ['copyright', 'character', 'artist', 'meta', 'general']:
        values = df[column]
        # Only text columns can be searched through the ``.str`` accessor.
        if pd.api.types.is_object_dtype(values) or pd.api.types.is_string_dtype(values):
            # regex=False: literal substring test, no escaping required.
            hits = values.str.contains(keyword, na=False, regex=False)
            mask = mask | hits.astype(bool)
    return mask


def _filter_curly_group(df, curly_group):
    """Keep the rows that satisfy at least one '|' alternative of a '{...}' group."""
    matches_any_alternative = pd.Series(False, index=df.index, dtype=bool)
    has_alternative = False
    # curly_group still carries its braces, e.g. '{a, b|c}'.
    for alternative in curly_group[1:-1].split('|'):
        keywords = [item.strip() for item in alternative.split(',') if item.strip()]
        if not keywords:
            # An empty alternative ('{a|}' or '{}') carries no constraint
            # and is ignored rather than matching every row.
            continue
        has_alternative = True
        # Within one alternative ALL keywords must be present.
        matches_all_keywords = pd.Series(True, index=df.index, dtype=bool)
        for keyword in keywords:
            matches_all_keywords = matches_all_keywords & _any_tag_column_contains(df, keyword)
        matches_any_alternative = matches_any_alternative | matches_all_keywords
    if not has_alternative:
        return df
    return df[matches_any_alternative]


def search(df, search_request, exclude_request, E=None, N=None, S=None, G=None):
    """Filter ``df`` by a positive search request and an exclude request.

    Search request syntax (comma separated), e.g.
    ``'solo, 1girl, {hololive, animal ears|nijisanji, loli}'``:

    * plain keyword -- must occur as a substring of one of the tag columns
      (handled by ``filter_rows_containing_all_keywords``);
    * ``{a, b|c, d}`` -- OR group: a row passes when it contains every
      keyword of at least one ``|``-separated alternative; several groups
      are applied one after another;
    * ``*keyword`` -- handled by ``process_asterisk_group``.

    Exclude request syntax (comma separated):

    * plain keyword -- handled by ``filter_rows_not_containing_all_keywords``;
    * ``~keyword`` -- handled by ``process_perfect_negative_group``.

    Empty tokens (for example from a trailing comma) are ignored in both
    requests. Filters are applied in the order: rating, plain keywords,
    OR groups, ``*`` keywords, plain excludes, ``~`` excludes.

    Args:
        df: DataFrame with the columns ``copyright``, ``character``,
            ``artist``, ``meta`` and ``general``; ``rating`` is also read
            when a rating flag is 0.
        search_request: Positive request string.
        exclude_request: Negative request string.
        E: When equal to 0, rows whose ``rating`` is ``'e'`` are removed.
        N: When equal to 0, rows whose ``rating`` is ``'q'`` are removed.
        S: When equal to 0, rows whose ``rating`` is ``'s'`` are removed.
        G: When equal to 0, rows whose ``rating`` is ``'g'`` are removed.

    Returns:
        The filtered DataFrame (rows keep their original index labels), or
        ``None`` as soon as any filtering stage leaves no rows.
    """
    # A flag equal to 0 switches that rating off; any other value,
    # including the default None, leaves it in.
    for flag, rating in ((E, 'e'), (N, 'q'), (S, 's'), (G, 'g')):
        if flag == 0:
            df = df[~(df['rating'] == rating)]
    if len(df) == 0:
        return None

    # Positive request: '{...}' groups are cut out first, the remainder is
    # split on commas with empty tokens already discarded.
    curly_brace_group, split_requests = extract_and_split(search_request)
    asterisk_group = [item for item in split_requests if item.startswith('*')]
    normal_group = [item for item in split_requests if not item.startswith('*')]

    # Negative request: drop empty tokens up front. An empty keyword is a
    # substring of every tag string, so leaving one in the exclude list
    # would remove every row.
    negative_split_requests = [
        item.strip() for item in exclude_request.split(',') if item.strip()
    ]
    perfect_negative_group = [item for item in negative_split_requests if item.startswith('~')]
    negative_group = [item for item in negative_split_requests if not item.startswith('~')]

    # Plain positive keywords (AND).
    if normal_group:
        df = filter_rows_containing_all_keywords(df, normal_group)
        if len(df) == 0:
            return None

    # OR groups. These are processed whenever present, including when the
    # request consists of nothing but '{...}' groups.
    for curly_group in curly_brace_group:
        df = _filter_curly_group(df, curly_group)
        if len(df) == 0:
            return None

    # '*' keywords.
    if asterisk_group:
        df = process_asterisk_group(df, asterisk_group)
        if len(df) == 0:
            return None

    # Plain excludes.
    if negative_group:
        df = filter_rows_not_containing_all_keywords(df, negative_group)
        if len(df) == 0:
            return None

    # '~' excludes.
    if perfect_negative_group:
        df = process_perfect_negative_group(df, perfect_negative_group)
        if len(df) == 0:
            return None

    return df