import pandas as pd
import re

# Columns that hold tag text; every search helper scans exactly these.
_TAG_COLUMNS = ['copyright', 'character', 'artist', 'meta', 'general']


def _is_text_column(series):
    """Return True when *series* can be searched with the ``.str`` accessor.

    Accepts classic ``object`` columns as well as pandas string-dtype columns.
    """
    return (pd.api.types.is_object_dtype(series)
            or isinstance(series.dtype, pd.StringDtype))


def _any_column_contains(df, keyword):
    """Return a boolean mask of rows where any tag column contains *keyword*.

    The keyword is matched as a literal substring (``regex=False``), so
    characters such as ``(``, ``.`` or ``+`` need no escaping. Missing values
    never match. Non-text columns are skipped.
    """
    mask = pd.Series(False, index=df.index)
    for column in _TAG_COLUMNS:
        values = df[column]
        if _is_text_column(values):
            hits = values.str.contains(keyword, na=False, regex=False)
            mask |= hits.astype(bool)
    return mask


def _build_search_string(df):
    """Return one haystack string per row: ``' c1, c2, c3, c4, c5,'``.

    Built as a standalone Series (aligned on ``df.index``) so the caller's
    DataFrame is never modified. Values are converted with ``str()``, so a
    missing value appears as the text ``'nan'``/``'None'``.
    """
    rows = df[_TAG_COLUMNS].itertuples(index=False, name=None)
    haystacks = [' ' + ', '.join(map(str, row)) + ',' for row in rows]
    return pd.Series(haystacks, index=df.index, dtype=object)


def _curly_group_mask(df, curly_group):
    """Return the row mask for one ``{a, b|c, d}`` OR-group.

    Alternatives are separated by ``|``; inside an alternative the
    comma-separated keywords must ALL be present (each in any tag column).
    Empty keywords and empty alternatives are ignored; a group with no usable
    alternative imposes no constraint.
    """
    group_mask = pd.Series(False, index=df.index)
    has_alternative = False
    for alternative in curly_group[1:-1].split('|'):
        keywords = [item.strip() for item in alternative.split(',') if item.strip()]
        if not keywords:
            continue
        has_alternative = True
        alternative_mask = pd.Series(True, index=df.index)
        for keyword in keywords:
            alternative_mask &= _any_column_contains(df, keyword)
        group_mask |= alternative_mask
    if not has_alternative:
        return pd.Series(True, index=df.index)
    return group_mask


def filter_rows_containing_all_keywords(df, keywords):
    """Keep only the rows that contain every keyword.

    A keyword counts as present when it is a literal substring of any tag
    column. An empty *keywords* list keeps every row.
    """
    final_mask = pd.Series(True, index=df.index)
    for keyword in keywords:
        final_mask &= _any_column_contains(df, keyword)
    return df[final_mask]


def filter_rows_not_containing_all_keywords(df, keywords):
    """Keep only the rows that contain none of the keywords.

    Despite the name, a row is dropped as soon as ANY single keyword is found
    in any tag column (each keyword's mask is inverted and AND-ed together).
    """
    final_mask = pd.Series(True, index=df.index)
    for keyword in keywords:
        final_mask &= ~_any_column_contains(df, keyword)
    return df[final_mask]


def process_asterisk_group(df, asterisk_group):
    """Keep rows matching every ``*keyword`` against the joined tag string.

    Leading ``*`` characters are stripped and a ``,`` is appended, so the
    keyword must end exactly at a tag boundary of the joined string.
    NOTE(review): only the end of the tag is anchored, so ``*girl`` also
    matches a ``1girl`` tag - confirm whether that is intended.

    The input DataFrame is left untouched (no temporary column is added).
    """
    search_string = _build_search_string(df)
    mask = pd.Series(True, index=df.index)
    for keyword in asterisk_group:
        needle = keyword.lstrip('*') + ','
        mask &= search_string.str.contains(needle, na=False, regex=False).astype(bool)
    return df[mask]


def process_perfect_negative_group(df, perfect_negative_group):
    """Drop rows matching any ``~keyword`` against the joined tag string.

    Leading ``~`` characters are stripped and a ``,`` is appended, mirroring
    the matching rule of ``process_asterisk_group``. The input DataFrame is
    left untouched (no temporary column is added).
    """
    search_string = _build_search_string(df)
    keep_mask = pd.Series(True, index=df.index)
    for keyword in perfect_negative_group:
        needle = keyword.lstrip('~') + ','
        keep_mask &= ~search_string.str.contains(needle, na=False, regex=False).astype(bool)
    return df[keep_mask]


def extract_and_split(search_request):
    """Split a request into ``{...}`` groups and plain comma-separated items.

    Returns ``(curly_brace_group, split_requests)``: the first list holds each
    complete ``{...}`` span (braces included) in order of appearance, the
    second the remaining non-empty, stripped comma-separated items. An
    unclosed ``{`` is left in place and ends up inside a plain item.
    """
    curly_brace_group = []
    remaining = search_request
    while True:
        start_index = remaining.find('{')
        if start_index == -1:
            break
        # Look for the closing brace only AFTER the opening one; a stray '}'
        # earlier in the text would otherwise yield an empty slice and an
        # endless loop.
        end_index = remaining.find('}', start_index)
        if end_index == -1:
            break
        curly_brace_group.append(remaining[start_index:end_index + 1])
        remaining = remaining[:start_index] + remaining[end_index + 1:]
    split_requests = [item.strip() for item in remaining.split(',') if item.strip()]
    return curly_brace_group, split_requests


def search(df, search_request, exclude_request, E=None, N=None, S=None, G=None):
    """Filter *df* by a positive and a negative tag request.

    Positive syntax (``search_request``, comma separated), applied in the
    order normal -> curly -> asterisk:
      * ``tag``         - substring must occur in any tag column
      * ``{a, b|c, d}`` - OR-group: (a AND b) OR (c AND d)
      * ``*tag``        - match against the joined tag string, see
                          ``process_asterisk_group``
    Negative syntax (``exclude_request``, comma separated):
      * ``tag``  - drop rows containing the substring
      * ``~tag`` - drop rows via ``process_perfect_negative_group``

    ``E``, ``N``, ``S``, ``G``: passing ``0`` removes rows whose ``rating``
    column equals ``'e'``, ``'q'``, ``'s'`` or ``'g'`` respectively.

    Returns the filtered DataFrame (original index and row order preserved),
    or ``None`` as soon as no rows remain.
    """
    for flag, rating in ((E, 'e'), (N, 'q'), (S, 's'), (G, 'g')):
        if flag == 0:
            df = df[df['rating'] != rating]
    if len(df) == 0:
        return None

    # Example request: solo, 1girl, {hololive, animal ears|nijisanji, loli}
    curly_brace_group, split_requests = extract_and_split(search_request or '')
    asterisk_group = [item for item in split_requests if item.startswith('*')]
    normal_group = [item for item in split_requests if not item.startswith('*')]

    # Drop empty items up front: an empty keyword is a substring of every
    # value and would otherwise exclude every row.
    negative_requests = [item.strip() for item in (exclude_request or '').split(',')
                         if item.strip()]
    perfect_negative_group = [item for item in negative_requests if item.startswith('~')]
    negative_group = [item for item in negative_requests if not item.startswith('~')]

    # Positive: plain keywords (AND).
    if normal_group:
        df = filter_rows_containing_all_keywords(df, normal_group)
        if len(df) == 0:
            return None

    # Positive: OR-groups; every group must be satisfied.
    for curly_group in curly_brace_group:
        df = df[_curly_group_mask(df, curly_group)]
        if len(df) == 0:
            return None

    # Positive: tag-boundary ("perfect") matching.
    if asterisk_group:
        df = process_asterisk_group(df, asterisk_group)
        if len(df) == 0:
            return None

    # Negative: plain exclusions, then tag-boundary exclusions.
    if negative_group:
        df = filter_rows_not_containing_all_keywords(df, negative_group)
        if len(df) == 0:
            return None
    if perfect_negative_group:
        df = process_perfect_negative_group(df, perfect_negative_group)
        if len(df) == 0:
            return None

    return df