File size: 9,683 Bytes
f5cf8c0
1abae11
f5cf8c0
 
 
1abae11
f5cf8c0
 
 
 
 
 
1abae11
 
 
 
f5cf8c0
 
 
1abae11
 
f5cf8c0
 
 
 
 
 
1abae11
f5cf8c0
 
 
 
 
 
1abae11
 
 
 
f5cf8c0
 
1abae11
 
f5cf8c0
 
 
 
 
 
 
1abae11
f5cf8c0
 
 
 
 
 
1abae11
 
 
 
 
 
f5cf8c0
 
 
 
 
1abae11
f5cf8c0
 
 
 
 
 
 
 
 
1abae11
 
 
 
 
dc687d5
 
f5cf8c0
 
 
 
 
 
 
 
 
 
dbea0b3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f5cf8c0
 
 
 
 
 
 
 
 
 
 
1abae11
 
f5cf8c0
 
dbea0b3
 
f5cf8c0
 
1abae11
f5cf8c0
 
 
1abae11
f5cf8c0
 
 
 
 
 
 
 
 
 
 
 
 
 
dbea0b3
fa907ea
dbea0b3
 
 
 
 
 
52593ff
dbea0b3
fa907ea
52593ff
 
dbea0b3
 
52593ff
 
 
 
dbea0b3
 
 
 
 
 
 
fa907ea
 
dbea0b3
fa907ea
 
 
 
 
 
dbea0b3
 
52593ff
dbea0b3
 
fa907ea
dbea0b3
 
 
 
 
 
 
 
 
 
f5cf8c0
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
import pandas as pd
import re

# Function that keeps only the rows containing every keyword
def filter_rows_containing_all_keywords(df, keywords):
    """Return the rows of ``df`` that contain every keyword.

    A keyword counts as present in a row when it occurs as a substring of at
    least one of the 'copyright', 'character', 'artist', 'meta' or 'general'
    columns.  Columns whose dtype is not ``object`` are skipped and missing
    values never match.  Keywords holding regex metacharacters are escaped
    first, so every keyword ends up being matched as literal text.

    Args:
        df: DataFrame providing the five tag columns listed above.
        keywords: Iterable of keyword strings that must all be present.

    Returns:
        The subset of ``df`` whose rows satisfy all keywords (all rows when
        ``keywords`` is empty).
    """
    regex_metachars = r".^$*+?{}[]\|()"
    tag_columns = ('copyright', 'character', 'artist', 'meta', 'general')
    row_count = len(df)

    # Start from "keep everything" and narrow the selection keyword by keyword.
    rows_to_keep = pd.Series([True] * row_count, index=df.index)
    for raw_keyword in keywords:
        has_metachar = any(symbol in raw_keyword for symbol in regex_metachars)
        pattern = re.escape(raw_keyword) if has_metachar else raw_keyword

        # A row satisfies this keyword if any text column contains it.
        found_somewhere = pd.Series([False] * row_count, index=df.index)
        for column_name in tag_columns:
            column = df[column_name]
            if column.dtype != 'object':
                continue
            found_somewhere |= column.str.contains(pattern, na=False, regex=True)

        rows_to_keep &= found_somewhere

    return df[rows_to_keep]

def filter_rows_not_containing_all_keywords(df, keywords):
    """Return the rows of ``df`` in which none of the keywords occurs.

    A row is dropped as soon as any single keyword is found as a substring of
    one of its 'copyright', 'character', 'artist', 'meta' or 'general'
    columns.  Columns whose dtype is not ``object`` are skipped and missing
    values never match.  Keywords holding regex metacharacters are escaped
    first, so every keyword ends up being matched as literal text.

    Args:
        df: DataFrame providing the five tag columns listed above.
        keywords: Iterable of keyword strings to exclude.

    Returns:
        The subset of ``df`` whose rows contain none of the keywords (all
        rows when ``keywords`` is empty).
    """
    regex_metachars = r".^$*+?{}[]\|()"
    tag_columns = ('copyright', 'character', 'artist', 'meta', 'general')
    row_count = len(df)

    # Every row survives until one of the keywords is found in it.
    surviving_rows = pd.Series([True] * row_count, index=df.index)
    for raw_keyword in keywords:
        has_metachar = any(symbol in raw_keyword for symbol in regex_metachars)
        pattern = re.escape(raw_keyword) if has_metachar else raw_keyword

        contains_keyword = pd.Series([False] * row_count, index=df.index)
        for column_name in tag_columns:
            column = df[column_name]
            if column.dtype != 'object':
                continue
            contains_keyword |= column.str.contains(pattern, na=False, regex=True)

        # Invert the per-keyword hit mask: rows with a hit are removed.
        surviving_rows &= ~contains_keyword

    return df[surviving_rows]

def process_asterisk_group(df, asterisk_group):
    """Keep only the rows matching every '*'-prefixed ("perfect match") keyword.

    For each row a search string of the form
    ``' <copyright>, <character>, <artist>, <meta>, <general>,'`` is built.
    Each keyword has its leading '*' characters stripped and a ',' appended,
    and a row is kept only if its search string contains every such keyword
    as literal text.

    The search string is held in a local Series, so the caller's ``df`` is
    never modified (no temporary 'search_string' column is written into it).

    NOTE(review): only the end of a keyword is anchored (by the trailing ',').
    '*girl' therefore also matches a field ending in '1girl'. Confirm whether
    a leading boundary was intended as well.

    Args:
        df: DataFrame providing the 'copyright', 'character', 'artist',
            'meta' and 'general' columns.
        asterisk_group: Iterable of keyword strings, each starting with '*'.

    Returns:
        A new DataFrame with the matching rows; it never carries a
        'search_string' column.
    """
    tag_columns = ['copyright', 'character', 'artist', 'meta', 'general']

    if df.empty:
        # Nothing to search; still honour the "no search_string column" contract.
        return df.drop(columns=['search_string'], errors='ignore')

    # Strip the '*' marker and append ',' so the keyword has to end a field.
    needles = [keyword.lstrip('*') + ',' for keyword in asterisk_group]

    # One temporary search string per row, kept outside of ``df``.
    haystack = pd.Series(
        [
            ' ' + ', '.join(map(str, row)) + ','
            for row in df[tag_columns].itertuples(index=False, name=None)
        ],
        index=df.index,
        dtype=object,
    )

    keep = pd.Series(True, index=df.index, dtype=bool)
    for needle in needles:
        # regex=False matches the keyword literally, so no escaping is needed.
        keep &= haystack.str.contains(needle, na=False, regex=False)

    # A stale 'search_string' column may exist on input; the result never has one.
    return df[keep].drop(columns=['search_string'], errors='ignore')

def process_perfect_negative_group(df, perfect_negative_group):
    """Drop the rows matching any '~'-prefixed ("perfect match") keyword.

    For each row a search string of the form
    ``' <copyright>, <character>, <artist>, <meta>, <general>,'`` is built.
    Each keyword has its leading '~' characters stripped and a ',' appended,
    and a row is removed if its search string contains any such keyword as
    literal text.

    The search string is held in a local Series, so the caller's ``df`` is
    never modified (no temporary 'search_string' column is written into it).

    NOTE(review): only the end of a keyword is anchored (by the trailing ',').
    '~girl' therefore also removes rows with a field ending in '1girl'.
    Confirm whether a leading boundary was intended as well.

    Args:
        df: DataFrame providing the 'copyright', 'character', 'artist',
            'meta' and 'general' columns.
        perfect_negative_group: Iterable of keyword strings, each starting
            with '~'.

    Returns:
        A new DataFrame with the remaining rows; it never carries a
        'search_string' column.
    """
    tag_columns = ['copyright', 'character', 'artist', 'meta', 'general']

    if df.empty:
        # Nothing to search; still honour the "no search_string column" contract.
        return df.drop(columns=['search_string'], errors='ignore')

    # Strip the '~' marker and append ',' so the keyword has to end a field.
    needles = [keyword.lstrip('~') + ',' for keyword in perfect_negative_group]

    # One temporary search string per row, kept outside of ``df``.
    haystack = pd.Series(
        [
            ' ' + ', '.join(map(str, row)) + ','
            for row in df[tag_columns].itertuples(index=False, name=None)
        ],
        index=df.index,
        dtype=object,
    )

    # Collect every row hit by at least one keyword.
    rejected = pd.Series(False, index=df.index, dtype=bool)
    for needle in needles:
        # regex=False matches the keyword literally, so no escaping is needed.
        rejected |= haystack.str.contains(needle, na=False, regex=False)

    # A stale 'search_string' column may exist on input; the result never has one.
    return df[~rejected].drop(columns=['search_string'], errors='ignore')

def extract_and_split(search_request):
    """Separate '{...}' groups from a comma-separated search request.

    Every complete ``{...}`` span (braces included) is cut out of the request
    in order of appearance.  What remains is split on ',' with surrounding
    whitespace stripped and empty items discarded.

    An opening brace without a closing brace after it stops the extraction;
    the unmatched text then stays in the remaining request and is split like
    any other item.

    Args:
        search_request: Raw request string,
            e.g. ``'solo, 1girl, {hololive, animal ears|nijisanji, loli}'``.

    Returns:
        A tuple ``(curly_brace_group, split_requests)``: the list of
        extracted ``'{...}'`` strings and the list of remaining items.
    """
    curly_brace_group = []
    while True:
        start_index = search_request.find('{')
        if start_index == -1:
            break
        # Search for the closing brace *after* the opening one.  Taking the
        # first '}' of the whole string would yield an empty slice whenever a
        # stray '}' precedes the '{', and the loop would then never advance.
        end_index = search_request.find('}', start_index)
        if end_index == -1:
            break
        curly_brace_group.append(search_request[start_index:end_index + 1])
        search_request = search_request[:start_index] + search_request[end_index + 1:]

    split_requests = [item.strip() for item in search_request.split(',') if item.strip()]
    return curly_brace_group, split_requests

def _rows_matching_keyword(df, keyword):
    """Return a boolean mask of the rows where any tag column contains ``keyword``."""
    mask = pd.Series(False, index=df.index, dtype=bool)
    for column in ('copyright', 'character', 'artist', 'meta', 'general'):
        # regex=False: the keyword is user text and is matched literally.
        mask |= df[column].str.contains(keyword, na=False, regex=False)
    return mask


def _filter_or_group(df, group):
    """Keep the rows satisfying at least one '|'-separated alternative of a '{...}' group."""
    alternatives = []
    for alternative in group[1:-1].split('|'):
        keywords = [item.strip() for item in alternative.split(',') if item.strip()]
        if keywords:
            alternatives.append(keywords)
    if not alternatives:
        # A group such as '{}' carries no constraint at all.
        return df

    group_mask = pd.Series(False, index=df.index, dtype=bool)
    for keywords in alternatives:
        # Inside one alternative every comma-separated keyword must match (AND).
        alternative_mask = pd.Series(True, index=df.index, dtype=bool)
        for keyword in keywords:
            alternative_mask &= _rows_matching_keyword(df, keyword)
        # Alternatives are combined with OR.
        group_mask |= alternative_mask
    return df[group_mask]


def search(df, search_request, exclude_request, E=None, N=None, S=None, G=None):
    """Filter ``df`` by rating switches, a search request and an exclude request.

    Request syntax (items are comma-separated):
      * ``tag``        - substring match in any tag column.
      * ``*tag``       - "perfect match", handled by ``process_asterisk_group``.
      * ``{a, b|c}``   - OR group: rows matching (a AND b) OR c.
      * exclude ``tag``  - rows containing the tag are removed.
      * exclude ``~tag`` - "perfect match" exclusion, handled by
        ``process_perfect_negative_group``.

    Processing order: rating switches, normal keywords, OR groups, '*'
    keywords, then exclusions.

    OR groups are evaluated with boolean masks over ``df``.  Rows keep their
    original index and order, a keyword that matches nothing empties its
    alternative, and a group that matches nothing yields ``None`` instead of
    raising.  Empty items in either request (for example from a trailing
    comma) are ignored.

    Args:
        df: DataFrame with the 'copyright', 'character', 'artist', 'meta' and
            'general' columns; a 'rating' column is read when a rating switch
            is 0.
        search_request: Comma-separated positive request string.
        exclude_request: Comma-separated negative request string.
        E, N, S, G: When equal to 0, rows whose 'rating' is 'e', 'q', 's' or
            'g' respectively are removed.

    Returns:
        The filtered DataFrame, or ``None`` when no row is left.
    """
    for switch, rating_code in ((E, 'e'), (N, 'q'), (S, 's'), (G, 'g')):
        if switch == 0:
            df = df[df['rating'] != rating_code]
    if len(df) == 0:
        return None

    # Parse the search request.
    # Processing order: normal -> curly -> asterisk
    # e.g. solo, 1girl, {hololive, animal ears|nijisanji, loli}
    curly_brace_group, split_requests = extract_and_split(search_request or '')
    asterisk_group = [item for item in split_requests if item.startswith('*')]
    normal_group = [
        item for item in split_requests
        if item not in curly_brace_group and not item.startswith('*')
    ]

    # Drop empty items up front: an empty keyword is contained in every
    # string, so a trailing comma would otherwise exclude every row.
    negative_split_requests = [
        item.strip() for item in (exclude_request or '').split(',') if item.strip()
    ]
    perfect_negative_group = [item for item in negative_split_requests if item.startswith('~')]
    negative_group = [item for item in negative_split_requests if not item.startswith('~')]

    # Positive: normal keywords
    if normal_group:
        df = filter_rows_containing_all_keywords(df, normal_group)
        if len(df) == 0:
            return None

    # OR groups
    for group in curly_brace_group:
        df = _filter_or_group(df, group)
        if len(df) == 0:
            return None

    # Perfect matching
    if asterisk_group:
        df = process_asterisk_group(df, asterisk_group)
        if len(df) == 0:
            return None

    # Exclusions
    if negative_group:
        df = filter_rows_not_containing_all_keywords(df, negative_group)
        if len(df) == 0:
            return None

    if perfect_negative_group:
        df = process_perfect_negative_group(df, perfect_negative_group)
        if len(df) == 0:
            return None
    return df