seyia92coding committed on
Commit
cf8a101
1 Parent(s): b1fea4e

Upload fuzz.py

Browse files
Files changed (1) hide show
  1. fuzz.py +306 -0
fuzz.py ADDED
@@ -0,0 +1,306 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # encoding: utf-8
3
+ from __future__ import unicode_literals
4
+ import platform
5
+ import warnings
6
+
7
+ try:
8
+ from .StringMatcher import StringMatcher as SequenceMatcher
9
+ except ImportError:
10
+ if platform.python_implementation() != "PyPy":
11
+ warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')
12
+ from difflib import SequenceMatcher
13
+
14
+ from . import utils
15
+
16
+
17
+ ###########################
18
+ # Basic Scoring Functions #
19
+ ###########################
20
+
21
@utils.check_for_none
@utils.check_for_equivalence
@utils.check_empty_string
def ratio(s1, s2):
    """Return the SequenceMatcher similarity of two strings, scaled by 100.

    Both inputs are first passed through ``utils.make_type_consistent``.
    The matcher's ``ratio()`` is multiplied by 100 and converted with
    ``utils.intr``.

    None, equivalence and empty-string handling is delegated to the
    ``utils`` decorators applied above (their behaviour is defined in
    ``utils``, not in this function).

    :param s1: first string
    :param s2: second string
    :return: similarity score produced by ``utils.intr``
    """
    left, right = utils.make_type_consistent(s1, s2)
    matcher = SequenceMatcher(None, left, right)
    return utils.intr(100 * matcher.ratio())
29
+
30
+
31
@utils.check_for_none
@utils.check_for_equivalence
@utils.check_empty_string
def partial_ratio(s1, s2):
    """Return the ratio of the most similar substring
    as a number between 0 and 100.

    The shorter input is compared against windows of the longer input that
    have the shorter input's length. One window is tried per matching
    block reported by the matcher.

    :param s1: first string
    :param s2: second string
    :return: 100 as soon as a window scores above 0.995, otherwise the best
        window score scaled by 100 and converted with ``utils.intr``
    """
    s1, s2 = utils.make_type_consistent(s1, s2)

    # On equal lengths s1 is treated as the shorter string.
    shorter, longer = (s1, s2) if len(s1) <= len(s2) else (s2, s1)
    window = len(shorter)

    matching_blocks = SequenceMatcher(None, shorter, longer).get_matching_blocks()

    # Each block is (idx_in_shorter, idx_in_longer, length). The best
    # partial match is aligned with at least one of those blocks, e.g.
    #   shorter = "abcd", longer = "XXXbcdeEEE"
    #   block = (1, 3, 3)
    #   best score === ratio("abcd", "Xbcd")
    scores = []
    for block in matching_blocks:
        # Shift the window so the block lines up; never start before index 0.
        start = max(block[1] - block[0], 0)
        candidate = longer[start:start + window]

        score = SequenceMatcher(None, shorter, candidate).ratio()
        if score > .995:
            # Close enough to a perfect match: stop early.
            return 100
        scores.append(score)

    return utils.intr(100 * max(scores))
69
+
70
+
71
+ ##############################
72
+ # Advanced Scoring Functions #
73
+ ##############################
74
+
75
def _process_and_sort(s, force_ascii, full_process=True):
    """Return the whitespace-separated tokens of ``s`` sorted and re-joined.

    :param s: input string
    :param force_ascii: forwarded to ``utils.full_process`` when it is used
    :param full_process: when true, ``s`` is first run through
        ``utils.full_process``; otherwise it is used as given
    :return: the sorted tokens joined by single spaces, stripped
    """
    if full_process:
        cleaned = utils.full_process(s, force_ascii=force_ascii)
    else:
        cleaned = s

    ordered_tokens = sorted(cleaned.split())
    return u" ".join(ordered_tokens).strip()
84
+
85
+
86
+ # Sorted Token
87
+ # find all alphanumeric tokens in the string
88
+ # sort those tokens and take ratio of resulting joined strings
89
+ # controls for unordered string elements
90
@utils.check_for_none
def _token_sort(s1, s2, partial=True, force_ascii=True, full_process=True):
    """Score two strings after sorting the tokens of each one.

    :param s1: first string
    :param s2: second string
    :param partial: use ``partial_ratio`` when true, ``ratio`` otherwise
    :param force_ascii: forwarded to ``_process_and_sort``
    :param full_process: forwarded to ``_process_and_sort``
    :return: the score returned by the selected ratio function
    """
    scorer = partial_ratio if partial else ratio

    first = _process_and_sort(s1, force_ascii, full_process=full_process)
    second = _process_and_sort(s2, force_ascii, full_process=full_process)

    return scorer(first, second)
99
+
100
+
101
def token_sort_ratio(s1, s2, force_ascii=True, full_process=True):
    """Return a measure of the sequences' similarity between 0 and 100
    but sorting the token before comparing.

    Delegates to ``_token_sort`` with ``partial=False``.

    :param s1: first string
    :param s2: second string
    :param force_ascii: forwarded to ``_token_sort``
    :param full_process: forwarded to ``_token_sort``
    :return: similarity score
    """
    return _token_sort(
        s1,
        s2,
        partial=False,
        force_ascii=force_ascii,
        full_process=full_process,
    )
106
+
107
+
108
def partial_token_sort_ratio(s1, s2, force_ascii=True, full_process=True):
    """Return the ratio of the most similar substring as a number between
    0 and 100 but sorting the token before comparing.

    Delegates to ``_token_sort`` with ``partial=True``.

    :param s1: first string
    :param s2: second string
    :param force_ascii: forwarded to ``_token_sort``
    :param full_process: forwarded to ``_token_sort``
    :return: similarity score
    """
    return _token_sort(
        s1,
        s2,
        partial=True,
        force_ascii=force_ascii,
        full_process=full_process,
    )
113
+
114
+
115
@utils.check_for_none
def _token_set(s1, s2, partial=True, force_ascii=True, full_process=True):
    """Score two strings by comparing their token sets.

    Steps:
      - split each (optionally processed) string into a set of tokens
      - build three strings: the sorted intersection alone, and the sorted
        intersection followed by each side's sorted remainder
      - score the three strings pairwise and return the highest score

    This controls for unordered partial matches.

    :param s1: first string
    :param s2: second string
    :param partial: use ``partial_ratio`` when true, ``ratio`` otherwise
    :param force_ascii: forwarded to ``utils.full_process`` when it is used
    :param full_process: when true, run both inputs through
        ``utils.full_process`` before tokenising
    :return: 100 for identical unprocessed inputs, 0 when either string
        fails ``utils.validate_string``, else the best pairwise score
    """
    # Without processing, identical inputs need no further work.
    if not full_process and s1 == s2:
        return 100

    if full_process:
        p1 = utils.full_process(s1, force_ascii=force_ascii)
        p2 = utils.full_process(s2, force_ascii=force_ascii)
    else:
        p1, p2 = s1, s2

    # p2 is only validated when p1 passed, as in a sequential check.
    if not (utils.validate_string(p1) and utils.validate_string(p2)):
        return 0

    tokens1 = set(p1.split())
    tokens2 = set(p2.split())

    shared = tokens1 & tokens2
    only_in_1 = tokens1 - tokens2
    only_in_2 = tokens2 - tokens1

    shared_text = " ".join(sorted(shared))
    # Strip removes the joining space when one of the parts is empty.
    with_rest_1 = (shared_text + " " + " ".join(sorted(only_in_1))).strip()
    with_rest_2 = (shared_text + " " + " ".join(sorted(only_in_2))).strip()
    shared_text = shared_text.strip()

    scorer = partial_ratio if partial else ratio

    return max(
        scorer(shared_text, with_rest_1),
        scorer(shared_text, with_rest_2),
        scorer(with_rest_1, with_rest_2),
    )
166
+
167
+
168
def token_set_ratio(s1, s2, force_ascii=True, full_process=True):
    """Return the token-set similarity of two strings.

    Delegates to ``_token_set`` with ``partial=False``.

    :param s1: first string
    :param s2: second string
    :param force_ascii: forwarded to ``_token_set``
    :param full_process: forwarded to ``_token_set``
    :return: similarity score
    """
    return _token_set(
        s1,
        s2,
        partial=False,
        force_ascii=force_ascii,
        full_process=full_process,
    )
170
+
171
+
172
def partial_token_set_ratio(s1, s2, force_ascii=True, full_process=True):
    """Return the partial token-set similarity of two strings.

    Delegates to ``_token_set`` with ``partial=True``.

    :param s1: first string
    :param s2: second string
    :param force_ascii: forwarded to ``_token_set``
    :param full_process: forwarded to ``_token_set``
    :return: similarity score
    """
    return _token_set(
        s1,
        s2,
        partial=True,
        force_ascii=force_ascii,
        full_process=full_process,
    )
174
+
175
+
176
+ ###################
177
+ # Combination API #
178
+ ###################
179
+
180
+ # q is for quick
181
def QRatio(s1, s2, force_ascii=True, full_process=True):
    """
    Quick ratio comparison between two strings.

    Optionally runs full_process from utils on both strings, then
    short circuits with 0 if either string fails utils.validate_string.

    :param s1: first string
    :param s2: second string
    :param force_ascii: Allow only ASCII characters (Default: True)
    :param full_process: Process inputs, used here to avoid double
        processing in extract functions (Default: True)
    :return: similarity ratio
    """
    p1, p2 = s1, s2
    if full_process:
        p1 = utils.full_process(s1, force_ascii=force_ascii)
        p2 = utils.full_process(s2, force_ascii=force_ascii)

    # p2 is only validated when p1 passed, as in a sequential check.
    if not (utils.validate_string(p1) and utils.validate_string(p2)):
        return 0

    return ratio(p1, p2)
208
+
209
+
210
def UQRatio(s1, s2, full_process=True):
    """
    Unicode quick ratio

    Calls QRatio with force_ascii set to False

    :param s1: first string
    :param s2: second string
    :param full_process: forwarded to QRatio (Default: True)
    :return: similarity ratio
    """
    return QRatio(
        s1,
        s2,
        force_ascii=False,
        full_process=full_process,
    )
221
+
222
+
223
+ # w is for weighted
224
def WRatio(s1, s2, force_ascii=True, full_process=True):
    """
    Return a measure of the sequences' similarity between 0 and 100, using different algorithms.

    **Steps in the order they occur**

    #. Run full_process from utils on both strings (unless disabled)
    #. Short circuit with 0 if either string fails utils.validate_string
    #. Take the ratio of the two processed strings (fuzz.ratio)
    #. Compare the lengths of the strings

       * If the longer string is at least 1.5 times as long as the other,
         use partial comparisons and scale partial results by 0.9
         (this makes sure only full results can return 100)
       * If the longer string is over 8 times as long as the other,
         scale partial results by 0.6 instead

    #. Run the other ratio functions

       * if using partial ratio functions call partial_ratio,
         partial_token_sort_ratio and partial_token_set_ratio and
         scale all of these by the length-based scalar
       * otherwise call token_sort_ratio and token_set_ratio
       * all token based comparisons are scaled by 0.95
         (on top of any partial scalars)

    #. Take the highest value from these results,
       round it and return it as an integer.

    :param s1: first string
    :param s2: second string
    :param force_ascii: Allow only ascii characters
    :type force_ascii: bool
    :param full_process: Process inputs, used here to avoid double
        processing in extract functions (Default: True)
    :return: the highest weighted score, converted with utils.intr
    """
    p1, p2 = s1, s2
    if full_process:
        p1 = utils.full_process(s1, force_ascii=force_ascii)
        p2 = utils.full_process(s2, force_ascii=force_ascii)

    # p2 is only validated when p1 passed, as in a sequential check.
    if not (utils.validate_string(p1) and utils.validate_string(p2)):
        return 0

    unbase_scale = .95

    base = ratio(p1, p2)
    len1, len2 = len(p1), len(p2)
    len_ratio = float(max(len1, len2)) / min(len1, len2)

    # Strings of similar length: partial scorers are not used.
    if len_ratio < 1.5:
        tsor = token_sort_ratio(p1, p2, full_process=False) * unbase_scale
        tser = token_set_ratio(p1, p2, full_process=False) * unbase_scale
        return utils.intr(max(base, tsor, tser))

    # A much shorter string gets a heavier penalty on partial scores.
    partial_scale = .6 if len_ratio > 8 else .90

    partial = partial_ratio(p1, p2) * partial_scale
    # Multiplication order is kept (score * unbase * partial) so the
    # floating-point results match exactly.
    ptsor = partial_token_sort_ratio(p1, p2, full_process=False) \
        * unbase_scale * partial_scale
    ptser = partial_token_set_ratio(p1, p2, full_process=False) \
        * unbase_scale * partial_scale

    return utils.intr(max(base, partial, ptsor, ptser))
300
+
301
+
302
def UWRatio(s1, s2, full_process=True):
    """Return a measure of the sequences' similarity between 0 and 100,
    using different algorithms. Same as WRatio but preserving unicode.

    Calls WRatio with force_ascii set to False.

    :param s1: first string
    :param s2: second string
    :param full_process: forwarded to WRatio (Default: True)
    :return: similarity score
    """
    return WRatio(
        s1,
        s2,
        force_ascii=False,
        full_process=full_process,
    )