Add n-gram update
Browse files
main.py
CHANGED
@@ -25,15 +25,16 @@ def add_dict(a, b):
|
|
25 |
return temp
|
26 |
|
27 |
class Chatbot:
|
28 |
-
def __init__(self, name=None, letter_replace: bool = True, data: dict = None, frequency_weight: float = 0, div_by_len: bool = False):
    """Store the chatbot configuration and optionally train immediately.

    Args:
        name: Stored unchanged as ``self.name``.
        letter_replace: Stored unchanged as ``self.letter_replace``.
        data: When not ``None``, handed to ``self.train`` after every
            attribute has been assigned.
        frequency_weight: Stored unchanged as ``self.frequency_weight``.
        div_by_len: Stored unchanged as ``self.div_by_len``.
    """
    # All attributes are assigned before any training call is made.
    self.model = {}
    self.name, self.letter_replace = name, letter_replace
    self.frequency_weight, self.div_by_len = frequency_weight, div_by_len
    if data is None:
        return
    self.train(data)
|
36 |
-
def tokenize(self, text: str):
# Pre-change version of tokenize as shown on the removed side of this diff.
# It walks the lowercased text one character at a time and accumulates the
# result in the string `preprocess`.
# NOTE(review): no return statement is visible for this version in the hunk,
# and part of the loop body is elided - confirm against the full file.
|
37 |
preprocess = ""
|
38 |
for x in text.lower():
|
39 |
# `letters` is defined outside this view.
if x in letters:
|
@@ -42,8 +43,13 @@ class Chatbot:
|
|
42 |
else:
|
43 |
preprocess += x
|
44 |
else:
|
45 |
-
# Characters not in `letters` are appended with a space on each side.
preprocess += " "+x+" "
|
46 |
-
|
|
|
|
|
|
|
|
|
|
|
47 |
def train(self, data: dict):
|
48 |
lendata = len(data)
|
49 |
lendata_div = 1/lendata
|
|
|
25 |
return temp
|
26 |
|
27 |
class Chatbot:
|
28 |
+
def __init__(self, name = None, n: int = 1, letter_replace: bool = True, data: dict = None, frequency_weight: float = 0, div_by_len: bool = False):
    """Store the chatbot configuration and optionally train immediately.

    Args:
        name: Stored unchanged as ``self.name``.
        n: N-gram setting; kept on the instance as ``n - 1`` in ``self.n``,
            which is the value ``tokenize`` passes to ``range``.
        letter_replace: Stored unchanged as ``self.letter_replace``.
        data: When not ``None``, handed to ``self.train`` after every
            attribute has been assigned.
        frequency_weight: Stored unchanged as ``self.frequency_weight``.
        div_by_len: Stored unchanged as ``self.div_by_len``.
    """
    # All attributes are assigned before any training call is made.
    self.model = {}
    self.n = n - 1
    self.name, self.letter_replace = name, letter_replace
    self.frequency_weight, self.div_by_len = frequency_weight, div_by_len
    if data is None:
        return
    self.train(data)
|
37 |
+
def tokenize(self, text: str, n: int = 1):
# Lowercases `text`, rebuilds it character by character into `preprocess`,
# splits that string on whitespace, and returns the resulting tokens followed
# by space-joined windows of consecutive tokens.
# NOTE(review): the `n` parameter is not read in the lines visible here; the
# window loop below uses `self.n` (set to n - 1 in __init__) instead.
|
38 |
preprocess = ""
|
39 |
for x in text.lower():
|
40 |
# `letters` is defined outside this view; new-file lines 41-42 of this branch
# are elided by the diff hunk.
if x in letters:
|
|
|
43 |
else:
|
44 |
preprocess += x
|
45 |
else:
|
46 |
+
# Characters not in `letters` get a space on each side, so split() below
# returns each of them as its own token.
preprocess += " " + x + " "
|
47 |
+
tokens = preprocess.split()
|
48 |
+
# Start from a copy of the single tokens; joined windows are appended after.
output = tokens.copy()
|
49 |
+
# NOTE(review): probable off-by-one. For i == 0, tokens[:-0] is tokens[:0]
# (an empty list), so nothing is appended. For i == 1 the window
# tokens[num:num+1] is a single token, so the unigrams (minus the last) are
# appended a second time. The window is i tokens wide and i stops at
# self.n - 1 == n - 2, so constructing with n yields at most (n-2)-grams.
# Presumably the intent was windows of i + 1 tokens for i in
# range(1, self.n + 1) - confirm with the author.
for i in range(self.n):
|
50 |
+
# `word` is unused; only the index `num` feeds the slice below.
for num, word in enumerate(tokens[:-i]):
|
51 |
+
output.append(' '.join(tokens[num:num+i]))
|
52 |
+
return output
|
53 |
def train(self, data: dict):
|
54 |
lendata = len(data)
|
55 |
lendata_div = 1/lendata
|