Upload 5 files
- chatbot_kel.py +34 -0
- dataset_kel.json +1 -0
- model_settings_kel.py +4 -0
- responses_kel.txt +47 -0
- tokenizer.py +59 -0
chatbot_kel.py
ADDED
@@ -0,0 +1,34 @@
import numpy as np
import json
from keras.saving import load_model
from keras_self_attention import SeqSelfAttention
from model_settings_kel import *
from tokenizer import *


with open(dataset_file, "r") as f:
    dset = json.load(f)

with open(responses_file, "r") as f:
    lines = [x.rstrip("\n") for x in f.readlines()]

fit_on_texts(list(dset.keys()))  # build the vocabulary from the dataset phrases

model = load_model("chatbot_kel.keras", custom_objects={"SeqSelfAttention": SeqSelfAttention})

def find_line_number(array):
    return int(np.argmax(array))  # index of the largest score = number of the response line

def generate(text, verbose=1):
    tokens = list(tokenize(text.lower()))  # text into token ids (almost words)
    tokens = (tokens + [0] * inp_len)[:inp_len]  # pad with 0 and cut to inp_len tokens
    prediction = model.predict(np.array([tokens]), verbose=verbose)[0]
    line = find_line_number(prediction)
    return lines[line]

if __name__ == "__main__":  # if this code is not being imported, open the chat
    while True:
        inp = input("User: ")
        gen = generate(inp)
        if gen != "<null>": print(f"Bot: {gen}")
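
Note: chatbot_kel.keras is loaded above but is not among the five files in this upload, and no training script is included. For orientation only, here is a minimal training sketch, assuming an Embedding -> LSTM -> SeqSelfAttention -> Dense classifier over the response line numbers; the 64-unit LSTM, the optimizer, and the epoch/batch settings are assumptions, not the author's actual setup.

import json
import numpy as np
from keras import Sequential
from keras.layers import Embedding, LSTM, Flatten, Dense
from keras_self_attention import SeqSelfAttention
from model_settings_kel import *
from tokenizer import *

with open(dataset_file, "r") as f:
    dset = json.load(f)

fit_on_texts(list(dset.keys()))

# pad with 0 and cut to inp_len, exactly as generate() does
X = np.array([(list(tokenize(k)) + [0] * inp_len)[:inp_len] for k in dset])
y = np.array(list(dset.values()))

model = Sequential([
    Embedding(len(ind2text), emb_size),             # token ids -> emb_size vectors
    LSTM(64, return_sequences=True),                # assumed recurrent layer size
    SeqSelfAttention(attention_activation="sigmoid"),
    Flatten(),
    Dense(int(y.max()) + 1, activation="softmax"),  # one unit per response line
])
model.compile("adam", "sparse_categorical_crossentropy", metrics=["accuracy"])
model.fit(X, y, epochs=100, batch_size=8)           # assumed hyperparameters
model.save("chatbot_kel.keras")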
dataset_kel.json
ADDED
@@ -0,0 +1 @@
{"xai.": 0, "hai.": 0, "... hai": 0, "ant, hai.": 0, "ant. xai.": 0, "ant. hai.": 0, "hai snepi": 0, "snepi, xai": 0, "snepi, hai": 0, "xai snepi": 0, "tei trate?": 1, "tei noh?": 2, "teies tai?": 3, "xai. teies tai?": 3, "at": 4, "nat": 5, "tore nat!": 5, "tei tonahe teies tai aek?": 6, "em knato teies tai xais.": 7, "emta": 4, "at. em tane tas.": 8, "tamos krate tei?": 9, "tamos ierhon?": 9, "em tonahe.": 10, "tei nat emta?": 11, "tei knato?": 12, "tei notorama?": 13, "tei chatgpt?": 14, "tei eho hame ehat.": 15, "tei tonahe emes mnor eho?": 15, "em mnor eho?": 15, "eho emes mnor eho.": 15, "em tane atea nat.": 16, "ec?": 17, "tore?": 17, "eho tore?": 17, "tei aek.": 18, "tamos nat atemo?": 19, "tei aer aek!": 18, "ant?": 20, "em tane atea nat": 21, "teies aek lehyn trone?": 22, "teies aek lehaer trone?": 22, "ant, teies aek lehaer trone?": 22, "ant. teies aek lehyn trone?": 22, "teies aek _ tamos?": 17, "ant. teies aek _ tamos?": 17, "ant. teies aek nier automata tamos?": 17, "ant. teies aek breaking bad tamos?": 17, "ant. teies aek mentalist tamos?": 17, "j aek.": 5, "i tonahe tas.": 23, "elc tane tas!": 1, "tei xais.": 5, "5+5=?": 24, "2+2=?": 25, "5*5=?": 26, "tei nat mas eho!": 27, "👍️": 4, "🏹": 4, "😢": 28, "😭": 28, "😿": 28, "😁": 29, "😀": 29, "😃": 29, "😄": 29, "🤣": 30, "😆": 30, "😂": 30, "xaho": 30, "tei?": 31, "tei tonahe mna?": 18, "a": 32, "c": 33, "e": 34, "i": 35, "0": 36, "1": 37, "2": 38, "3": 25, "4": 39, "5": 40, "6": 41, "7": 42, "8": 43, "tas es aek tai.": 29, "tos!": 44, "snepi, tos!": 44, "snepi. tos!": 44, "tos snepi!": 44, "em gouan.": 44, "la tho sa ehk ra es mna...": 45, "tei tonahe nat": 28, "eho aer lehaer trone?": 1, "snepi. eho lehyn trone?": 1, "teies tehst?": 46}
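
Each key is an input phrase and each value is a 0-based line number into responses_kel.txt, so a lookup is a dictionary access followed by a list index. A small illustration using an entry from the file above:

import json

with open("dataset_kel.json") as f:
    dset = json.load(f)
with open("responses_kel.txt") as f:
    responses = [line.rstrip("\n") for line in f]

print(responses[dset["tei trate?"]])  # label 1 -> second response line: "at!"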
model_settings_kel.py
ADDED
@@ -0,0 +1,4 @@
dataset_file = "dataset_kel.json"
responses_file = "responses_kel.txt"
emb_size = 128  # how big the word vectors in the input are (how much information can fit into one word)
inp_len = 10  # limit on the input length: after 10 tokens the input is cut off, shorter inputs are padded with zeros
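
For illustration, inp_len drives the pad-or-truncate idiom used in chatbot_kel.py: shorter token lists are right-padded with the <NULL> id 0, longer ones are cut. The token ids below are made up:

from model_settings_kel import inp_len

tokens = [5, 2, 6, 7]                        # hypothetical token ids
padded = (tokens + [0] * inp_len)[:inp_len]  # right-pad with 0, then cut to inp_len
print(padded)                                # [5, 2, 6, 7, 0, 0, 0, 0, 0, 0]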
responses_kel.txt
ADDED
@@ -0,0 +1,47 @@
xai.
at!
em trate!
emes tai aek! teies?
aek.
atemo
at. em notorama. emes xuak tai nat.
em nat tonahe. em notorama.
aek
ierhon krate em.
em eho aer toh nat mas.
em knato - at.
em tonahe mna toh.
at! ierhon krate em glo em es haho.
nat, em mna notorama.
em nat tonahe mnor eho.
elc... em knato... leho aer lehyn trone.
em nat tonahe.
em mna notorama.
em.
teies eho atea?
eho glo em!
nymer n!
elc eho tas!
10
4
25
em nat mas. eho .set_chance (1-100)
xais.
aek!
xaho!
em knato at.
c
e
i
k
1
2
3
5
6
7
8
9
tos.
ala!!! ala!!!
emes tehst es 97.
tokenizer.py
ADDED
@@ -0,0 +1,59 @@
import numpy as np

# characters that end a token; every non-space separator also becomes its own token
s = " `1234567890-=~!@#$%^&*()_+[;,{:<];.}:>\\'/|\"?\n–№…«»→"

def split(text):
    o = []
    t = ""
    for i in text + " ":  # the trailing space flushes the final token
        if i in s:
            if t != "":
                o.append(t)
                t = ""
            if i != " ":
                o.append(i)
        else:
            t += i
    return o

def tokenize_2str(text: str):
    # split the text, then peel a trailing "es" off into its own <es> token
    o = []
    for i in split(text):
        if i[-2:] == "es":
            o.append(i[:-2])
            o.append("<es>")
        else:
            o.append(i)
    return o

ind2text = ["<NULL>", "<UNK>", "<es>"]           # id -> token
text2ind = {"<NULL>": 0, "<UNK>": 1, "<es>": 2}  # token -> id

def fit_on_text(text: str):
    # add every previously unseen token of the text to the vocabulary
    for i in tokenize_2str(text):
        if i not in text2ind:
            ind2text.append(i)
            text2ind[i] = len(ind2text) - 1

def fit_on_texts(texts):
    for text in texts:
        fit_on_text(text)

def tokenize(text: str):
    # string tokens -> ids; anything outside the vocabulary maps to <UNK>
    o = []
    for i in tokenize_2str(text):
        if i in text2ind:
            o.append(text2ind[i])
        else:
            o.append(text2ind["<UNK>"])
    return np.array(o)
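
Usage sketch: the vocabulary starts with the three special tokens, so the ids a token gets depend on the order phrases are fitted. With just these two phrases the result is deterministic:

from tokenizer import fit_on_texts, tokenize

fit_on_texts(["xai.", "teies tai?"])
print(tokenize("xai."))        # [3 4]     -> "xai", "."
print(tokenize("teies tai?"))  # [5 2 6 7] -> "tei", "<es>", "tai", "?"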