ierhon committed on
Commit
d80c106
1 Parent(s): 55824d8

Upload 5 files

Files changed (5)
  1. chatbot_kel.py +34 -0
  2. dataset_kel.json +1 -0
  3. model_settings_kel.py +4 -0
  4. responses_kel.txt +47 -0
  5. tokenizer.py +59 -0
chatbot_kel.py ADDED
@@ -0,0 +1,34 @@
+ import numpy as np
+ from keras.saving import load_model
+ from keras.preprocessing.text import Tokenizer
+ from keras_self_attention import SeqSelfAttention
+ from model_settings_kel import *
+ import json
+ from tokenizer import *
+
+
+ with open(dataset_file, "r") as f:
+     dset = json.load(f)
+
+ with open(responses_file, "r") as f:
+     lines = [x.rstrip("\n") for x in f.readlines()]
+
+ fit_on_texts(list(dset.keys()))  # build the tokenizer vocabulary from the dataset prompts
+
+ model = load_model("chatbot_kel.keras", custom_objects={"SeqSelfAttention": SeqSelfAttention})
+
+ def find_line_number(array):
+     return sorted(zip(list(array), range(len(array))), key=lambda x: x[0], reverse=True)[0][1]  # find the biggest value and return its index, i.e. the response line number
+
+ def generate(text, verbose=1):
+     tokens = list(tokenize(text.lower()))  # turn the text into token ids (roughly words)
+     tokens = (tokens + [0,] * inp_len)[:inp_len]  # pad with zeros and cut the sentence off after inp_len tokens
+     prediction = model.predict(np.array([tokens,]), verbose=verbose)[0]
+     line = find_line_number(prediction)
+     return lines[line]
+
+ if __name__ == "__main__":  # if this code is not being imported, open the chat
+     while True:
+         inp = input("User: ")
+         gen = generate(inp)
+         if gen != "<null>": print(f"Bot: {gen}")
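chatbot_kel.py works as a retrieval-style classifier: it tokenizes the user's input, pads or truncates it to inp_len tokens, runs the Keras model, and returns whichever line of responses_kel.txt scored highest. Below is a minimal usage sketch for calling it from another script instead of the built-in CLI loop; it assumes the trained chatbot_kel.keras file sits next to these sources, and the prompt strings are just examples taken from the dataset.

# Hypothetical usage sketch (not part of the commit): reuse generate() programmatically.
from chatbot_kel import generate

for prompt in ["xai.", "tei trate?", "teies tai?"]:
    reply = generate(prompt, verbose=0)   # verbose=0 silences the Keras progress bar
    if reply != "<null>":                 # same "no response" sentinel the CLI loop checks
        print(f"{prompt} -> {reply}")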
dataset_kel.json ADDED
@@ -0,0 +1 @@
+ {"xai.": 0, "hai.": 0, "... hai": 0, "ant, hai.": 0, "ant. xai.": 0, "ant. hai.": 0, "hai snepi": 0, "snepi, xai": 0, "snepi, hai": 0, "xai snepi": 0, "tei trate?": 1, "tei noh?": 2, "teies tai?": 3, "xai. teies tai?": 3, "at": 4, "nat": 5, "tore nat!": 5, "tei tonahe teies tai aek?": 6, "em knato teies tai xais.": 7, "emta": 4, "at. em tane tas.": 8, "tamos krate tei?": 9, "tamos ierhon?": 9, "em tonahe.": 10, "tei nat emta?": 11, "tei knato?": 12, "tei notorama?": 13, "tei chatgpt?": 14, "tei eho hame ehat.": 15, "tei tonahe emes mnor eho?": 15, "em mnor eho?": 15, "eho emes mnor eho.": 15, "em tane atea nat.": 16, "ec?": 17, "tore?": 17, "eho tore?": 17, "tei aek.": 18, "tamos nat atemo?": 19, "tei aer aek!": 18, "ant?": 20, "em tane atea nat": 21, "teies aek lehyn trone?": 22, "teies aek lehaer trone?": 22, "ant, teies aek lehaer trone?": 22, "ant. teies aek lehyn trone?": 22, "teies aek _ tamos?": 17, "ant. teies aek _ tamos?": 17, "ant. teies aek nier automata tamos?": 17, "ant. teies aek breaking bad tamos?": 17, "ant. teies aek mentalist tamos?": 17, "j aek.": 5, "i tonahe tas.": 23, "elc tane tas!": 1, "tei xais.": 5, "5+5=?": 24, "2+2=?": 25, "5*5=?": 26, "tei nat mas eho!": 27, "👍️": 4, "🏹": 4, "😢": 28, "😭": 28, "😿": 28, "😁": 29, "😀": 29, "😃": 29, "😄": 29, "🤣": 30, "😆": 30, "😂": 30, "xaho": 30, "tei?": 31, "tei tonahe mna?": 18, "a": 32, "c": 33, "e": 34, "i": 35, "0": 36, "1": 37, "2": 38, "3": 25, "4": 39, "5": 40, "6": 41, "7": 42, "8": 43, "tas es aek tai.": 29, "tos!": 44, "snepi, tos!": 44, "snepi. tos!": 44, "tos snepi!": 44, "em gouan.": 44, "la tho sa ehk ra es mna...": 45, "tei tonahe nat": 28, "eho aer lehaer trone?": 1, "snepi. eho lehyn trone?": 1, "teies tehst?": 46}
model_settings_kel.py ADDED
@@ -0,0 +1,4 @@
+ dataset_file = "dataset_kel.json"
+ responses_file = "responses_kel.txt"
+ emb_size = 128 # how big the word vectors in the input are (how much information can fit into one word)
+ inp_len = 10 # limit of the input length; after 10 words the input is cut off
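The trained chatbot_kel.keras file itself is not part of this commit; only the settings above and the SeqSelfAttention custom object loaded in chatbot_kel.py hint at its shape. Purely as a hypothetical sketch, a model consistent with emb_size, inp_len, and the 47 response lines might look like the following; the real architecture may differ.

# Hypothetical architecture sketch only; the actual chatbot_kel.keras is not in this commit.
import keras
from keras import layers
from keras_self_attention import SeqSelfAttention
from model_settings_kel import emb_size, inp_len

vocab_size = 1000   # assumption: the real size comes from the fitted tokenizer vocabulary
num_responses = 47  # one class per line of responses_kel.txt

model = keras.Sequential([
    keras.Input(shape=(inp_len,)),
    layers.Embedding(vocab_size, emb_size),            # emb_size-dimensional word vectors
    SeqSelfAttention(attention_activation="sigmoid"),  # the custom layer chatbot_kel.py expects
    layers.Flatten(),
    layers.Dense(num_responses, activation="softmax"),
])
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")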
responses_kel.txt ADDED
@@ -0,0 +1,47 @@
+ xai.
+ at!
+ em trate!
+ emes tai aek! teies?
+ aek.
+ atemo
+ at. em notorama. emes xuak tai nat.
+ em nat tonahe. em notorama.
+ aek
+ ierhon krate em.
+ em eho aer toh nat mas.
+ em knato - at.
+ em tonahe mna toh.
+ at! ierhon krate em glo em es haho.
+ nat, em mna notorama.
+ em nat tonahe mnor eho.
+ elc... em knato... leho aer lehyn trone.
+ em nat tonahe.
+ em mna notorama.
+ em.
+ teies eho atea?
+ eho glo em!
+ nymer n!
+ elc eho tas!
+ 10
+ 4
+ 25
+ em nat mas. eho .set_chance (1-100)
+ xais.
+ aek!
+ xaho!
+ em knato at.
+ c
+ e
+ i
+ k
+ 1
+ 2
+ 3
+ 5
+ 6
+ 7
+ 8
+ 9
+ tos.
+ ala!!! ala!!!
+ emes tehst es 97.
tokenizer.py ADDED
@@ -0,0 +1,59 @@
+ import numpy as np
+
+ s = " `1234567890-=~!@#$%^&*()_+[;,{:<];.}:>\\'/|\"?\n–№…«»→"
+
+ def split(text):
+     # split the text on the separator characters in s, keeping non-space separators as their own tokens
+     o = []
+     t = ""
+     for i in text + " ":
+         if i in s:
+             if t != "":
+                 o.append(t)
+                 t = ""
+             if i != " ":
+                 o.append(i)
+                 t = ""
+         else:
+             t += i
+     return o
+
+ def tokenize_2str(text: str):
+     # split the text into string tokens and separate the "es" suffix into its own <es> token
+     text = split(text)
+
+     o = []
+
+     for i in text:
+         if i[-2:] == "es":
+             o.append(i[:-2])
+             o.append("<es>")
+         else:
+             o.append(i)
+     return o
+
+ ind2text = ["<NULL>", "<UNK>", "<es>"]
+ text2ind = {"<NULL>": 0, "<UNK>": 1, "<es>": 2}
+
+ def fit_on_text(text: str):
+     # add every unseen token in the text to the vocabulary
+     global ind2text
+     global text2ind
+     tokens = tokenize_2str(text)
+     for i in tokens:
+         if i not in ind2text:
+             ind2text.append(i)
+             text2ind[i] = len(ind2text) - 1
+
+ def fit_on_texts(texts):
+     for text in texts: fit_on_text(text)
+
+ def tokenize(text: str):
+     # convert the text into an array of token ids; unknown tokens become <UNK>
+     text = tokenize_2str(text)
+
+     o = []
+
+     for i in text:
+         if i in ind2text:
+             o.append(text2ind[i])
+         else:
+             o.append(text2ind['<UNK>'])
+     return np.array(o)
+
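The tokenizer builds its vocabulary incrementally: fit_on_texts() adds every unseen token to ind2text/text2ind, and tokenize() maps a sentence to token ids, sending unknown words to <UNK> (id 1) and splitting the "es" suffix into its own <es> token (id 2). A short usage sketch with made-up example sentences:

# Hypothetical usage sketch for tokenizer.py; the example sentences are arbitrary.
from tokenizer import fit_on_texts, tokenize, ind2text

fit_on_texts(["teies tai aek?", "em knato"])   # vocabulary: <NULL>, <UNK>, <es>, tei, tai, aek, ?, em, knato
ids = tokenize("teies zzz aek")                # "zzz" was never fitted
print(ids)                                     # [3 2 1 5]
print([ind2text[i] for i in ids])              # ['tei', '<es>', '<UNK>', 'aek']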