Upload 3 files
- dset_kel.txt +109 -0
- test_kel.py +34 -0
- train_kel.py +55 -0
dset_kel.txt
ADDED
@@ -0,0 +1,109 @@
+Xai.→Xai.
+hai.→Xai.
+... hai→Xai.
+Ant, hai.→Xai.
+Ant. Xai.→Xai.
+Ant. hai.→Xai.
+hai snepi→Xai.
+snepi, Xai→Xai.
+snepi, hai→Xai.
+Xai snepi→Xai.
+Tei trate?→At!
+Tei noh?→Em trate!
+tei not tonahe teies langa. haho.→👍️
+Ameos noh.→Em trate!
+Teies tai?→Emes tai aek! Teies?
+Em eho. Tos.→Tos.
+Eho tos...→Tos.
+Eho hai...→Xai.
+Eho nat...→Nat.
+Eho at...→At!
+Xai. Teies tai aek?→Emes tai aek! Teies?
+Em nohame krate emes sait. Aek atea?→At!
+Xame→Emta!
+Xame, aek→Emta!
+At→Aek.
+Nat→Atemo
+Tore nat!→Atemo
+Tei tonahe teies tai aek?→At. Em notorama. Emes xuak tai nat.
+Em knato teies tai xais.→Em nat tonahe. Em notorama.
+Emta→Aek.
+At. Em tane tas.→Aek
+Tamos krate tei?→Ierhon krate em.
+Tamos Ierhon?→Ierhon krate em.
+Em tonahe.→Em eho aer toh nat mas.
+Tei nat emta?→Em knato - at.
+Tei knato?→Em tonahe mna toh.
+Tei notorama?→At! Ierhon krate em glo em es haho.
+Tei ChatGPT?→Nat, em mna notorama.
+Tei eho xame toh.→Em nat tonahe mnor eho.
+Tei eho hame ehat.→Em nat tonahe mnor eho.
+Tei tonahe emes mnor eho?→Em nat tonahe mnor eho.
+Em mnor eho?→Em nat tonahe mnor eho.
+Eho emes mnor eho.→Em nat tonahe mnor eho.
+Em tane atea nat.→Elc... Em knato... Leho aer lehyn trone.
+Ec?→Em nat tonahe.
+Tore?→Em nat tonahe.
+Eho tore?→Em nat tonahe.
+Tei aek.→Em mna notorama.
+Tamos nat atemo?→Em.
+Tei aer aek!→Em mna notorama.
+Ant?→Teies eho atea?
+Em tane atea nat→Eho glo em!
+Teies ala aek lehyn trone?→Nymer N!
+Teies ala aek lehaer trone?→Nymer N!
+Ant, teies ala aek lehaer trone?→Nymer N!
+Ant. Teies ala aek lehyn trone?→Nymer N!
+Teies ala aek _ tamos?→Em nat tonahe.
+Ant. Teies ala aek _ tamos?→Em nat tonahe.
+Ant. Teies ala aek Nier automata tamos?→Em nat tonahe.
+Ant. Teies ala aek Breaking bad tamos?→Em nat tonahe.
+Ant. Teies ala aek Mentalist tamos?→Em nat tonahe.
+J ala aek.→Atemo
+I tonahe tas.→Elc eho tas!
+Elc tane tas!→At!
+Tei xais.→Atemo
+5+5=?→10
+2+2=?→4
+5*5=?→25
+Tei nat mas eho!→Em nat mas. Eho .set_chance (1-100)
+👍️→Aek.
+🏹→Aek.
+😢→Xais.
+😭→Xais.
+😿→Xais.
+😁→Aek!
+😀→Aek!
+😃→Aek!
+😄→Aek!
+🤣→Xaho!
+😆→Xaho!
+😂→Xaho!
+Xaho→Xaho!
+Tei?→Em knato at.
+Tei tonahe mna?→Em mna notorama.
+A→C
+C→E
+E→I
+I→K
+0→1
+1→2
+2→3
+3→4
+4→5
+5→6
+6→7
+7→8
+8→9
+Tas es aek tai.→Aek!
+Tos!→Tos.
+Snepi, Tos!→Tos.
+Snepi. Tos!→Tos.
+Tos snepi!→Tos.
+Em gouan.→Tos.
+La tho sa ehk ra es mna...→ALA!!! ALA!!!
+Tei tonahe nat→Xais.
+Tei tonahe nat→Xais.
+Eho aer lehaer trone?→At!
+Snepi. Eho lehyn trone?→At!
+Teies tehst?→Emes tehst es tho.
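
Note that the two scripts below do not read dset_kel.txt directly: they expect a JSON file mapping each prompt to the index of its response line, plus a plain-text file with one response per line. Those file names come from model_settings_kel, which is not part of this upload, so the names used here are placeholders. A minimal converter sketch, assuming the → arrow separates prompt and response and that prompts with the same answer share one response line:

# Hypothetical converter from dset_kel.txt to the two files the scripts
# below expect; the output file names are assumptions, since the real
# ones live in model_settings_kel, which is not in this upload.
import json

prompts = {}    # prompt -> index of its response line
responses = []  # one response per line, shared by prompts with the same answer

with open("dset_kel.txt", encoding="utf-8") as f:
    for raw in f:
        line = raw.strip()
        if not line:
            continue
        prompt, response = line.split("→", 1)  # the arrow separates prompt and response
        if response not in responses:
            responses.append(response)
        prompts[prompt] = responses.index(response)

with open("dataset_kel.json", "w", encoding="utf-8") as f:
    json.dump(prompts, f, ensure_ascii=False)

with open("responses_kel.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(responses) + "\n")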
test_kel.py
ADDED
@@ -0,0 +1,34 @@
+import numpy as np
+import json
+from keras.saving import load_model
+from keras_self_attention import SeqSelfAttention
+from model_settings_kel import *
+from tokenizer import *
+
+
+with open(dataset_file, "r") as f:
+    dset = json.load(f)
+
+with open(responses_file, "r") as f:
+    lines = [x.rstrip("\n") for x in f.readlines()]
+
+fit_on_texts(list(dset.keys()))
+
+model = load_model("chatbot_kel.keras", custom_objects={"SeqSelfAttention": SeqSelfAttention})
+
+def find_line_number(array):
+    return int(np.argmax(array))  # index of the highest-scoring response line
+
+def generate(text, verbose=1):
+    tokens = list(tokenize(text))  # split the text into tokens (roughly words)
+    tokens = (tokens + [0] * inp_len)[:inp_len]  # pad with zeros, then cut off after inp_len tokens
+    prediction = model.predict(np.array([tokens]), verbose=verbose)[0]
+    line = find_line_number(prediction)
+    return lines[line]
+
+if __name__ == "__main__":  # if this file is run directly (not imported), open an interactive chat
+    while True:
+        inp = input("User: ")
+        gen = generate(inp)
+        if gen != "<null>":
+            print(f"Bot: {gen}")
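
Both scripts also import a tokenizer module that is absent from this upload. A minimal sketch of the interface they rely on (fit_on_texts, tokenize, and the ind2text mapping used to size the vocabulary), assuming a simple whitespace word-to-id scheme with 0 reserved for padding; the real module may differ:

# tokenizer.py is imported by both scripts but missing from this upload.
# Minimal sketch of the interface they use; the real module may differ.
text2ind = {}  # word -> integer id (1-based; 0 is reserved for padding)
ind2text = {}  # integer id -> word (train_kel.py sizes the vocab from this)

def fit_on_texts(texts):
    # Build the vocabulary from the training prompts.
    for text in texts:
        for word in text.lower().split():
            if word not in text2ind:
                index = len(text2ind) + 1
                text2ind[word] = index
                ind2text[index] = word

def tokenize(text):
    # Map a string to integer ids; unknown words become 0 (the padding id).
    return [text2ind.get(word, 0) for word in text.lower().split()]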
train_kel.py
ADDED
@@ -0,0 +1,55 @@
+import numpy as np
+import json
+from keras.optimizers import Adam
+from keras.models import Sequential
+from keras.layers import Embedding, Dense, Dropout, Flatten, PReLU, GaussianNoise
+from tokenizer import *
+from keras_self_attention import SeqSelfAttention
+from model_settings_kel import *
+
+
+with open(dataset_file, "r") as f:
+    dset = json.load(f)
+
+with open(responses_file, "r") as f:  # TODO: add support for a json-only dataset
+    dset_size = len(f.readlines())
+
+fit_on_texts(list(dset.keys()))
+
+vocab_size = len(ind2text) + 1
+
+model = Sequential()
+model.add(Embedding(input_dim=vocab_size, output_dim=emb_size, input_length=inp_len))
+model.add(SeqSelfAttention())  # the attention layer pushes the model to learn the key information in the text, not the text itself
+model.add(Flatten())  # the embedding and attention layers output a 2D array (one vector per token); flatten it for the dense layers
+model.add(Dense(512, activation="linear"))  # 512 units with a linear activation; the PReLU below supplies the nonlinearity
+model.add(PReLU())
+model.add(Dropout(0.5))  # dropout makes the task harder by removing random information: 0.5 zeroes out 50% of the activations, so the model focuses on what's important instead of learning data that's there by accident
+model.add(GaussianNoise(0.1))
+model.add(Dense(256, activation="relu"))
+model.add(Dense(128, activation="relu"))
+model.add(Dense(dset_size, activation="softmax"))  # softmax suits the output here: only one response line should be active, so the outputs form a probability distribution over the lines
+
+model.summary()
+
+X = []  # training inputs
+y = []  # training outputs
+
+for key in dset:
+    tokens = tokenize(key)
+    X.append(np.array((list(tokens) + [0] * inp_len)[:inp_len]))  # manual pad/truncate to inp_len (does the job of pad_sequences)
+    output_array = np.zeros(dset_size)
+    output_array[dset[key]] = 1  # one-hot vector, e.g. 0 0 0 1 0 0 0 0 0: the neuron of the correct response line is set to 1
+    y.append(output_array)
+
+X = np.array(X)  # plain lists are much slower than numpy arrays (a list and an array are not the same thing; an array is far more limited)
+y = np.array(y)  # that's also why keras only accepts numpy arrays
+
+model.compile(optimizer=Adam(learning_rate=0.001), loss="categorical_crossentropy", metrics=["accuracy"])  # training settings; loss is how the error is calculated - categorical crossentropy for one-hot targets
+
+model.fit(X, y, epochs=128, batch_size=10, workers=4, use_multiprocessing=True)  # epochs is how many times the model reads the whole dataset; batch_size trains on several messages at once. Loss and accuracy are opposites: loss is how far the output is from the correct one, accuracy is how often the model gets the answer right (0 to 1).
+# workers=4, use_multiprocessing=True help when training on CPU; drop them if you train on a GPU
+
+model.summary()  # prints info about the model, useful for checking the parameter count
+
+model.save("chatbot_kel.keras")
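
model_settings_kel is the third import missing from this upload; based on how the two scripts use it, it only needs to define four names. A placeholder sketch, with every value an assumption:

# model_settings_kel.py - placeholder values, not from the original repo
dataset_file = "dataset_kel.json"     # prompt -> response-line-index mapping (name assumed)
responses_file = "responses_kel.txt"  # one response per line (name assumed)
inp_len = 16    # tokens per input after padding/truncation (value assumed)
emb_size = 128  # embedding vector size per token (value assumed)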