#langTraining.py : train a character-level seq2seq English-to-French model
#Load all the required modules.
from tensorflow.keras.models import Model
from tensorflow.keras.utils import plot_model
from tensorflow.keras.layers import Input,LSTM,Dense
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import pickle
#initialize all variables
input_texts=[]
target_texts=[]
input_characters=set()
target_characters=set()
#read dataset file
with open('eng-french.txt','r',encoding='utf-8') as f:
    rows=f.read().split('\n')
#read the first 10,000 rows from the dataset
for row in rows[:10000]:
    #split input and target by '\t' (tab)
    input_text,target_text = row.split('\t')
    #add '\t' at the start and '\n' at the end of the target text.
    target_text='\t' + target_text + '\n'
    input_texts.append(input_text.lower())
    target_texts.append(target_text.lower())
    #split the texts into characters and add them to the respective sets
    input_characters.update(list(input_text.lower()))
    target_characters.update(list(target_text.lower()))
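#for illustration: a raw row like "Go.\tVa !" (assuming the tab-separated
#eng-french.txt layout) becomes input "go." and target "\tva !\n".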
#sort input and target characters
input_characters = sorted(list(input_characters))
target_characters = sorted(list(target_characters))
#get the total number of distinct input and target characters
num_en_chars = len(input_characters)
num_dec_chars = len(target_characters)
#get the maximum length of input and target text.
max_input_length = max([len(i) for i in input_texts])
max_target_length = max([len(i) for i in target_texts])
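#quick sanity check (optional; the exact counts depend on your copy of the
#dataset, so they are printed here rather than assumed):
print("num_en_chars:",num_en_chars,"num_dec_chars:",num_dec_chars)
print("max_input_length:",max_input_length,"max_target_length:",max_target_length)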
def bagofcharacters(input_texts,target_texts):
    #initialize encoder input, decoder input and decoder target data.
    en_in_data=[] ; dec_in_data=[] ; dec_tr_data=[]
    #padding vectors: pad_en is the one-hot vector of the first sorted input
    #character (the space), pad_dec the one-hot vector of index 2 of the
    #sorted target characters (the space, given '\t' and '\n' sort first).
    pad_en=[1]+[0]*(len(input_characters)-1)
    pad_dec=[0]*(len(target_characters)) ; pad_dec[2]=1
    #CountVectorizer with analyzer='char' tokenizes by character and, with
    #binary=True, one-hot encodes: 1 if the character is present, 0 otherwise.
    #fit one vectorizer per vocabulary, once, outside the loop (its feature
    #indices match the sorted character lists).
    cv_inp=CountVectorizer(binary=True,analyzer='char').fit(input_characters)
    cv_tar=CountVectorizer(binary=True,analyzer='char').fit(target_characters)
    for i,(input_t,target_t) in enumerate(zip(input_texts,target_texts)):
        #transform each text into a sequence of one-hot character vectors.
        en_in_data.append(cv_inp.transform(list(input_t)).toarray().tolist())
        dec_in_data.append(cv_tar.transform(list(target_t)).toarray().tolist())
        #the decoder target is one timestep ahead: it drops the first
        #character, i.e. '\t'.
        dec_tr_data.append(cv_tar.transform(list(target_t)[1:]).toarray().tolist())
        #append the padding vector whenever the text is shorter than the
        #respective maximum input or target length.
        if len(input_t) < max_input_length:
            for _ in range(max_input_length-len(input_t)):
                en_in_data[i].append(pad_en)
        if len(target_t) < max_target_length:
            for _ in range(max_target_length-len(target_t)):
                dec_in_data[i].append(pad_dec)
        if (len(target_t)-1) < max_target_length:
            for _ in range(max_target_length-len(target_t)+1):
                dec_tr_data[i].append(pad_dec)
    #convert the lists to numpy arrays with data type float32
    en_in_data=np.array(en_in_data,dtype="float32")
    dec_in_data=np.array(dec_in_data,dtype="float32")
    dec_tr_data=np.array(dec_tr_data,dtype="float32")
    return en_in_data,dec_in_data,dec_tr_data
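#shape check for reference (these follow from the padding logic above):
# en_in_data.shape  == (len(input_texts),  max_input_length,  num_en_chars)
# dec_in_data.shape == (len(target_texts), max_target_length, num_dec_chars)
# dec_tr_data.shape == (len(target_texts), max_target_length, num_dec_chars)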
#create input object of total number of encoder characters
en_inputs = Input(shape=(None, num_en_chars))
#create LSTM with the hidden dimension of 256
#return_state=True to get the final hidden and cell states; the encoder's
#output sequence is not needed, so return_sequences stays False.
encoder = LSTM(256, return_state=True)
#discard encoder output and store hidden and cell state.
en_outputs, state_h, state_c = encoder(en_inputs)
en_states = [state_h, state_c]
#create input object of total number of decoder characters
dec_inputs = Input(shape=(None, num_dec_chars))
#create LSTM with the hidden dimension of 256
#return_sequences=True as we want the full output sequence;
#return_state=True also returns the states (discarded during training).
dec_lstm = LSTM(256, return_sequences=True, return_state=True)
#run the decoder LSTM initialized with the encoder's final states.
dec_outputs, _, _ = dec_lstm(dec_inputs, initial_state=en_states)
#Output layer with shape of total number of decoder characters
dec_dense = Dense(num_dec_chars, activation="softmax")
dec_outputs = dec_dense(dec_outputs)
#create Model and store all variables
model = Model([en_inputs, dec_inputs], dec_outputs)
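#a minimal sketch of how these layers would be rewired at inference time
#(kept as a comment; variable names here are illustrative, and the actual
#decoding loop typically lives in a separate script):
# en_model = Model(en_inputs, en_states)
# state_h_in = Input(shape=(256,)) ; state_c_in = Input(shape=(256,))
# d_out, d_h, d_c = dec_lstm(dec_inputs, initial_state=[state_h_in, state_c_in])
# dec_model = Model([dec_inputs, state_h_in, state_c_in],
#                   [dec_dense(d_out), d_h, d_c])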
#store the vocabularies and size constants for reuse at inference time
with open("training_data.pkl","wb") as f:
    pickle.dump({'input_characters':input_characters,'target_characters':target_characters,
                 'max_input_length':max_input_length,'max_target_length':max_target_length,
                 'num_en_chars':num_en_chars,'num_dec_chars':num_dec_chars},f)
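#the saved dictionary can be restored later like so (a sketch, assuming the
#inference script sits next to training_data.pkl):
# with open("training_data.pkl","rb") as f:
#     data = pickle.load(f)
# input_characters = data['input_characters']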
#uncomment to one-hot encode the data and train the model:
# en_in_data,dec_in_data,dec_tr_data = bagofcharacters(input_texts,target_texts)
# model.compile(
#     optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"]
# )
# model.fit(
#     [en_in_data, dec_in_data],
#     dec_tr_data,
#     batch_size=64,
#     epochs=200,
#     validation_split=0.2,
# )
#save the model (note: without uncommenting the block above it is untrained)
model.save("s2s")
#summary and model plot
model.summary()
#plot_model needs the pydot and graphviz packages to be installed
plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)
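#the saved model can be reloaded elsewhere with (a sketch):
# from tensorflow.keras.models import load_model
# model = load_model("s2s")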