|
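"""MinimalGPT: train a small GPT-style language model on a text corpus,
generate text from it, and save/restore the weights and tokenizer.
"""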
import os |
|
import json |
|
import tensorflow as tf |
|
from tqdm import tqdm |
|
from GPT import * |
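# The GPT module is expected to provide PositionalEmbedding, Decoder, and
# generate_output, which this script uses via the wildcard import above.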
|
import pickle |
|
import argparse |
|
import sys |
|
|
def save_module(save_weights, model, vectorizer, save_tokenizer): |
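    """Save the model weights (pickle) and the vectorizer vocabulary (JSON)."""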
    # Serialize plain NumPy arrays; pickling the tf.Variable objects in
    # model.weights directly is unreliable, while get_weights() is not.
    with open(save_weights, 'wb') as file:
        pickle.dump(model.get_weights(), file)

    vocabulary = vectorizer.get_vocabulary()

    # Escape control/non-ASCII characters so the vocabulary survives JSON,
    # and drop the two reserved tokens ('' and '[UNK]') that
    # TextVectorization re-creates on load.
    encoded_vocabulary = [word.encode('unicode_escape').decode('utf-8') for word in vocabulary]
    encoded_vocabulary = encoded_vocabulary[2:]

    with open(save_tokenizer, 'w') as f:
        json.dump(encoded_vocabulary, f)
    print("Vocabulary size saved: " + str(len(encoded_vocabulary)))
|
|
def read_file(f, vectorizer, chunk_size = 1024, starting_chunk = 0, ending_chunk = 5, gpt_input = 10): |
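    """Read the corpus in chunk_size-byte chunks and, for chunks numbered
    starting_chunk through ending_chunk, yield a tensor of vectorized
    training pairs: a sliding window of gpt_input tokens plus the next
    token as the target. Note that a word straddling a chunk boundary is
    split in two.
    """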
    i = 0

    while True:
        data = f.read(chunk_size)

        if not data or i > ending_chunk:
            break

        if starting_chunk <= i <= ending_chunk:
            file_contents = data.split()

            # Slide a window of gpt_input tokens over the chunk; the token
            # right after each window is its training target.
            input_tokens, output_tokens = [], []
            for j in range(len(file_contents) - gpt_input - 1):
                input_tokens += [file_contents[j : j + gpt_input]]
                output_tokens += [file_contents[j + gpt_input]]

            X = [' '.join(window) for window in input_tokens]
            Y = output_tokens

            X = vectorizer(X)
            Y = vectorizer(Y)

            # Pack inputs and target into one tensor: columns [:gpt_input]
            # hold the window, column -1 holds the next token.
            yield tf.concat([X, Y], 1)

        i += 1
|
|
def get_model(gpt_input, d_model, h, vocab_size, decoder_stacks, GPT_attention): |
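    """Assemble the model: token embedding, positional encoding, a stack of
    decoder_stacks Decoder blocks, then a flattened softmax projection over
    the vocabulary.
    """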
    input_words = tf.keras.layers.Input(shape = (gpt_input,))
    # The +2 (and +3 below) offsets leave room for TextVectorization's
    # reserved padding and OOV ids.
    embedding = tf.keras.layers.Embedding(vocab_size + 2, d_model)(input_words)
    positional_enc = PositionalEmbedding(words = gpt_input, embedding_size = d_model)(embedding)

    decoder = Decoder(num_heads = h, key_dim = gpt_input, key_embedding = d_model, GPT_attention = GPT_attention)(positional_enc)
    for _ in range(decoder_stacks - 1):
        decoder = Decoder(num_heads = h, key_dim = gpt_input, key_embedding = d_model, GPT_attention = GPT_attention)(decoder)

    decoder = tf.keras.layers.Flatten()(decoder)
    linear_layer = tf.keras.layers.Dense(vocab_size + 3)(decoder)
    softmax = tf.nn.softmax(linear_layer)

    return tf.keras.Model(inputs = input_words, outputs = softmax)
|
|
def MinimalGPT(data_path = '.',
               learning_rate = 0,
               output_length = 0,
               epochs = 1,
               batch_size = 1,
               gpt_input = 10,
               d_model = 128,
               h = 8,
               decoder_stacks = 1,
               starting_chunk = 0,
               ending_chunk = 5,
               chunk_size = 10,
               token_end = 40000,
               vocabulary_start = 0,
               vocabulary_end = 40000,
               save = False,
               load_tokenizer = None,
               load_weights = None,
               save_tokenizer = None,
               save_weights = None,
               optimizer = None,
               inference_only = False,
               return_model_and_vectorizer = False,
               return_model_and_vectorizer_and_output = False,
               GPT_attention = False,
               TPU = False):
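    """Build or restore the tokenizer and model, optionally train on the
    selected corpus chunks, generate output_length tokens, and save or
    return the artifacts depending on the flags. With TPU = True and
    save = True, the artifacts are returned so the caller can save them
    outside the TPU strategy scope.
    """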
|
    # chunk_size arrives in KB from the CLI; convert to bytes for f.read().
    if chunk_size:
        chunk_size *= 1024

    if not inference_only:
        with open(data_path, 'r', encoding = 'utf-8') as file:
            corpus = file.read()
|
    if load_tokenizer:
        with open(load_tokenizer, 'r') as f:
            encoded_vocabulary = json.load(f)

        # Undo the escaping applied in save_module before restoring the layer.
        vocabulary = [word.encode('utf-8').decode('unicode_escape') for word in encoded_vocabulary]
        vectorizer = tf.keras.layers.TextVectorization(standardize = None, split = 'whitespace')
        vectorizer.set_vocabulary(vocabulary)
        vocab_size = vectorizer.vocabulary_size()

    else:
        # Build the vocabulary from a slice of the corpus; this branch needs
        # the corpus, so it cannot run with inference_only = True.
        vocab = list(set(corpus.split()[vocabulary_start : vocabulary_end]))
        vocab_size = len(vocab)
        vectorizer = tf.keras.layers.TextVectorization(standardize = None, split = 'whitespace', vocabulary = vocab)
        print('New vectorizer created successfully...')
        print("Vocabulary size: " + str(vocab_size))
        del corpus
|
|
    if load_weights:
        # vectorizer.vocabulary_size() already counts the two reserved tokens
        # that save_module stripped, so rebuild with vocab_size - 2 to match
        # the stored weight shapes.
        model = get_model(gpt_input = gpt_input, d_model = d_model, h = h, decoder_stacks = decoder_stacks, vocab_size = vocab_size - 2, GPT_attention = GPT_attention)

        with open(load_weights, 'rb') as file:
            model.set_weights(pickle.load(file))
    else:
        model = get_model(gpt_input = gpt_input, d_model = d_model, h = h, decoder_stacks = decoder_stacks, vocab_size = vocab_size, GPT_attention = GPT_attention)

    model.summary()
|
    if not inference_only:
        if not optimizer:
            model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate), loss='sparse_categorical_crossentropy')
        else:
            model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy')

        # Train only when a positive learning rate is supplied.
        if learning_rate > 0:
            for epoch in tqdm(range(epochs)):
                with open(data_path, 'r', encoding='utf-8') as f:
                    chunk_number = 1
                    for chunk in read_file(f,
                                           vectorizer,
                                           chunk_size,
                                           starting_chunk,
                                           ending_chunk,
                                           gpt_input):
                        print('Chunk samples: ' + str(chunk.shape[0]))
                        model.fit(chunk[:, :gpt_input], tf.reshape(chunk[:, -1], (-1, 1)), batch_size = batch_size, epochs = 1)
                        print("Chunk number " + str(chunk_number) + "/" + str(ending_chunk - starting_chunk + 1) + " processed!")
                        chunk_number += 1
|
    output_seq = generate_output(gpt_input = gpt_input, model = model, vectorizer = vectorizer, text_size = output_length, input_sequence = [])

    if save and not TPU:
        print('Saving weights and tokenizer...')
        save_module(save_weights, model, vectorizer, save_tokenizer)

    if save and TPU:
        # Saving must happen outside the TPU strategy scope, so hand the
        # artifacts back to the caller.
        return save_weights, model, vectorizer, save_tokenizer, output_seq
|
|
    # Strip '@@ ' subword markers (present if the corpus was preprocessed
    # with BPE-style segmentation) from the generated text.
    if return_model_and_vectorizer:
        return model, vectorizer
    elif return_model_and_vectorizer_and_output:
        return model, vectorizer, output_seq.replace('@@ ', '')
    else:
        return output_seq.replace('@@ ', '')
|
|
def main(): |
|
print("This code is executed when the script file is called directly.") |
|
|
if __name__ == '__main__': |
|
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--data-path', help='File: corpus or training text [String]')
    parser.add_argument('-l', '--learning-rate', help='Float: learning rate. The model trains ONLY IF the rate is > 0, and skips training otherwise [Float]', type=float)
    parser.add_argument('-ol', '--output-length', help='Length of the output sequence to be generated [Int]', type=int)
    parser.add_argument('-e', '--epochs', help='Number of training epochs [Int]', type=int)
    parser.add_argument('-b', '--batch-size', help='Size of each batch [Int]', type=int)
    parser.add_argument('-s', '--gpt-input', help='Number of tokens of text the model inputs at a time [Int]', type=int)
    parser.add_argument('-dm', '--d-model', help='Embedding layer output dimensions [Int]', type=int)
    parser.add_argument('-p', '--multi-head', help='Number of multi-head attention layers in parallel [Int]', type=int)
    parser.add_argument('-ds', '--decoder-stacks', help='Number of stacked decoder layers [Int]', type=int)
    parser.add_argument('-sc', '--chunk-start', help='Chunk number in the corpus marking the starting point of training [Int]', type=int)
    parser.add_argument('-ec', '--chunk-end', help='Chunk number in the corpus marking the end point of training [Int]', type=int)
    parser.add_argument('-csz', '--chunk-size', help='Size of each chunk in KB [Int]', type=int)
    parser.add_argument('-vs', '--vocabulary-start', help='Token number in the corpus marking the starting point of the vocabulary data [Int]', type=int)
    parser.add_argument('-ve', '--vocabulary-end', help='Token number in the corpus marking the end point of the vocabulary data [Int]', type=int)
    parser.add_argument('-sd', '--save', help='Save the model and vectorizer data to disk [True/False]', action='store_true')
    parser.add_argument('-lt', '--load-tokenizer', help='File: vectorization layer vocabulary [File]')
    parser.add_argument('-lw', '--load-weights', help='File: model weights [File]')
    parser.add_argument('-st', '--save-tokenizer', help='File: where to save the vectorizer vocabulary [File]')
    parser.add_argument('-sw', '--save-weights', help='File: where to save the model weights [File]')
    parser.add_argument('-ot', '--optimizer', help='Optimizer consistent with the TensorFlow optimizer class [tf.keras.optimizers]')
    parser.add_argument('-i', '--inference-only', help='Only print the output of the model in inference mode [True/False]', action='store_true')
    parser.add_argument('-mv', '--model-vectorizer', help='Return (model, vectorizer) tuple [True/False]', action='store_true')
    parser.add_argument('-mvo', '--model-vectorizer-output', help='Return (model, vectorizer, output) tuple [True/False]', action='store_true')
    parser.add_argument('-ga', '--gpt-style-attention', help='Use GPT-style attention. Note: (d-model) must be divisible by (multi-head), otherwise the program will throw an error! [True/False]', action='store_true')
    parser.add_argument('-tpu', '--TPU', help='Use Tensor Processing Units (distributed learning)', action='store_true')
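    # Example invocation (illustrative only; the script name and file paths
    # are placeholders):
    #   python MinimalGPT.py -d corpus.txt -l 0.001 -ol 200 -e 1 -b 32 \
    #       -s 10 -dm 128 -p 8 -ds 1 -sc 0 -ec 5 -csz 10 -vs 0 -ve 40000 \
    #       -sd -st tokenizer.json -sw weights.pkl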
|
    args = parser.parse_args()

    data_path = args.data_path
    learning_rate = args.learning_rate
    output_length = args.output_length
    epochs = args.epochs
    batch_size = args.batch_size
    gpt_input = args.gpt_input
    d_model = args.d_model
    h = args.multi_head
    stacks = args.decoder_stacks
    chunk_start = args.chunk_start
    chunk_end = args.chunk_end
    chunk_size = args.chunk_size
    vocabulary_start = args.vocabulary_start
    vocabulary_end = args.vocabulary_end
    save = args.save
    load_tokenizer = args.load_tokenizer
    load_weights = args.load_weights
    save_tokenizer = args.save_tokenizer
    save_weights = args.save_weights
    optimizer = args.optimizer
    inference_only = args.inference_only
    model_and_vectorizer = args.model_vectorizer
    GPT_attention = args.gpt_style_attention
    model_vectorizer_output = args.model_vectorizer_output
|
    # Log the run configuration for reproducibility.
    configuration = {
        'data_path': args.data_path,
        'learning_rate': args.learning_rate,
        'output_length': args.output_length,
        'epochs': args.epochs,
        'batch_size': args.batch_size,
        'gpt_input': args.gpt_input,
        'd_model': args.d_model,
        'h': args.multi_head,
        'stacks': args.decoder_stacks,
        'chunk_start': args.chunk_start,
        'chunk_end': args.chunk_end,
        'chunk_size': args.chunk_size,
        'vocabulary_start': args.vocabulary_start,
        'vocabulary_end': args.vocabulary_end,
        'save': args.save,
        'load_tokenizer': args.load_tokenizer,
        'load_weights': args.load_weights,
        'save_tokenizer': args.save_tokenizer,
        'save_weights': args.save_weights,
        'optimizer': args.optimizer,
        'inference_only': args.inference_only,
        'model_and_vectorizer': args.model_vectorizer,
        'model_vectorizer_output': args.model_vectorizer_output,
        'GPT_Attention': args.gpt_style_attention
    }

    with open('last-configuration.json', 'w') as file:
        json.dump(configuration, file)
|
    if args.TPU:
        # Connect to the TPU cluster and initialize it before building the
        # model inside the distribution strategy scope.
        resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='')
        tf.config.experimental_connect_to_cluster(resolver)
        tf.tpu.experimental.initialize_tpu_system(resolver)
        print("All devices: ", tf.config.list_logical_devices('TPU'))

        strategy = tf.distribute.TPUStrategy(resolver)

        with strategy.scope():
            output = MinimalGPT(data_path = data_path,
                                learning_rate = learning_rate,
                                output_length = output_length,
                                epochs = epochs,
                                batch_size = batch_size,
                                gpt_input = gpt_input,
                                d_model = d_model,
                                h = h,
                                decoder_stacks = stacks,
                                starting_chunk = chunk_start,
                                ending_chunk = chunk_end,
                                chunk_size = chunk_size,
                                vocabulary_start = vocabulary_start,
                                vocabulary_end = vocabulary_end,
                                save = save,
                                load_tokenizer = load_tokenizer,
                                load_weights = load_weights,
                                save_tokenizer = save_tokenizer,
                                save_weights = save_weights,
                                optimizer = optimizer,
                                inference_only = inference_only,
                                return_model_and_vectorizer = model_and_vectorizer,
                                return_model_and_vectorizer_and_output = model_vectorizer_output,
                                GPT_attention = GPT_attention,
                                TPU = True)

        # With TPU = True and save = True, MinimalGPT returns
        # (save_weights, model, vectorizer, save_tokenizer, output_seq);
        # save outside the strategy scope, print the text, and exit.
        save_module(output[0], output[1], output[2], output[3])
        print(output[4])
        sys.exit(0)
|
    output = MinimalGPT(data_path = data_path,
                        learning_rate = learning_rate,
                        output_length = output_length,
                        epochs = epochs,
                        batch_size = batch_size,
                        gpt_input = gpt_input,
                        d_model = d_model,
                        h = h,
                        decoder_stacks = stacks,
                        starting_chunk = chunk_start,
                        ending_chunk = chunk_end,
                        chunk_size = chunk_size,
                        vocabulary_start = vocabulary_start,
                        vocabulary_end = vocabulary_end,
                        save = save,
                        load_tokenizer = load_tokenizer,
                        load_weights = load_weights,
                        save_tokenizer = save_tokenizer,
                        save_weights = save_weights,
                        optimizer = optimizer,
                        inference_only = inference_only,
                        return_model_and_vectorizer = model_and_vectorizer,
                        return_model_and_vectorizer_and_output = model_vectorizer_output,
                        GPT_attention = GPT_attention,
                        TPU = False)
    print(output)