import tensorflow as tf
from transformers import TFXLMRobertaModel, AutoTokenizer, TFAutoModel
from datasets import load_dataset, concatenate_datasets
from datetime import datetime
import logging
from pyprojroot.here import here
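
# Trains a bilingual (English-Indonesian) sentence-embedding student model: both sides of a
# parallel corpus are encoded by a shared mMiniLMv2 encoder and trained to match the
# precomputed teacher vectors in the dataset's "target_embedding" column (knowledge distillation).
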
class mean_pooling_layer(tf.keras.layers.Layer):
    def __init__(self):
        super(mean_pooling_layer, self).__init__()

    def call(self, inputs):
        token_embeddings = inputs[0]
        attention_mask = inputs[1]
        input_mask_expanded = tf.cast(
            tf.broadcast_to(tf.expand_dims(attention_mask, -1), tf.shape(token_embeddings)),
            tf.float32,
        )
        embeddings = tf.math.reduce_sum(token_embeddings * input_mask_expanded, axis=1) / tf.clip_by_value(
            tf.math.reduce_sum(input_mask_expanded, axis=1), 1e-9, tf.float32.max
        )
        return embeddings

    def get_config(self):
        config = super(mean_pooling_layer, self).get_config()
        return config
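
# Note: mean_pooling_layer is defined but not used below; create_model() pools with the CLS
# token (last_hidden_state[:, 0, :]) instead. A sketch of wiring it in, using hypothetical
# intermediate names, could look like:
#     hidden_en = base_student_model.roberta(input_ids_en, attention_mask=attention_mask_en).last_hidden_state
#     output_en = mean_pooling_layer()([hidden_en, attention_mask_en])
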
def create_model():
    base_student_model = TFAutoModel.from_pretrained(
        "nreimers/mMiniLMv2-L6-H384-distilled-from-XLMR-Large", from_pt=True
    )
    input_ids_en = tf.keras.layers.Input(shape=(256,), name="input_ids_en", dtype=tf.int32)
    attention_mask_en = tf.keras.layers.Input(shape=(256,), name="attention_mask_en", dtype=tf.int32)
    input_ids_id = tf.keras.layers.Input(shape=(256,), name="input_ids_id", dtype=tf.int32)
    attention_mask_id = tf.keras.layers.Input(shape=(256,), name="attention_mask_id", dtype=tf.int32)
    output_en = base_student_model.roberta(input_ids_en, attention_mask=attention_mask_en).last_hidden_state[:, 0, :]
    output_id = base_student_model.roberta(input_ids_id, attention_mask=attention_mask_id).last_hidden_state[:, 0, :]
    student_model = tf.keras.Model(
        inputs=[input_ids_en, attention_mask_en, input_ids_id, attention_mask_id],
        outputs=[output_en, output_id],
    )
    student_model.load_weights("disk/model/2023-05-25_07-52-43/multiqa-Mmini-L6-H384.h5")
    return student_model
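
# Both towers above share the same base transformer, so English and Indonesian sentences are
# encoded by identical weights; each tower emits a 384-dimensional CLS embedding that the MSE
# loss below is meant to pull toward the teacher's "target_embedding".
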
class sentence_translation_metric(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        # embed both sides of the validation pairs, then score bidirectional retrieval
        embeddings_en, embeddings_id = self.model.predict(val_dataset, verbose=1)
        # normalize the embeddings so the dot product equals cosine similarity
        embeddings_en = tf.math.l2_normalize(embeddings_en, axis=1)
        embeddings_id = tf.math.l2_normalize(embeddings_id, axis=1)
        similarity_matrix = tf.matmul(embeddings_en, embeddings_id, transpose_b=True)
        # count how often the most similar candidate is the true translation (en -> id)
        correct_en_id = 0
        for i in range(similarity_matrix.shape[0]):
            if tf.math.argmax(similarity_matrix[i]) == i:
                correct_en_id += 1
        # and in the other direction (id -> en)
        similarity_matrix_T = tf.transpose(similarity_matrix)
        correct_id_en = 0
        for i in range(similarity_matrix_T.shape[0]):
            if tf.math.argmax(similarity_matrix_T[i]) == i:
                correct_id_en += 1
        acc_en_id = correct_en_id / similarity_matrix.shape[0]
        acc_id_en = correct_id_en / similarity_matrix_T.shape[0]
        avg_acc = (acc_en_id + acc_id_en) / 2
        print(f"translation accuracy from English to Indonesian = {acc_en_id}")
        print(f"translation accuracy from Indonesian to English = {acc_id_en}")
        print(f"average translation accuracy = {avg_acc}")
        logs["val_acc_en_id"] = acc_en_id
        logs["val_acc_id_en"] = acc_id_en
        logs["val_avg_acc"] = avg_acc
class ConstantScheduler(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, max_lr, warmup_steps=5000):
        super().__init__()
        self.max_lr = tf.cast(max_lr, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        step = tf.cast(step, tf.float32)
        condition = tf.cond(step < self.warmup_steps, lambda: step / self.warmup_steps, lambda: 1.0)
        return self.max_lr * condition
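
# i.e. lr(step) = max_lr * min(step / warmup_steps, 1): linear warm-up, then a constant rate.
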
if __name__ == "__main__":
    dataset = load_dataset("carlesoctav/en-id-parallel-sentences-embedding")
    # merge every split of the parallel corpus into a single dataset
    dataset_1 = concatenate_datasets([dataset[split] for split in dataset])

    batch_size = 512
    dataset = dataset_1.train_test_split(test_size=0.005, shuffle=True)
    train_dataset = dataset["train"]
    val_dataset = dataset["test"]
    print(val_dataset.shape)
    train_dataset = train_dataset.to_tf_dataset(
        columns=["input_ids_en", "attention_mask_en", "input_ids_id", "attention_mask_id"],
        label_cols="target_embedding",
        batch_size=batch_size,
    ).unbatch()
    val_dataset = val_dataset.to_tf_dataset(
        columns=["input_ids_en", "attention_mask_en", "input_ids_id", "attention_mask_id"],
        label_cols="target_embedding",
        batch_size=batch_size,
    ).unbatch()

    # check the features
    print(train_dataset.element_spec)
    print(val_dataset.element_spec)

    train_dataset = train_dataset.batch(batch_size, drop_remainder=True).cache()
    val_dataset = val_dataset.batch(batch_size, drop_remainder=True).cache()
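    # drop_remainder=True keeps every batch the same size (TPUs need static shapes), and
    # cache() keeps the prepared batches in memory after the first epoch.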
    learning_rate = ConstantScheduler(1e-3, warmup_steps=10000)
    optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
    loss = tf.keras.losses.MeanSquaredError()

    date_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    output_path = here(f"disk/model/{date_time}/multiqa-Mmini-L6-H384.h5")

    model_checkpoint = tf.keras.callbacks.ModelCheckpoint(
        filepath=output_path,
        save_weights_only=True,
        monitor="val_avg_acc",
        mode="auto",
        verbose=1,
        save_best_only=True,
        initial_value_threshold=0.5,
    )
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor="val_avg_acc",
        mode="auto",
        restore_best_weights=False,
        patience=4,
        verbose=1,
        start_from_epoch=5,
    )
    # tensor_board = tf.keras.callbacks.TensorBoard(
    #     log_dir="gs://dicoding-capstone/output/logs/" + date_time
    # )
    csv_logger = tf.keras.callbacks.CSVLogger(
        filename=here(f"disk/performance_logs/log-{date_time}.csv"),
        separator=",",
        append=False,
    )
    # sentence_translation_metric must come first so that "val_avg_acc" is already in the
    # logs when ModelCheckpoint, CSVLogger, and EarlyStopping read them
    callbacks = [sentence_translation_metric(), model_checkpoint, csv_logger, early_stopping]
    # connect to the local TPU and distribute training across its cores
    cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver("local")
    tf.config.experimental_connect_to_cluster(cluster_resolver)
    tf.tpu.experimental.initialize_tpu_system(cluster_resolver)
    strategy = tf.distribute.TPUStrategy(cluster_resolver)
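    # Note: this assumes a local TPU VM. On a GPU/CPU machine, a fallback sketch would be to
    # skip the TPU setup above and use, e.g.:
    #     strategy = tf.distribute.MirroredStrategy()  # or tf.distribute.get_strategy()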
    with strategy.scope():
        student_model = create_model()
        student_model.compile(optimizer=optimizer, loss=loss)

    student_model.fit(train_dataset, epochs=20, validation_data=val_dataset, callbacks=callbacks)

    # also keep the weights from the final epoch, alongside the best checkpoint saved above
    last_epoch_save = here(f"disk/model/last_epoch/{date_time}.h5")
    student_model.save_weights(last_epoch_save)