|
import logging
from datetime import datetime

import tensorflow as tf
from datasets import load_dataset, concatenate_datasets
from pyprojroot.here import here
from transformers import TFXLMRobertaModel, AutoTokenizer, TFAutoModel
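
# Multilingual (English/Indonesian) sentence-embedding training script.
# Both the English and the Indonesian student embeddings are regressed (MSE)
# onto the precomputed `target_embedding` column of the dataset (presumably
# teacher embeddings, i.e. a knowledge-distillation setup), and bitext
# retrieval accuracy on a held-out split drives checkpointing and early stopping.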
|
|
|
class mean_pooling_layer(tf.keras.layers.Layer):
    """Mean-pools token embeddings, ignoring padded positions via the attention mask.

    Note: this layer is defined but not referenced elsewhere in this script;
    create_model() pools with the first ([CLS]) token instead.
    """

    def __init__(self):
        super(mean_pooling_layer, self).__init__()

    def call(self, inputs):
        token_embeddings = inputs[0]
        attention_mask = inputs[1]
        # Expand the mask to the embedding dimension so padded tokens are zeroed out.
        input_mask_expanded = tf.cast(
            tf.broadcast_to(tf.expand_dims(attention_mask, -1), tf.shape(token_embeddings)),
            tf.float32,
        )
        # Sum over the sequence axis and divide by the number of real tokens
        # (clipped to avoid division by zero).
        embeddings = tf.math.reduce_sum(token_embeddings * input_mask_expanded, axis=1) / tf.clip_by_value(
            tf.math.reduce_sum(input_mask_expanded, axis=1), 1e-9, tf.float32.max
        )
        return embeddings

    def get_config(self):
        config = super(mean_pooling_layer, self).get_config()
        return config
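
# create_model(): builds the student. One multilingual MiniLM encoder is applied
# to both the English and the Indonesian inputs (weights are shared), the first
# ([CLS]) token of the last hidden state is used as the sentence embedding for
# each language, and the model is warm-started from an earlier checkpoint.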
|
|
|
|
|
def create_model():
    base_student_model = TFAutoModel.from_pretrained(
        "nreimers/mMiniLMv2-L6-H384-distilled-from-XLMR-Large", from_pt=True
    )

    input_ids_en = tf.keras.layers.Input(shape=(256,), name='input_ids_en', dtype=tf.int32)
    attention_mask_en = tf.keras.layers.Input(shape=(256,), name='attention_mask_en', dtype=tf.int32)
    input_ids_id = tf.keras.layers.Input(shape=(256,), name='input_ids_id', dtype=tf.int32)
    attention_mask_id = tf.keras.layers.Input(shape=(256,), name='attention_mask_id', dtype=tf.int32)

    # Shared encoder, [CLS]-token pooling for both languages.
    output_en = base_student_model.roberta(input_ids_en, attention_mask=attention_mask_en).last_hidden_state[:, 0, :]
    output_id = base_student_model.roberta(input_ids_id, attention_mask=attention_mask_id).last_hidden_state[:, 0, :]

    student_model = tf.keras.Model(
        inputs=[input_ids_en, attention_mask_en, input_ids_id, attention_mask_id],
        outputs=[output_en, output_id],
    )
    # Warm-start from a previously trained checkpoint.
    student_model.load_weights("disk/model/2023-05-25_07-52-43/multiqa-Mmini-L6-H384.h5")
    return student_model
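
# Custom callback: after every epoch, embed the validation pairs and measure
# bitext-retrieval ("translation") accuracy in both directions via cosine
# similarity, then write the results into `logs` so that ModelCheckpoint,
# EarlyStopping and CSVLogger can monitor them.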
|
|
|
class sentence_translation_metric(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        # val_dataset is the module-level validation pipeline built in __main__.
        embeddings_en, embeddings_id = self.model.predict(val_dataset, verbose=1)

        embeddings_en = tf.math.l2_normalize(embeddings_en, axis=1)
        embeddings_id = tf.math.l2_normalize(embeddings_id, axis=1)
        similarity_matrix = tf.matmul(embeddings_en, embeddings_id, transpose_b=True)

        # English -> Indonesian retrieval: the correct translation should be the
        # nearest neighbour, i.e. the maximum should sit on the diagonal.
        correct_en_id = 0
        for i in range(similarity_matrix.shape[0]):
            if tf.math.argmax(similarity_matrix[i]) == i:
                correct_en_id += 1

        # Indonesian -> English retrieval.
        similarity_matrix_T = tf.transpose(similarity_matrix)
        correct_id_en = 0
        for i in range(similarity_matrix_T.shape[0]):
            if tf.math.argmax(similarity_matrix_T[i]) == i:
                correct_id_en += 1

        acc_en_id = correct_en_id / similarity_matrix.shape[0]
        acc_id_en = correct_id_en / similarity_matrix_T.shape[0]
        avg_acc = (acc_en_id + acc_id_en) / 2
        print(f"translation accuracy from English to Indonesian = {acc_en_id}")
        print(f"translation accuracy from Indonesian to English = {acc_id_en}")
        print(f"average translation accuracy = {avg_acc}")

        logs["val_acc_en_id"] = acc_en_id
        logs["val_acc_id_en"] = acc_id_en
        logs["val_avg_acc"] = avg_acc
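
# Learning-rate schedule: despite the name, this warms the rate up linearly from
# 0 to max_lr over the first warmup_steps steps and then holds it constant.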
|
|
|
|
|
class ConstantScheduler(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, max_lr, warmup_steps=5000):
        super().__init__()
        self.max_lr = tf.cast(max_lr, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        step = tf.cast(step, tf.float32)
        condition = tf.cond(step < self.warmup_steps, lambda: step / self.warmup_steps, lambda: 1.0)
        return self.max_lr * condition

|
if __name__ == "__main__":
    dataset = load_dataset("carlesoctav/en-id-parallel-sentences-embedding")

    # Merge every split of the dataset (e.g. "combinedtech") into a single
    # dataset, concatenating each split exactly once.
    dataset_1 = concatenate_datasets([dataset[split] for split in dataset])

    batch_size = 512
    dataset = dataset_1.train_test_split(test_size=0.005, shuffle=True)
    train_dataset = dataset["train"]
    val_dataset = dataset["test"]
    print(val_dataset.shape)
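
    # Convert the HuggingFace datasets to tf.data pipelines: the four tokenized
    # columns become the model inputs, and `target_embedding` (presumably the
    # teacher's embedding of each sentence pair) is the regression label.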
|
|
|
    train_dataset = train_dataset.to_tf_dataset(
        columns=["input_ids_en", "attention_mask_en", "input_ids_id", "attention_mask_id"],
        label_cols="target_embedding",
        batch_size=batch_size,
    ).unbatch()

    val_dataset = val_dataset.to_tf_dataset(
        columns=["input_ids_en", "attention_mask_en", "input_ids_id", "attention_mask_id"],
        label_cols="target_embedding",
        batch_size=batch_size,
    ).unbatch()

    print(train_dataset.element_spec)
    print(val_dataset.element_spec)

    # Re-batch with drop_remainder=True so every batch has a static shape
    # (required on TPU), and cache the pipelines in memory.
    train_dataset = train_dataset.batch(batch_size, drop_remainder=True).cache()
    val_dataset = val_dataset.batch(batch_size, drop_remainder=True).cache()
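
    # Optimizer: linear warm-up to 1e-3 over the first 10k steps, then a
    # constant learning rate; Adam with beta_2=0.98 and epsilon=1e-9.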
|
|
|
|
|
    learning_rate = ConstantScheduler(1e-3, warmup_steps=10000)

    optimizer = tf.keras.optimizers.Adam(
        learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9
    )

    loss = tf.keras.losses.MeanSquaredError()

    date_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    output_path = here(f"disk/model/{date_time}/multiqa-Mmini-L6-H384.h5")
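
    # Model selection: both callbacks monitor `val_avg_acc`, which
    # sentence_translation_metric writes into `logs` each epoch. Weights are
    # only checkpointed once the average retrieval accuracy exceeds 0.5.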
|
|
|
    model_checkpoint = tf.keras.callbacks.ModelCheckpoint(
        filepath=output_path,
        save_weights_only=True,
        monitor="val_avg_acc",
        mode="max",  # the monitored quantity is an accuracy, so higher is better
        verbose=1,
        save_best_only=True,
        initial_value_threshold=0.5,
    )

    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor="val_avg_acc",
        mode="max",
        restore_best_weights=False,
        patience=4,
        verbose=1,
        start_from_epoch=5,
    )
|
    csv_logger = tf.keras.callbacks.CSVLogger(
        filename=here(f"disk/performance_logs/log-{date_time}.csv"),
        separator=",",
        append=False,
    )

    # sentence_translation_metric must come first so that val_avg_acc is
    # already in `logs` when the checkpoint, CSV logger and early-stopping
    # callbacks run.
    callbacks = [sentence_translation_metric(), model_checkpoint, csv_logger, early_stopping]
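
    # Connect to the local TPU and build/compile the model inside the
    # TPUStrategy scope so its variables are created on the TPU replicas.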
|
|
|
|
|
    cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver("local")
    tf.config.experimental_connect_to_cluster(cluster_resolver)
    tf.tpu.experimental.initialize_tpu_system(cluster_resolver)
    strategy = tf.distribute.TPUStrategy(cluster_resolver)

    with strategy.scope():
        student_model = create_model()
        student_model.compile(optimizer=optimizer, loss=loss)

    student_model.fit(train_dataset, epochs=20, validation_data=val_dataset, callbacks=callbacks)

    # Also keep the weights from the final epoch, regardless of whether they
    # were the best-scoring ones.
    last_epoch_save = here(f"disk/model/last_epoch/{date_time}.h5")
    student_model.save_weights(last_epoch_save)
|
|
|
|
|
|
|
|