# %%
# Set logging level to DEBUG
import logging
import os

import dotenv
import numpy as np
import pandas as pd
import pyalex
import rdflib
import tensorflow as tf
from ampligraph.datasets import (
    GraphDataLoader,
    SQLiteAdapter,
    DataSourceIdentifier,
)
from ampligraph.datasets.graph_partitioner import NaiveGraphPartitioner, BucketGraphPartitioner
from ampligraph.evaluation import hits_at_n_score, mrr_score, train_test_split_no_unseen
from ampligraph.latent_features import ScoringBasedEmbeddingModel
from ampligraph.latent_features.loss_functions import get as get_loss
from ampligraph.latent_features.regularizers import get as get_regularizer
from pyalex import Authors, Concepts, Funders, Institutions, Publishers, Sources, Works
from sklearn.model_selection import train_test_split
logging.basicConfig(level=logging.DEBUG)
loggers = [logging.getLogger(name) for name in logging.root.manager.loggerDict]
for logger in loggers:
    logger.setLevel(logging.INFO)
# Load the triples from the file
X_train = np.load("train.npy")
X_valid = np.load("valid.npy")
X_test = np.load("test.npy")
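# Quick sanity check (assumes each .npy file stores an (N, 3) array of string
# triples in subject/predicate/object order, as AmpliGraph's loaders expect)
for name, split in (("train", X_train), ("valid", X_valid), ("test", X_test)):
    assert split.ndim == 2 and split.shape[1] == 3, f"{name} split is not an (N, 3) triple array"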
## Store as CSVs. There are commas in the names of some institutions, so we need to use a tab as the delimiter
#np.savetxt("train.csv", X_train, delimiter="\t", fmt="%s")
#np.savetxt("valid.csv", X_valid, delimiter="\t", fmt="%s")
#np.savetxt("test.csv", X_test, delimiter="\t", fmt="%s")
#
#print(f"Train size: {X_train.shape[0]}")
#print(f"Valid size: {X_valid.shape[0]}")
#print(f"Test size: {X_test.shape[0]}")
# Initialize a ComplEx neural embedding model: the embedding size is k,
# eta specifies the number of corruptions to generate per each positive,
# scoring_type determines the scoring function of the embedding model.
partitioned_model = ScoringBasedEmbeddingModel(k=150, eta=10, scoring_type="ComplEx")
# Optimizer, loss and regularizer definition
optim = tf.keras.optimizers.Adam(learning_rate=1e-3)
loss = get_loss("pairwise", {"margin": 0.5})
regularizer = get_regularizer("LP", {"p": 2, "lambda": 1e-5})
# Compilation of the model
partitioned_model.compile(
    optimizer=optim, loss=loss, entity_relation_regularizer=regularizer
)
# For evaluation, we can use a filter to exclude known positive statements
# from the corruptions generated by the corruption procedure.
# Here we define the filter set by concatenating all the positives.
filter = {"test": np.concatenate((X_train, X_valid, X_test))}
# Early Stopping callback
checkpoint = tf.keras.callbacks.EarlyStopping(
    monitor="val_{}".format("hits10"),
    min_delta=0,
    patience=5,
    verbose=1,
    mode="max",
    restore_best_weights=True,
)
######
use_db = False
if use_db:
    AMPLIGRAPH_DATA_HOME = os.path.join(os.getcwd(), "data")  # + os.sep
    from ampligraph.datasets.data_indexer import SQLite as SQLiteIndexer, DataIndexer
    # Initialize GraphDataLoader from .csv file
    sqlite_indexer = SQLiteIndexer(
        data=None,
        db_file="main_partition.db",
        root_directory=AMPLIGRAPH_DATA_HOME,
        name="main_partition",
    )
    indexer = DataIndexer(
        X=None,
        backend_type='sqlite',
        backend=sqlite_indexer,
    )
    dataset_loader = GraphDataLoader(
        "train.csv",
        backend=SQLiteAdapter,
        in_memory=False,
        verbose=True,
        root_directory=AMPLIGRAPH_DATA_HOME,
        db_name="mydb.db",
        use_indexer=indexer,
    )
    # adapter = SQLiteAdapter(
    #     "database_25-12-2023_07-28-41_485047_PM_2a11fc49-2337-415e-8672-2bfa48a83745.db",
    #     identifier=DataSourceIdentifier,
    #     root_directory=AMPLIGRAPH_DATA_HOME,
    # )
    print("Graph data loader initialized")
    # for elem in next(dataset_loader._get_batch_generator()):
    #     print(elem)
    #     break
######
else:
    X_train = np.load("train.npy")
    dataset_loader = GraphDataLoader(
        X_train,
        verbose=True,
        use_indexer=True,
        in_memory=True,
    )
    print(f'next: {next(dataset_loader)}')
    print(f'next: {next(dataset_loader)}')
    print(f'next: {next(dataset_loader)}')
#x = np.loadtxt(
# "train.csv",
# delimiter="\t",
# dtype=str,
#)
#print(x[0])
# Choose the partitioner - in this case we use the NaiveGraphPartitioner
partition = False
if partition:
    print("Will start partitioning now")
    graph_partitioner_train = NaiveGraphPartitioner(dataset_loader, k=6)
    print("Graph partitioner initialized")
    # indexer = (
    #     partitioned_model.data_handler.get_mapper()
    # )  # get the mapper from the trained model
    # dataset_loader_test = GraphDataLoader(
    #     data_source=X_test,
    #     backend=SQLiteAdapter,  # type of backend to use
    #     batch_size=400,  # batch size to use while iterating over this dataset
    #     dataset_type="test",  # dataset type
    #     use_indexer=indexer,  # mapper to map test concepts to the same indices used during training
    #     verbose=True,
    # )
    # graph_partitioner_test = BucketGraphPartitioner(data=partitioner, k=3)
print("Will start training now")
# Fit the model on the training set (the test split is passed as validation data)
# NOTE: validation only starts after validation_burn_in epochs, so with
# epochs=45 and validation_burn_in=100 no validation (and hence no early
# stopping) is triggered during this run.
partitioned_model.fit(
    # graph_partitioner_train if partition else dataset_loader,
    X_train,
    batch_size=500,
    epochs=45,  # Number of training epochs
    validation_freq=20,  # Epochs between successive validation
    validation_burn_in=100,  # Epoch to start validation
    validation_data=X_test,  # Validation data
    validation_filter=filter,  # Filter positives from validation corruptions
    callbacks=[
        checkpoint
    ],  # Early stopping callback (more from tf.keras.callbacks are supported)
    verbose=True,  # Enable stdout messages
    # partitioning_k=7,  # Number of partitions to create
)
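# A minimal sketch of the partitioned-training variant hinted at by the
# commented-out arguments above: fit() also accepts a graph partitioner built
# on a GraphDataLoader, in which case training iterates over the partitions
# instead of the in-memory array. Assumes graph_partitioner_train from the
# partition=True branch; left commented so the fit above stays the single run.
# partitioned_model.fit(
#     graph_partitioner_train,
#     batch_size=500,
#     epochs=45,
#     callbacks=[checkpoint],
#     verbose=True,
# )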
# %%
# Store the model: save the Keras weights via the parent class, and store the
# AmpliGraph metadata separately via save_metadata
super(ScoringBasedEmbeddingModel, partitioned_model).save_weights("model/")
partitioned_model.save_metadata(filedir="model")
# from ampligraph.utils import save_model
# save_model(partitioned_model, model_name_path='model.pkl')
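# Minimal sketch for reloading the model in a later session, assuming it was
# pickled with ampligraph.utils.save_model as in the commented lines above
# (rather than via save_weights/save_metadata):
# from ampligraph.utils import restore_model
# restored_model = restore_model(model_name_path='model.pkl')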
# %%
# Create a dataframe of the institutions and their names
import pandas as pd
import rdflib
g = rdflib.Graph()
uri = "urn:acmcmc:unis:"
unis = rdflib.Namespace(uri)
g.bind("unis", unis)
g.parse("universities_large_1200.ttl", format="turtle")
query_results = g.query(
    """
    SELECT DISTINCT ?institution ?name
    WHERE {
        ?institution a unis:Institution .
        ?institution unis:name ?name .
    }
    """
)
institutions = pd.DataFrame(query_results, columns=["institution", "name"])
institutions["institution"] = institutions["institution"].apply(lambda x: str(x))
institutions["name"] = institutions["name"].apply(lambda x: str(x))
# Store the dataframe
institutions.to_csv("institutions.csv", index=False)
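# Sketch: look up the learned entity embeddings for these institutions,
# assuming the institution URIs in the dataframe are the same strings used as
# entity labels in the training triples (entities unseen in training have no
# embedding, so we first restrict to those that appear in X_train).
seen_entities = set(X_train[:, 0]) | set(X_train[:, 2])
seen_institutions = institutions[institutions["institution"].isin(seen_entities)]
institution_embeddings = partitioned_model.get_embeddings(
    seen_institutions["institution"].values, embedding_type="e"
)
print(f"Institution embeddings: {institution_embeddings.shape}")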
# %%
# Run the evaluation procedure on the test set (with filtering)
# To disable filtering: use_filter=None
# Usually, we corrupt subject and object sides separately and compute ranks
ranks = partitioned_model.evaluate(X_test, use_filter=filter, corrupt_side="s,o")
# compute and print metrics:
mrr = mrr_score(ranks)
hits_10 = hits_at_n_score(ranks, n=10)
print("MRR: %f, Hits@10: %f" % (mrr, hits_10))