# %%
# Imports and logging configuration
import logging
import os

import dotenv
import numpy as np
import pandas as pd
import pyalex
import rdflib
import tensorflow as tf
from ampligraph.datasets import (
    GraphDataLoader,
    SQLiteAdapter,
    DataSourceIdentifier,
)
from ampligraph.datasets.graph_partitioner import (
    NaiveGraphPartitioner,
    BucketGraphPartitioner,
)
from ampligraph.evaluation import (
    hits_at_n_score,
    mrr_score,
    train_test_split_no_unseen,
)
from ampligraph.latent_features import ScoringBasedEmbeddingModel
from ampligraph.latent_features.loss_functions import get as get_loss
from ampligraph.latent_features.regularizers import get as get_regularizer
from pyalex import Authors, Concepts, Funders, Institutions, Publishers, Sources, Works
from sklearn.model_selection import train_test_split
# Root logger at DEBUG, but cap all existing library loggers at INFO to keep the
# output readable
logging.basicConfig(level=logging.DEBUG)
loggers = [logging.getLogger(name) for name in logging.root.manager.loggerDict]
for logger in loggers:
    logger.setLevel(logging.INFO)
# Load the triples from the .npy files
X_train = np.load("train.npy")
X_valid = np.load("valid.npy")
X_test = np.load("test.npy")
## Store as CSVs. Some institution names contain commas, so use a tab delimiter
# np.savetxt("train.csv", X_train, delimiter="\t", fmt="%s")
# np.savetxt("valid.csv", X_valid, delimiter="\t", fmt="%s")
# np.savetxt("test.csv", X_test, delimiter="\t", fmt="%s")
#
# print(f"Train size: {X_train.shape[0]}")
# print(f"Valid size: {X_valid.shape[0]}")
# print(f"Test size: {X_test.shape[0]}")
# Initialize a ComplEx neural embedding model: the embedding size is k,
# eta specifies the number of corruptions to generate for each positive,
# and scoring_type determines the scoring function of the embedding model.
partitioned_model = ScoringBasedEmbeddingModel(k=150, eta=10, scoring_type="ComplEx")
# Optimizer, loss and regularizer definition
optim = tf.keras.optimizers.Adam(learning_rate=1e-3)
loss = get_loss("pairwise", {"margin": 0.5})
regularizer = get_regularizer("LP", {"p": 2, "lambda": 1e-5})
# Compile the model
partitioned_model.compile(
    optimizer=optim, loss=loss, entity_relation_regularizer=regularizer
)
# For evaluation, we can use a filter to remove positive statements generated by
# the corruption procedure. Here we build the filter set by concatenating all the
# positives. (Renamed from `filter` to avoid shadowing the Python builtin.)
filter_dict = {"test": np.concatenate((X_train, X_valid, X_test))}
# Early stopping callback
checkpoint = tf.keras.callbacks.EarlyStopping(
    monitor="val_hits10",
    min_delta=0,
    patience=5,
    verbose=1,
    mode="max",
    restore_best_weights=True,
)
######
use_db = False
if use_db:
    AMPLIGRAPH_DATA_HOME = os.path.join(os.getcwd(), "data")  # + os.sep
    from ampligraph.datasets.data_indexer import SQLite as SQLiteIndexer, DataIndexer

    # Initialize a GraphDataLoader from a .csv file, backed by SQLite on disk
    sqlite_indexer = SQLiteIndexer(
        data=None,
        db_file="main_partition.db",
        root_directory=AMPLIGRAPH_DATA_HOME,
        name="main_partition",
    )
    indexer = DataIndexer(
        X=None,
        backend_type="sqlite",
        backend=sqlite_indexer,
    )
    dataset_loader = GraphDataLoader(
        "train.csv",
        backend=SQLiteAdapter,
        in_memory=False,
        verbose=True,
        root_directory=AMPLIGRAPH_DATA_HOME,
        db_name="mydb.db",
        use_indexer=indexer,
    )
    # adapter = SQLiteAdapter(
    #     "database_25-12-2023_07-28-41_485047_PM_2a11fc49-2337-415e-8672-2bfa48a83745.db",
    #     identifier=DataSourceIdentifier,
    #     root_directory=AMPLIGRAPH_DATA_HOME,
    # )
    print("Graph data loader initialized")
    # for elem in next(dataset_loader._get_batch_generator()):
    #     print(elem)
    #     break
######
else:
    # In-memory loader over the NumPy triples
    X_train = np.load("train.npy")
    dataset_loader = GraphDataLoader(
        X_train,
        verbose=True,
        use_indexer=True,
        in_memory=True,
    )
    # Peek at the first few batches to check the loader works
    print(f"next: {next(dataset_loader)}")
    print(f"next: {next(dataset_loader)}")
    print(f"next: {next(dataset_loader)}")
# x = np.loadtxt(
#     "train.csv",
#     delimiter="\t",
#     dtype=str,
# )
# print(x[0])
# Choose the partitioner - here we use the naive edge-based partitioner
partition = False
if partition:
    print("Will start partitioning now")
    graph_partitioner_train = NaiveGraphPartitioner(dataset_loader, k=6)
    print("Graph partitioner initialized")
    # indexer = (
    #     partitioned_model.data_handler.get_mapper()
    # )  # get the mapper from the trained model
    # dataset_loader_test = GraphDataLoader(
    #     data_source=X_test,
    #     backend=SQLiteAdapter,  # type of backend to use
    #     batch_size=400,  # batch size to use while iterating over this dataset
    #     dataset_type="test",  # dataset type
    #     use_indexer=indexer,  # mapper to map test concepts to the same indices used during training
    #     verbose=True,
    # )
    # graph_partitioner_test = BucketGraphPartitioner(data=partitioner, k=3)
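# Hedged alternative (assumption, mirroring the commented-out partitioning_k
# argument in the fit() call below): AmpliGraph can also partition internally
# when fit() receives a GraphDataLoader, e.g.:
# partitioned_model.fit(dataset_loader, batch_size=500, epochs=45, partitioning_k=6)
# Partitioned training expects the SQLite-backed loader (use_db = True above),
# since partitions are materialized on disk rather than held in memory.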
print("Will start training now") | |
# Fit the model on training and validation set | |
partitioned_model.fit( | |
#graph_partitioner_train if partition else dataset_loader, | |
X_train, | |
batch_size=500, | |
epochs=45, # Number of training epochs | |
validation_freq=20, # Epochs between successive validation | |
validation_burn_in=100, # Epoch to start validation | |
validation_data=X_test, # Validation data | |
validation_filter=filter, # Filter positives from validation corruptions | |
callbacks=[ | |
checkpoint | |
], # Early stopping callback (more from tf.keras.callbacks are supported) | |
verbose=True, # Enable stdout messages | |
#partitioning_k=7, # Number of partitions to create | |
) | |
# %%
# Store the model. save_weights() is invoked on the Keras parent class to bypass
# AmpliGraph's override; save_metadata() persists the indexer alongside the weights.
super(ScoringBasedEmbeddingModel, partitioned_model).save_weights("model/")
partitioned_model.save_metadata(filedir="model")
# from ampligraph.utils import save_model
# save_model(partitioned_model, model_name_path='model.pkl')
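# Hedged sketch: if the pickle route in the commented-out lines above is used,
# the model can later be restored with the matching AmpliGraph utility:
# from ampligraph.utils import restore_model
# restored_model = restore_model(model_name_path="model.pkl")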
# %%
# Build a dataframe of institutions and their names from the Turtle graph
g = rdflib.Graph()
uri = "urn:acmcmc:unis:"
unis = rdflib.Namespace(uri)
g.bind("unis", unis)
g.parse("universities_large_1200.ttl", format="turtle")
query_results = g.query(
    """
    SELECT DISTINCT ?institution ?name
    WHERE {
        ?institution a unis:Institution .
        ?institution unis:name ?name .
    }
    """
)
institutions = pd.DataFrame(query_results, columns=["institution", "name"])
institutions["institution"] = institutions["institution"].apply(str)
institutions["name"] = institutions["name"].apply(str)
# Store the dataframe
institutions.to_csv("institutions.csv", index=False)
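# Hedged sketch (assumption: the institution URIs were the entity labels seen
# during training, so they are present in the model's indexer): look up learned
# embeddings for the institutions and compare a pair by cosine similarity.
inst_embeddings = partitioned_model.get_embeddings(
    np.array(institutions["institution"].values, dtype=str), embedding_type="e"
)

def cosine_similarity(u, v):
    # Cosine similarity between two embedding vectors
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

print(
    f"Similarity of first two institutions: "
    f"{cosine_similarity(inst_embeddings[0], inst_embeddings[1]):.3f}"
)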
# %%
# Run the evaluation procedure on the test set (with filtering).
# To disable filtering: use_filter=None.
# Usually, we corrupt the subject and object sides separately and compute ranks.
ranks = partitioned_model.evaluate(X_test, use_filter=filter_dict, corrupt_side="s,o")
# Compute and print metrics
mrr = mrr_score(ranks)
hits_10 = hits_at_n_score(ranks, n=10)
print("MRR: %f, Hits@10: %f" % (mrr, hits_10))