In [None]:
import pyalex
import dotenv
import os
from pyalex import Works, Authors, Sources, Institutions, Concepts, Publishers, Funders
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from ampligraph.evaluation import train_test_split_no_unseen

# Pull variables defined in a local .env file into the process environment.
dotenv.load_dotenv()

# Hand the contact address to pyalex; os.getenv yields None when MY_EMAIL is unset.
contact_email = os.getenv("MY_EMAIL")
pyalex.config.email = contact_email

In [None]:
# Number of OpenAlex concepts returned for the free-text search "knowledge graph".
kg_concept_query = Concepts().search("knowledge graph")
knowledge_graphs = kg_concept_query.count()

In [None]:
import rdflib

# Base IRI under which every project-specific class and predicate is minted.
uri = "urn:acmcmc:unis:"
unis = rdflib.Namespace(uri)

# In-memory graph populated by the following cells. Registering the "unis"
# prefix lets serialisations and SPARQL queries on this graph use unis:... names.
g = rdflib.Graph()
g.bind(prefix="unis", namespace=unis)
# g.parse("universities_large_1200.ttl", format="turtle")

In [None]:
def store_graph(destination='universities_large.ttl', fmt='turtle'):
    """Serialize the notebook-level graph ``g`` to a file.

    Args:
        destination: Path of the file to write. Defaults to the file name the
            notebook has always used, so existing ``store_graph()`` calls
            behave exactly as before.
        fmt: rdflib serialization format name passed to ``Graph.serialize``.

    Returns:
        None.
    """
    g.serialize(destination=destination, format=fmt)

In [None]:
# Works tagged with at least one of the concepts below ("|" separates the
# alternatives in the filter value), filtered on authorships at US educational
# institutions, newest first.
#   C2987255567: Knowledge Graph
#   C204321447:  Natural Language Processing
#   C41008148:   Computer Science
articles = Works().filter(
    concepts={"id": "C2987255567|C204321447|C41008148"},
    authorships={"institutions": {"country_code": "US", "type": "education"}},
).sort(publication_date="desc")

# Ask for the total once and reuse it; every .count() call issues its own request.
n_articles = articles.count()
print(f"Found {n_articles} articles. Fetching...")

if n_articles > 1000:
    # Too many works to download here: fall back to a previously stored graph.
    print("Too many articles. Loading from file.")
    g.parse("universities_large_1200.ttl", format="turtle")
else:
    all_articles = []
    # Number of fetched articles tagged (score > 0.4) with each query concept.
    num_articles_concepts = {
        "https://openalex.org/C2987255567": 0,
        "https://openalex.org/C204321447": 0,
        "https://openalex.org/C41008148": 0,
    }
    # Go through all pages
    paginator = articles.paginate(per_page=200, n_max=1000000)
    for i, page in enumerate(paginator):
        print(f"Processing page {i}")
        # Checkpoint the graph to disk every 100 pages.
        if i > 0 and i % 100 == 0:
            store_graph()
        for article in page:
            all_articles.append(article)
            article_uri = rdflib.URIRef(article["id"])
            g.add((article_uri, rdflib.RDF.type, unis.Article))
            g.add((article_uri, unis.title, rdflib.Literal(article["title"])))

            # related_works and referenced_works are lists of work IDs.
            for related_to in article["related_works"]:
                g.add((article_uri, unis.related_to, rdflib.URIRef(related_to)))
            for reference in article["referenced_works"]:
                g.add((article_uri, unis.references, rdflib.URIRef(reference)))

            # Concepts is a list of dicts; only sufficiently confident tags are
            # kept. The filter is evaluated once per article and reused for the
            # institution links below.
            relevant_concepts = [c for c in article["concepts"] if c["score"] > 0.4]
            concept_uris = []
            for concept in relevant_concepts:
                concept_uri = rdflib.URIRef(concept["id"])
                concept_uris.append(concept_uri)
                g.add((concept_uri, rdflib.RDF.type, unis.Concept))
                g.add((article_uri, unis.related_to_concept, concept_uri))
                g.add((concept_uri, unis.name, rdflib.Literal(concept["display_name"])))
                # Count each article once per concept. Counting inside the
                # author/institution loops would count (author, institution)
                # pairs instead of articles.
                if concept["id"] in num_articles_concepts:
                    num_articles_concepts[concept["id"]] += 1

            # Authors is a list of dicts
            for author in article["authorships"]:
                author_uri = rdflib.URIRef(author["author"]["id"])
                g.add((author_uri, rdflib.RDF.type, unis.Author))
                g.add(
                    (
                        author_uri,
                        unis.name,
                        rdflib.Literal(author["author"]["display_name"]),
                    )
                )
                g.add((article_uri, unis.has_author, author_uri))
                for institution in author["institutions"]:
                    institution_uri = rdflib.URIRef(institution["id"])
                    g.add((institution_uri, rdflib.RDF.type, unis.Institution))
                    # The author is deliberately NOT linked to the institution:
                    # an author may be affiliated with different institutions
                    # at different times, so the link is made per article.
                    g.add((article_uri, unis.written_in_institution, institution_uri))
                    g.add(
                        (
                            institution_uri,
                            unis.country,
                            rdflib.Literal(institution["country_code"]),
                        )
                    )
                    g.add(
                        (
                            institution_uri,
                            unis.name,
                            rdflib.Literal(institution["display_name"]),
                        )
                    )
                    for parent_institution_id in institution["lineage"]:
                        # OpenAlex lineage lists the institution's own ID next
                        # to its parents; skip it so no "X is_part_of X"
                        # self-loop is stored.
                        if parent_institution_id == institution["id"]:
                            continue
                        parent_institution_uri = rdflib.URIRef(parent_institution_id)
                        g.add((parent_institution_uri, rdflib.RDF.type, unis.Institution))
                        g.add((institution_uri, unis.is_part_of, parent_institution_uri))
                    # Every institution on the article is related to each of
                    # the article's relevant concepts.
                    for concept_uri in concept_uris:
                        g.add(
                            (
                                institution_uri,
                                unis.institution_related_to_concept,
                                concept_uri,
                            )
                        )
    # print the numbers of articles per concept
    print(num_articles_concepts)

In [None]:
# Knowledge-extraction rule: if paper P was written in institution I and P is
# related to concept C, then I is related to C. The rule is materialised as
# explicit unis:institution_related_to_concept triples.
query_results = g.query(
    """
    SELECT DISTINCT ?institution ?concept
    WHERE {
        ?institution a unis:Institution .
        ?article a unis:Article .
        ?concept a unis:Concept .
        ?article unis:written_in_institution ?institution .
        ?article unis:related_to_concept ?concept .
    }
    """
)
# Pull every row out of the result before the graph is modified below.
inferred_pairs = list(query_results)
print(f"Found {len(inferred_pairs)} results for the rule.")
for i, (institution, concept) in enumerate(inferred_pairs):
    # Progress message every 1000 rows.
    if i % 1000 == 0:
        print(f"Processing rule {i}")
    g.add((institution, unis.institution_related_to_concept, concept))

In [None]:
# Works whose abstract matches the search phrase, grouped by the institution
# IDs of their authorships; the grouped result is shown as a table.
abstract_query = Works().search_filter(abstract="Large Language Model Knowledge Graph")
results = abstract_query.group_by("authorships.institutions.id")

# NOTE(review): on a grouped query .count() may report groups rather than
# articles, so the wording of this message should be confirmed.
print(f"Found {results.count()} articles. Fetching...")

df = pd.DataFrame(results.get())

display(df)

In [None]:
# Write the current contents of the graph to disk via store_graph().
store_graph()

In [None]:
#from rdflib.extras.external_graph_libs import rdflib_to_networkx_multidigraph
#import networkx as nx
#import matplotlib.pyplot as plt
#
#G = rdflib_to_networkx_multidigraph(g)
#
## Plot Networkx instance of RDF Graph
#pos = nx.spring_layout(G, scale=0.1)
#edge_labels = nx.get_edge_attributes(G, "r")
#nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels)
#nx.draw(G, with_labels=True)
#
## if not in interactive mode for matplotlib
#plt.show()

In [None]:

# Export the graph as an (n_triples, 3) array of strings, keeping only the
# triples whose predicate is one of the relations listed here.
predicates_to_export = [
    unis.related_to,
    unis.has_author,
    unis.written_in_institution,
    unis.related_to_concept,
    unis.institution_related_to_concept,
    unis.references,
    unis.is_part_of,
]
triples_generator = [
    triple
    for predicate in predicates_to_export
    for triple in g.triples((None, predicate, None))
]
# rdflib terms are converted to plain strings: (subject, predicate, object).
triples = np.array([(str(s), str(p), str(o)) for s, p, o in triples_generator])

# 80% train; the remaining 20% is then halved into validation and test
# (10% / 10% of the whole).
X_train, held_out = train_test_split_no_unseen(np.array(triples), test_size=0.2)
X_valid, X_test = train_test_split_no_unseen(
    held_out, test_size=0.5, allow_duplication=True
)

In [None]:
# Write each split to its own .npy file in the working directory.
for path, split in (
    ("train.npy", X_train),
    ("valid.npy", X_valid),
    ("test.npy", X_test),
):
    np.save(path, split)

In [None]:
# Read the three splits back from the .npy files written by the previous cell.
X_train, X_valid, X_test = (
    np.load(path) for path in ("train.npy", "valid.npy", "test.npy")
)

# Report how many triples ended up in each split.
print(f"Train size: {X_train.shape[0]}")
print(f"Valid size: {X_valid.shape[0]}")
print(f"Test size: {X_test.shape[0]}")

In [None]:
# The metric helpers are imported here because no other cell brings them into
# scope; without this the cell fails with NameError on a fresh kernel.
from ampligraph.evaluation import hits_at_n_score, mrr_score

# NOTE(review): `model` and `filter` are not created by any cell shown in this
# notebook. Without a prior definition, `filter` resolves to the Python
# builtin. A later cell indexes filter['test'], so it is presumably a dict of
# known triples — confirm and define both before this cell.

# Run the evaluation procedure on the test set (with filtering)
# To disable filtering: use_filter=None
# Usually, we corrupt subject and object sides separately and compute ranks
ranks = model.evaluate(X_test, use_filter=filter, corrupt_side="s,o")

# compute and print metrics:
mrr = mrr_score(ranks)
hits_10 = hits_at_n_score(ranks, n=10)
print("MRR: %f, Hits@10: %f" % (mrr, hits_10))

In [None]:
# ScoringBasedEmbeddingModel is imported here because no other cell brings it
# into scope; without this the cell fails with NameError on a fresh kernel.
from ampligraph.latent_features import ScoringBasedEmbeddingModel

# Store the model: the weights are written through the parent class's
# save_weights (bypassing any override on ScoringBasedEmbeddingModel itself),
# and the metadata is written separately into the same "model" directory.
# NOTE(review): `model` is not created by any cell shown in this notebook.
super(ScoringBasedEmbeddingModel, model).save_weights("model/")
model.save_metadata(filedir='model')
#from ampligraph.utils import save_model
#save_model(model, model_name_path='model.pkl')

In [None]:
# Collect (institution IRI, name, article count) for every institution that has
# at least one article, then look up the embedding of each institution.
#
# The IRIs of the original query text were lost (it read "?s a .",
# "?s ?name ." ...), which is not valid SPARQL. They are restored with the
# unis: prefix bound on the graph, as the other queries in this notebook do.
# NOTE(review): the original query had a fourth pattern, "?article ?related_to
# <IRI>", whose object IRI cannot be recovered. It is omitted, so articles are
# counted without that restriction — re-add it if the filter is needed.
institution_rows = g.query(
    """
    SELECT DISTINCT ?s ?name (COUNT (?article) AS ?num_articles)
    WHERE {
        ?s a unis:Institution .
        ?s unis:name ?name .
        ?article unis:written_in_institution ?s .
    }
    GROUP BY ?s ?name
    """
)
# A mixed (str, str, int) row becomes a row of strings in the NumPy array;
# column 2 therefore has to be cast back to int wherever it is used as a number.
all_ids_institutions = np.array(
    [
        (str(x), str(name), int(num_articles))
        for (x, name, num_articles) in institution_rows
    ]
)
print(all_ids_institutions.shape)
print(all_ids_institutions[0])
entity_embeddings = model.get_embeddings(entities=all_ids_institutions[:, 0])
display(entity_embeddings.shape)

In [None]:
# PCA
from sklearn.decomposition import PCA
# Project the institution embeddings onto their first two principal components
# so they can be drawn in 2-D. random_state is pinned so the projection is
# repeatable even if scikit-learn selects a randomized SVD solver, in line with
# the fixed seeds used for clustering and plotting in this notebook.
pca = PCA(n_components=2, random_state=0)
pca.fit(entity_embeddings)
entity_embeddings_pca = pca.transform(entity_embeddings)

In [None]:
from ampligraph.discovery import find_clusters
from sklearn.cluster import KMeans

# KMeans with a fixed seed; find_clusters applies it to the institutions in
# column 0 of all_ids_institutions.
clustering_algorithm = KMeans(
    n_clusters=6,
    n_init=50,
    max_iter=500,
    random_state=0,
)
institution_ids = all_ids_institutions[:, 0]
clusters = find_clusters(institution_ids, model, clustering_algorithm, mode="e")

In [None]:
# One row per institution: identifier, display name, 2-D PCA coordinates,
# cluster label and article count.
cluster_labels = "cluster" + pd.Series(clusters).astype(str)
article_counts = all_ids_institutions[:, 2].astype(int)  # stored as strings in the array

plot_df = pd.DataFrame(
    {
        "institution": all_ids_institutions[:, 0],
        "institution_name": all_ids_institutions[:, 1],
        "embedding1": entity_embeddings_pca[:, 0],
        "embedding2": entity_embeddings_pca[:, 1],
        "cluster": cluster_labels,
        "num_articles": article_counts,
    }
)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from adjustText import adjust_text


def plot_clusters(
    parameter,
    highlight=(
        "https://openalex.org/I161318765",
        "https://openalex.org/I1174212",
        "https://openalex.org/I95457486",
    ),
):
    """Scatter-plot the 2-D institution embeddings stored in ``plot_df``.

    Args:
        parameter: Name of the ``plot_df`` column used to colour the points
            (passed to seaborn as ``hue``); it also appears in the title.
        highlight: Institution IRIs whose points are annotated with the
            institution name. Defaults to the three institutions this
            notebook has always labelled.

    Returns:
        None. The figure is drawn on a new 12x12 matplotlib figure.
    """
    # Seed NumPy's global generator so repeated calls produce the same figure.
    np.random.seed(0)
    _, ax = plt.subplots(figsize=(12, 12))
    ax.set_title("{} embeddings".format(parameter).capitalize())
    sns.scatterplot(
        data=plot_df,
        x="embedding1",
        y="embedding2",
        hue=parameter,
        ax=ax,
    )

    # Select the highlighted rows in one vectorised filter instead of testing
    # every row of the frame inside a Python loop.
    highlighted = plot_df[plot_df["institution"].isin(list(highlight))]
    texts = [
        # Small offset so the label does not sit on top of its marker.
        ax.text(x + 0.02, y + 0.01, str(name))
        for x, y, name in zip(
            highlighted["embedding1"],
            highlighted["embedding2"],
            highlighted["institution_name"],
        )
    ]
    # The figure created above is the current one, so adjust_text acts on it.
    adjust_text(texts)

In [None]:
# Draw the institution scatter plot, colouring the points by the "num_articles" column.
plot_clusters("num_articles")

In [None]:
from ampligraph.discovery import discover_facts

# NOTE(review): `filter` and `model` are not created by any cell shown in this
# notebook. `filter` is indexed with 'test' here, so it is presumably a dict
# of triples defined elsewhere — confirm.
candidate_source = filter['test']

# Left as the cell's last expression so the notebook displays what is returned.
discover_facts(
    candidate_source,
    model,
    top_n=100,
    strategy="random_uniform",
    max_candidates=100,
    target_rel="urn:acmcmc:unis:related_to_concept",
    seed=0,
)

In [None]:
# Build a two-column table (IRI, name) of every named institution in the graph
# and write it to institutions.csv.
import pandas as pd

query_results = g.query(
    """
    SELECT DISTINCT ?institution ?name
    WHERE {
        ?institution a unis:Institution .
        ?institution unis:name ?name .
    }
    """
)
# rdflib terms are turned into plain strings while the rows are collected.
institutions = pd.DataFrame(
    [(str(institution), str(name)) for institution, name in query_results],
    columns=["institution", "name"],
)
# Store the dataframe
institutions.to_csv("institutions.csv", index=False)