{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# University knowledge-graph embeddings\n",
    "\n",
    "This notebook builds an RDF graph of articles, authors, institutions and concepts from OpenAlex, trains a knowledge-graph embedding model on its triples with AmpliGraph, and then projects, clusters and plots the institution embeddings.\n",
    "\n",
    "## Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "import dotenv\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import pyalex\n",
    "import rdflib\n",
    "import seaborn as sns\n",
    "from adjustText import adjust_text\n",
    "from ampligraph.discovery import discover_facts, find_clusters\n",
    "from ampligraph.evaluation import (\n",
    "    hits_at_n_score,\n",
    "    mrr_score,\n",
    "    train_test_split_no_unseen,\n",
    ")\n",
    "from ampligraph.latent_features import ScoringBasedEmbeddingModel\n",
    "from pyalex import Works, Authors, Sources, Institutions, Concepts, Publishers, Funders\n",
    "from sklearn.cluster import KMeans\n",
    "from sklearn.decomposition import PCA\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "dotenv.load_dotenv()\n",
    "\n",
    "pyalex.config.email = os.getenv(\"MY_EMAIL\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Everything a reader might want to tune lives in this cell.\n",
    "\n",
    "# Files\n",
    "GRAPH_SNAPSHOT_PATH = \"universities_large_1200.ttl\"  # pre-built graph, parsed when the live query is too large\n",
    "GRAPH_OUTPUT_PATH = \"universities_large.ttl\"  # written by store_graph()\n",
    "\n",
    "# OpenAlex ingestion\n",
    "MAX_ARTICLES_TO_FETCH = 1000  # above this, the snapshot is loaded instead of paging through the API\n",
    "CONCEPT_SCORE_THRESHOLD = 0.4  # concepts with a score at or below this are ignored\n",
    "TRACKED_CONCEPT_IDS = (\n",
    "    \"https://openalex.org/C2987255567\",  # Knowledge Graph\n",
    "    \"https://openalex.org/C204321447\",  # Natural Language Processing\n",
    "    \"https://openalex.org/C41008148\",  # Computer Science\n",
    ")\n",
    "\n",
    "# Embedding model.\n",
    "# NOTE(review): the training settings originally used were not recorded in this\n",
    "# notebook; the values below are placeholders and should be tuned.\n",
    "EMBEDDING_SIZE = 150\n",
    "NEGATIVES_PER_POSITIVE = 10\n",
    "SCORING_TYPE = \"ComplEx\"\n",
    "TRAIN_EPOCHS = 20\n",
    "TRAIN_BATCH_SIZE = 10000\n",
    "SEED = 0\n",
    "\n",
    "# Institutions whose names are written next to their point in the plots\n",
    "HIGHLIGHTED_INSTITUTIONS = (\n",
    "    \"https://openalex.org/I161318765\",\n",
    "    \"https://openalex.org/I1174212\",\n",
    "    \"https://openalex.org/I95457486\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "knowledge_graphs = Concepts().search(\"knowledge graph\").count()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Build the graph from OpenAlex"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "g = rdflib.Graph()\n",
    "uri = \"urn:acmcmc:unis:\"\n",
    "unis = rdflib.Namespace(uri)\n",
    "# Binding the prefix lets the SPARQL queres below write `unis:...`.\n",
    "g.bind(\"unis\", unis)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def store_graph():\n",
    "    \"\"\"Serialize the notebook-level graph ``g`` to ``GRAPH_OUTPUT_PATH`` as Turtle.\"\"\"\n",
    "    g.serialize(destination=GRAPH_OUTPUT_PATH, format=\"turtle\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def add_article_to_graph(article):\n",
    "    \"\"\"Add one OpenAlex work and everything attached to it to the graph ``g``.\n",
    "\n",
    "    Triples are added for the article itself (type, title, related and\n",
    "    referenced works), for each of its concepts scoring above\n",
    "    ``CONCEPT_SCORE_THRESHOLD``, for each author, and for each institution\n",
    "    listed on an authorship (type, country, name, lineage, and a link to\n",
    "    every retained concept of the article).\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    article : dict\n",
    "        A work record as yielded by the pyalex paginator. The keys read here\n",
    "        are ``id``, ``title``, ``related_works``, ``referenced_works``,\n",
    "        ``concepts`` and ``authorships``.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    list of str\n",
    "        The ids of the concepts that passed the score threshold, so the\n",
    "        caller can count articles per concept.\n",
    "    \"\"\"\n",
    "    article_uri = rdflib.URIRef(article[\"id\"])\n",
    "    g.add((article_uri, rdflib.RDF.type, unis.Article))\n",
    "    g.add((article_uri, unis.title, rdflib.Literal(article[\"title\"])))\n",
    "\n",
    "    # Related and referenced works are plain lists of ids\n",
    "    for related_to in article[\"related_works\"]:\n",
    "        g.add((article_uri, unis.related_to, rdflib.URIRef(related_to)))\n",
    "    for reference in article[\"referenced_works\"]:\n",
    "        g.add((article_uri, unis.references, rdflib.URIRef(reference)))\n",
    "\n",
    "    # Concepts is a list of dicts; keep only the confidently assigned ones\n",
    "    kept_concepts = [\n",
    "        c for c in article[\"concepts\"] if c[\"score\"] > CONCEPT_SCORE_THRESHOLD\n",
    "    ]\n",
    "    concept_uris = []\n",
    "    for concept in kept_concepts:\n",
    "        concept_uri = rdflib.URIRef(concept[\"id\"])\n",
    "        g.add((concept_uri, rdflib.RDF.type, unis.Concept))\n",
    "        g.add((concept_uri, unis.name, rdflib.Literal(concept[\"display_name\"])))\n",
    "        g.add((article_uri, unis.related_to_concept, concept_uri))\n",
    "        concept_uris.append(concept_uri)\n",
    "\n",
    "    # Authorships is a list of dicts\n",
    "    for authorship in article[\"authorships\"]:\n",
    "        author_uri = rdflib.URIRef(authorship[\"author\"][\"id\"])\n",
    "        g.add((author_uri, rdflib.RDF.type, unis.Author))\n",
    "        g.add(\n",
    "            (author_uri, unis.name, rdflib.Literal(authorship[\"author\"][\"display_name\"]))\n",
    "        )\n",
    "        g.add((article_uri, unis.has_author, author_uri))\n",
    "\n",
    "        for institution in authorship[\"institutions\"]:\n",
    "            institution_uri = rdflib.URIRef(institution[\"id\"])\n",
    "            g.add((institution_uri, rdflib.RDF.type, unis.Institution))\n",
    "            # No author -> institution triple on purpose: the author might be\n",
    "            # affiliated to multiple institutions at different times.\n",
    "            g.add((article_uri, unis.written_in_institution, institution_uri))\n",
    "            g.add(\n",
    "                (institution_uri, unis.country, rdflib.Literal(institution[\"country_code\"]))\n",
    "            )\n",
    "            g.add(\n",
    "                (institution_uri, unis.name, rdflib.Literal(institution[\"display_name\"]))\n",
    "            )\n",
    "            for parent_institution_id in institution[\"lineage\"]:\n",
    "                # OpenAlex lists the institution itself in its own lineage;\n",
    "                # skip it so the graph has no `I is_part_of I` self-loops.\n",
    "                if parent_institution_id == institution[\"id\"]:\n",
    "                    continue\n",
    "                parent_institution_uri = rdflib.URIRef(parent_institution_id)\n",
    "                g.add((parent_institution_uri, rdflib.RDF.type, unis.Institution))\n",
    "                g.add((institution_uri, unis.is_part_of, parent_institution_uri))\n",
    "            for concept_uri in concept_uris:\n",
    "                g.add(\n",
    "                    (institution_uri, unis.institution_related_to_concept, concept_uri)\n",
    "                )\n",
    "\n",
    "    return [concept[\"id\"] for concept in kept_concepts]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "articles = Works().filter(\n",
    "    concepts={\"id\": \"C2987255567|C204321447|C41008148\"},\n",
    "    # C2987255567: Knowledge Graph\n",
    "    # C204321447: Natural Language Processing\n",
    "    # C41008148 : Computer Science\n",
    "    authorships={\"institutions\": {\"country_code\": \"US\", \"type\": \"education\"}},\n",
    ").sort(publication_date=\"desc\")\n",
    "\n",
    "# Ask for the count once and reuse it\n",
    "n_articles = articles.count()\n",
    "print(f\"Found {n_articles} articles. Fetching...\")\n",
    "\n",
    "all_articles = []\n",
    "num_articles_concepts = dict.fromkeys(TRACKED_CONCEPT_IDS, 0)\n",
    "\n",
    "if n_articles > MAX_ARTICLES_TO_FETCH:\n",
    "    print(\"Too many articles. Loading from file.\")\n",
    "    g.parse(GRAPH_SNAPSHOT_PATH, format=\"turtle\")\n",
    "else:\n",
    "    # Go through all pages\n",
    "    paginator = articles.paginate(per_page=200, n_max=1000000)\n",
    "    for i, page in enumerate(paginator):\n",
    "        print(f\"Processing page {i}\")\n",
    "        # Checkpoint the graph every 100 pages\n",
    "        if i > 0 and i % 100 == 0:\n",
    "            store_graph()\n",
    "        for article in page:\n",
    "            all_articles.append(article)\n",
    "            # Count each article once per tracked concept, regardless of how\n",
    "            # many authors or institutions it has.\n",
    "            for concept_id in add_article_to_graph(article):\n",
    "                if concept_id in num_articles_concepts:\n",
    "                    num_articles_concepts[concept_id] += 1\n",
    "    # print the numbers of articles per concept\n",
    "    print(num_articles_concepts)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Derive institution-concept links\n",
    "\n",
    "Knowledge extraction rule: if an article P was written in institution I and P is related to concept C, then I is related to C."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Materialize the result before touching the graph, so the triples added\n",
    "# below cannot interfere with the query that produced them.\n",
    "institution_concept_pairs = list(\n",
    "    g.query(\n",
    "        \"\"\"\n",
    "        SELECT DISTINCT ?institution ?concept\n",
    "        WHERE {\n",
    "            ?institution a unis:Institution .\n",
    "            ?article a unis:Article .\n",
    "            ?concept a unis:Concept .\n",
    "            ?article unis:written_in_institution ?institution .\n",
    "            ?article unis:related_to_concept ?concept .\n",
    "        }\n",
    "        \"\"\"\n",
    "    )\n",
    ")\n",
    "# Print the number of results\n",
    "print(f\"Found {len(institution_concept_pairs)} results for the rule.\")\n",
    "for i, (institution, concept) in enumerate(institution_concept_pairs):\n",
    "    if i % 1000 == 0:\n",
    "        print(f\"Processing rule {i}\")\n",
    "    g.add((institution, unis.institution_related_to_concept, concept))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "results = (\n",
    "    Works()\n",
    "    .search_filter(abstract=\"Large Language Model Knowledge Graph\")\n",
    "    .group_by(\"authorships.institutions.id\")\n",
    ")\n",
    "\n",
    "print(f\"Found {results.count()} articles. Fetching...\")\n",
    "\n",
    "df = pd.DataFrame(results.get())\n",
    "\n",
    "display(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "store_graph()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train / validation / test split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Only triples with one of these predicates are used for training\n",
    "TRAINING_PREDICATES = [\n",
    "    unis.related_to,\n",
    "    unis.has_author,\n",
    "    unis.written_in_institution,\n",
    "    unis.related_to_concept,\n",
    "    unis.institution_related_to_concept,\n",
    "    unis.references,\n",
    "    unis.is_part_of,\n",
    "]\n",
    "\n",
    "# (subject, predicate, object) triples as strings, array of size (n_triples, 3)\n",
    "triples = np.array(\n",
    "    [\n",
    "        (str(s), str(p), str(o))\n",
    "        for predicate in TRAINING_PREDICATES\n",
    "        for s, p, o in g.triples((None, predicate, None))\n",
    "    ]\n",
    ")\n",
    "\n",
    "# Split the triples into train, valid, and test sets (80%, 10%, 10%):\n",
    "# first hold out 20%, then cut the held-out part in half.\n",
    "X_train, X_holdout = train_test_split_no_unseen(triples, test_size=0.2)\n",
    "X_valid, X_test = train_test_split_no_unseen(\n",
    "    X_holdout, test_size=0.5, allow_duplication=True\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Store the triples in a file\n",
    "np.save(\"train.npy\", X_train)\n",
    "np.save(\"valid.npy\", X_valid)\n",
    "np.save(\"test.npy\", X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the triples from the file\n",
    "X_train = np.load(\"train.npy\")\n",
    "X_valid = np.load(\"valid.npy\")\n",
    "X_test = np.load(\"test.npy\")\n",
    "\n",
    "print(f\"Train size: {X_train.shape[0]}\")\n",
    "print(f\"Valid size: {X_valid.shape[0]}\")\n",
    "print(f\"Test size: {X_test.shape[0]}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train and evaluate the embedding model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train the embedding model on the training triples.\n",
    "# NOTE(review): this cell was missing from the notebook, so `model` was never\n",
    "# defined on a fresh kernel. Settings come from the config cell - confirm them.\n",
    "model = ScoringBasedEmbeddingModel(\n",
    "    k=EMBEDDING_SIZE,\n",
    "    eta=NEGATIVES_PER_POSITIVE,\n",
    "    scoring_type=SCORING_TYPE,\n",
    "    seed=SEED,\n",
    ")\n",
    "model.compile(optimizer=\"adam\", loss=\"multiclass_nll\")\n",
    "model.fit(X_train, batch_size=TRAIN_BATCH_SIZE, epochs=TRAIN_EPOCHS)\n",
    "\n",
    "# All known true triples; used to filter them out of the corruptions during\n",
    "# evaluation and as the known graph for fact discovery.\n",
    "eval_filter = {\"test\": np.concatenate((X_train, X_valid, X_test))}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run the evaluation procedure on the test set (with filtering)\n",
    "# To disable filtering: use_filter=None\n",
    "# Usually, we corrupt subject and object sides separately and compute ranks\n",
    "ranks = model.evaluate(X_test, use_filter=eval_filter, corrupt_side=\"s,o\")\n",
    "\n",
    "# compute and print metrics:\n",
    "mrr = mrr_score(ranks)\n",
    "hits_10 = hits_at_n_score(ranks, n=10)\n",
    "print(\"MRR: %f, Hits@10: %f\" % (mrr, hits_10))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Store the model: weights through the parent class' save_weights, plus the\n",
    "# model metadata in the same directory.\n",
    "super(ScoringBasedEmbeddingModel, model).save_weights(\"model/\")\n",
    "model.save_metadata(filedir=\"model\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Institution embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collect every named institution with the number of articles written there,\n",
    "# then look up the embedding of each institution.\n",
    "# NOTE(review): the previous version of this query had lost its IRIs\n",
    "# (e.g. `?s a .`) and was not valid SPARQL. The patterns were rebuilt with the\n",
    "# `unis:` prefix used by the other queries in this notebook. Its last pattern,\n",
    "# `?article ?related_to <lost IRI>`, could not be recovered, so articles are\n",
    "# counted per institution without that extra restriction - please confirm.\n",
    "institution_rows = g.query(\n",
    "    \"\"\"\n",
    "    SELECT ?s ?name (COUNT(DISTINCT ?article) AS ?num_articles)\n",
    "    WHERE {\n",
    "        ?s a unis:Institution .\n",
    "        ?s unis:name ?name .\n",
    "        ?article a unis:Article .\n",
    "        ?article unis:written_in_institution ?s .\n",
    "    }\n",
    "    GROUP BY ?s ?name\n",
    "    \"\"\"\n",
    ")\n",
    "# Columns: institution id, institution name, number of articles (all stored as strings)\n",
    "all_ids_institutions = np.array(\n",
    "    [\n",
    "        (str(x), str(name), int(num_articles))\n",
    "        for (x, name, num_articles) in institution_rows\n",
    "    ]\n",
    ")\n",
    "print(all_ids_institutions.shape)\n",
    "print(all_ids_institutions[0])\n",
    "entity_embeddings = model.get_embeddings(entities=all_ids_institutions[:, 0])\n",
    "display(entity_embeddings.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Project the embeddings to two dimensions for plotting\n",
    "pca = PCA(n_components=2)\n",
    "pca.fit(entity_embeddings)\n",
    "entity_embeddings_pca = pca.transform(entity_embeddings)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "clustering_algorithm = KMeans(n_clusters=6, n_init=50, max_iter=500, random_state=0)\n",
    "clusters = find_clusters(\n",
    "    all_ids_institutions[:, 0], model, clustering_algorithm, mode=\"e\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plot_df = pd.DataFrame(\n",
    "    {\n",
    "        \"institution\": all_ids_institutions[:, 0],\n",
    "        \"institution_name\": all_ids_institutions[:, 1],\n",
    "        \"embedding1\": entity_embeddings_pca[:, 0],\n",
    "        \"embedding2\": entity_embeddings_pca[:, 1],\n",
    "        \"cluster\": \"cluster\" + pd.Series(clusters).astype(str),\n",
    "        \"num_articles\": all_ids_institutions[:, 2].astype(int),\n",
    "    }\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def plot_clusters(parameter, highlight=HIGHLIGHTED_INSTITUTIONS):\n",
    "    \"\"\"Scatter-plot the 2-D institution embeddings held in ``plot_df``.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    parameter : str\n",
    "        Name of the ``plot_df`` column used to colour the points; it is also\n",
    "        used in the figure title.\n",
    "    highlight : iterable of str, optional\n",
    "        Institution ids (values of the ``institution`` column) whose names are\n",
    "        written next to their point. Defaults to ``HIGHLIGHTED_INSTITUTIONS``.\n",
    "    \"\"\"\n",
    "    np.random.seed(0)\n",
    "    fig, ax = plt.subplots(figsize=(12, 12))\n",
    "    ax.set_title(\"{} embeddings\".format(parameter).capitalize())\n",
    "    sns.scatterplot(\n",
    "        data=plot_df,\n",
    "        x=\"embedding1\",\n",
    "        y=\"embedding2\",\n",
    "        hue=parameter,\n",
    "        ax=ax,\n",
    "    )\n",
    "    labelled = plot_df[plot_df[\"institution\"].isin(list(highlight))]\n",
    "    texts = [\n",
    "        # Small offset so the label does not sit on top of its point\n",
    "        ax.text(\n",
    "            point.embedding1 + 0.02,\n",
    "            point.embedding2 + 0.01,\n",
    "            str(point.institution_name),\n",
    "        )\n",
    "        for point in labelled.itertuples()\n",
    "    ]\n",
    "    # Nudge the labels apart so they stay readable\n",
    "    adjust_text(texts, ax=ax)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plot_clusters(\"num_articles\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Fact discovery and export"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "discover_facts(\n",
    "    eval_filter[\"test\"],\n",
    "    model,\n",
    "    top_n=100,\n",
    "    strategy=\"random_uniform\",\n",
    "    max_candidates=100,\n",
    "    target_rel=\"urn:acmcmc:unis:related_to_concept\",\n",
    "    seed=0,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a dataframe of the institutions and their names\n",
    "institution_name_rows = g.query(\n",
    "    \"\"\"\n",
    "    SELECT DISTINCT ?institution ?name\n",
    "    WHERE {\n",
    "        ?institution a unis:Institution .\n",
    "        ?institution unis:name ?name .\n",
    "    }\n",
    "    \"\"\"\n",
    ")\n",
    "institutions = pd.DataFrame(institution_name_rows, columns=[\"institution\", \"name\"])\n",
    "# Turn the rdflib terms into plain strings\n",
    "institutions[\"institution\"] = institutions[\"institution\"].map(str)\n",
    "institutions[\"name\"] = institutions[\"name\"].map(str)\n",
    "# Store the dataframe\n",
    "institutions.to_csv(\"institutions.csv\", index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "universities-kge",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}