Spaces:

acmc
/

Universities-Explorer

Sleeping

File size: 19,045 Bytes

cdd672b

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pyalex\n",
    "import dotenv\n",
    "import os\n",
    "from pyalex import Works, Authors, Sources, Institutions, Concepts, Publishers, Funders\n",
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split\n",
    "import numpy as np\n",
    "from ampligraph.evaluation import train_test_split_no_unseen\n",
    "\n",
    "# Read environment variables from a local .env file, if one is present.\n",
    "dotenv.load_dotenv()\n",
    "\n",
    "# MY_EMAIL is handed to pyalex's config; os.getenv returns None when it is unset.\n",
    "pyalex.config.email = os.getenv(\"MY_EMAIL\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of OpenAlex concepts returned for the search phrase \"knowledge graph\".\n",
    "knowledge_graphs = (\n",
    "    Concepts()\n",
    "    .search(\"knowledge graph\")\n",
    "    .count()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import rdflib\n",
    "\n",
    "# Project vocabulary: the classes and predicates used in this notebook are\n",
    "# all created under this URN prefix.\n",
    "uri = \"urn:acmcmc:unis:\"\n",
    "unis = rdflib.Namespace(uri)\n",
    "\n",
    "# Empty in-memory graph with the vocabulary registered under the \"unis\" prefix.\n",
    "g = rdflib.Graph()\n",
    "g.bind(\"unis\", unis)\n",
    "# Optional: start from a previously saved snapshot instead of an empty graph.\n",
    "# g.parse(\"universities_large_1200.ttl\", format=\"turtle\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def store_graph(destination='universities_large.ttl'):\n",
    "    \"\"\"Serialize the global graph ``g`` to a file in Turtle format.\n",
    "\n",
    "    Args:\n",
    "        destination: Path of the output file. The default is the path this\n",
    "            function has always written to, so ``store_graph()`` behaves\n",
    "            exactly as before.\n",
    "    \"\"\"\n",
    "    g.serialize(destination=destination, format='turtle')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Above this many matching works the API is not crawled; a saved snapshot is loaded instead.\n",
    "MAX_ARTICLES_TO_FETCH = 1000\n",
    "# Concepts whose score is not strictly above this value are ignored.\n",
    "MIN_CONCEPT_SCORE = 0.4\n",
    "\n",
    "# Works tagged with any of the three concepts below (\"|\" is OR in an OpenAlex\n",
    "# filter), restricted by authorship institution country_code \"US\" and type\n",
    "# \"education\", newest first.\n",
    "articles = Works().filter(\n",
    "    concepts={\"id\": \"C2987255567|C204321447|C41008148\"},\n",
    "    # C2987255567: Knowledge Graph\n",
    "    # C204321447: Natural Language Processing\n",
    "    # C41008148 : Computer Science\n",
    "    authorships={\"institutions\": {\"country_code\": \"US\", \"type\": \"education\"}},\n",
    ").sort(publication_date=\"desc\")\n",
    "\n",
    "# Ask for the count once and reuse it, instead of repeating the request.\n",
    "num_articles = articles.count()\n",
    "print(f\"Found {num_articles} articles. Fetching...\")\n",
    "\n",
    "if num_articles > MAX_ARTICLES_TO_FETCH:\n",
    "    print(\"Too many articles. Loading from file.\")\n",
    "    g.parse(\"universities_large_1200.ttl\", format=\"turtle\")\n",
    "else:\n",
    "    all_articles = []\n",
    "    # Number of fetched articles carrying each target concept.\n",
    "    num_articles_concepts = {\n",
    "        \"https://openalex.org/C2987255567\": 0,\n",
    "        \"https://openalex.org/C204321447\": 0,\n",
    "        \"https://openalex.org/C41008148\": 0,\n",
    "    }\n",
    "    # Go through all pages\n",
    "    paginator = articles.paginate(per_page=200, n_max=1000000)\n",
    "    for i, page in enumerate(paginator):\n",
    "        print(f\"Processing page {i}\")\n",
    "        # Checkpoint the graph to disk every 100 pages.\n",
    "        if i > 0 and i % 100 == 0:\n",
    "            store_graph()\n",
    "        for article in page:\n",
    "            all_articles.append(article)\n",
    "            article_uri = rdflib.URIRef(article[\"id\"])\n",
    "            g.add((article_uri, rdflib.RDF.type, unis.Article))\n",
    "            g.add((article_uri, unis.title, rdflib.Literal(article[\"title\"])))\n",
    "            # The concept filter depends only on the article, so compute it\n",
    "            # once here rather than once per institution.\n",
    "            relevant_concepts = [\n",
    "                c for c in article[\"concepts\"] if c[\"score\"] > MIN_CONCEPT_SCORE\n",
    "            ]\n",
    "            # related_works and referenced_works are lists of work ids\n",
    "            for related_to in article[\"related_works\"]:\n",
    "                g.add((article_uri, unis.related_to, rdflib.URIRef(related_to)))\n",
    "            for reference in article[\"referenced_works\"]:\n",
    "                g.add((article_uri, unis.references, rdflib.URIRef(reference)))\n",
    "            # Authors is a list of dicts\n",
    "            for author in article[\"authorships\"]:\n",
    "                author_uri = rdflib.URIRef(author[\"author\"][\"id\"])\n",
    "                g.add((author_uri, rdflib.RDF.type, unis.Author))\n",
    "                g.add(\n",
    "                    (\n",
    "                        author_uri,\n",
    "                        unis.name,\n",
    "                        rdflib.Literal(author[\"author\"][\"display_name\"]),\n",
    "                    )\n",
    "                )\n",
    "                g.add((article_uri, unis.has_author, author_uri))\n",
    "                for institution in author[\"institutions\"]:\n",
    "                    institution_uri = rdflib.URIRef(institution[\"id\"])\n",
    "                    g.add((institution_uri, rdflib.RDF.type, unis.Institution))\n",
    "                    # No author -> institution edge is added on purpose: an author\n",
    "                    # may be affiliated to different institutions at different\n",
    "                    # times, so the affiliation is attached to the article instead.\n",
    "                    g.add(\n",
    "                        (\n",
    "                            article_uri,\n",
    "                            unis.written_in_institution,\n",
    "                            institution_uri,\n",
    "                        )\n",
    "                    )\n",
    "                    g.add(\n",
    "                        (\n",
    "                            institution_uri,\n",
    "                            unis.country,\n",
    "                            rdflib.Literal(institution[\"country_code\"]),\n",
    "                        )\n",
    "                    )\n",
    "                    g.add(\n",
    "                        (\n",
    "                            institution_uri,\n",
    "                            unis.name,\n",
    "                            rdflib.Literal(institution[\"display_name\"]),\n",
    "                        )\n",
    "                    )\n",
    "                    for parent_institution_id in institution[\"lineage\"]:\n",
    "                        # OpenAlex lists the institution itself in its own\n",
    "                        # lineage; skip it so the graph gets no\n",
    "                        # \"X is_part_of X\" self-loop.\n",
    "                        if parent_institution_id == institution[\"id\"]:\n",
    "                            continue\n",
    "                        parent_institution_uri = rdflib.URIRef(parent_institution_id)\n",
    "                        g.add(\n",
    "                            (parent_institution_uri, rdflib.RDF.type, unis.Institution)\n",
    "                        )\n",
    "                        g.add(\n",
    "                            (institution_uri, unis.is_part_of, parent_institution_uri)\n",
    "                        )\n",
    "                    # Link the institution to every relevant concept of the article\n",
    "                    for concept in relevant_concepts:\n",
    "                        concept_uri = rdflib.URIRef(concept[\"id\"])\n",
    "                        g.add((concept_uri, rdflib.RDF.type, unis.Concept))\n",
    "                        g.add(\n",
    "                            (\n",
    "                                institution_uri,\n",
    "                                unis.institution_related_to_concept,\n",
    "                                concept_uri,\n",
    "                            )\n",
    "                        )\n",
    "            # Link the article itself to its relevant concepts\n",
    "            for concept in relevant_concepts:\n",
    "                concept_uri = rdflib.URIRef(concept[\"id\"])\n",
    "                g.add((concept_uri, rdflib.RDF.type, unis.Concept))\n",
    "                g.add((article_uri, unis.related_to_concept, concept_uri))\n",
    "                g.add((concept_uri, unis.name, rdflib.Literal(concept[\"display_name\"])))\n",
    "                # Count here, at article level, so each article contributes at\n",
    "                # most one to a concept no matter how many authors or\n",
    "                # institutions it has.\n",
    "                if concept[\"id\"] in num_articles_concepts:\n",
    "                    num_articles_concepts[concept[\"id\"]] += 1\n",
    "        # Running tally of articles per target concept after each page\n",
    "        print(num_articles_concepts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Knowledge-extraction rule: if a paper P was written in institution I and P is\n",
    "# related to concept C, then I is related to C.\n",
    "# Materialise the rule by adding one triple per (institution, concept) pair.\n",
    "rule_query = \"\"\"\n",
    "    SELECT DISTINCT ?institution ?concept\n",
    "    WHERE {\n",
    "        ?institution a unis:Institution .\n",
    "        ?article a unis:Article .\n",
    "        ?concept a unis:Concept .\n",
    "        ?article unis:written_in_institution ?institution .\n",
    "        ?article unis:related_to_concept ?concept .\n",
    "    }\n",
    "    \"\"\"\n",
    "query_results = g.query(rule_query)\n",
    "# Report how many pairs the rule produced\n",
    "print(f\"Found {len(query_results)} results for the rule.\")\n",
    "for i, (inst_node, concept_node) in enumerate(query_results):\n",
    "    # Progress message every 1000 pairs\n",
    "    if not i % 1000:\n",
    "        print(f\"Processing rule {i}\")\n",
    "    g.add((inst_node, unis.institution_related_to_concept, concept_node))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Works whose abstract matches the search phrase, grouped by the id of the\n",
    "# authoring institutions.\n",
    "results = (\n",
    "    Works()\n",
    "    .search_filter(abstract=\"Large Language Model Knowledge Graph\")\n",
    "    .group_by(\"authorships.institutions.id\")\n",
    ")\n",
    "\n",
    "print(f\"Found {results.count()} articles. Fetching...\")\n",
    "\n",
    "# One DataFrame row per group returned by the API\n",
    "grouped_rows = results.get()\n",
    "df = pd.DataFrame(grouped_rows)\n",
    "\n",
    "display(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the current state of the graph to disk.\n",
    "store_graph()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#from rdflib.extras.external_graph_libs import rdflib_to_networkx_multidigraph\n",
    "#import networkx as nx\n",
    "#import matplotlib.pyplot as plt\n",
    "#\n",
    "#G = rdflib_to_networkx_multidigraph(g)\n",
    "#\n",
    "## Plot Networkx instance of RDF Graph\n",
    "#pos = nx.spring_layout(G, scale=0.1)\n",
    "#edge_labels = nx.get_edge_attributes(G, \"r\")\n",
    "#nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels)\n",
    "#nx.draw(G, with_labels=True)\n",
    "#\n",
    "## if not in interactive mode for\n",
    "#plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# Export the graph edges used for embedding training as a numpy array of\n",
    "# shape (n_triples, 3) holding (subject, predicate, object) strings.\n",
    "# Only triples with one of these predicates are kept, collected in this order:\n",
    "selected_predicates = [\n",
    "    unis.related_to,\n",
    "    unis.has_author,\n",
    "    unis.written_in_institution,\n",
    "    unis.related_to_concept,\n",
    "    unis.institution_related_to_concept,\n",
    "    unis.references,\n",
    "    unis.is_part_of,\n",
    "]\n",
    "triples_generator = [\n",
    "    triple\n",
    "    for predicate in selected_predicates\n",
    "    for triple in g.triples((None, predicate, None))\n",
    "]\n",
    "# rdflib terms are converted to their string representation\n",
    "triples = np.array([(str(s), str(p), str(o)) for s, p, o in triples_generator])\n",
    "\n",
    "# Split the triples into train, valid, and test sets (80%, 10%, 10%):\n",
    "# first hold out 20%, then cut that hold-out in half.\n",
    "X_train, X_valid = train_test_split_no_unseen(np.array(triples), test_size=0.2)\n",
    "X_valid, X_test = train_test_split_no_unseen(\n",
    "    X_valid, test_size=0.5, allow_duplication=True\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write each split to its own .npy file\n",
    "for path, split in (\n",
    "    (\"train.npy\", X_train),\n",
    "    (\"valid.npy\", X_valid),\n",
    "    (\"test.npy\", X_test),\n",
    "):\n",
    "    np.save(path, split)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the three splits back from their .npy files\n",
    "X_train, X_valid, X_test = (\n",
    "    np.load(path) for path in (\"train.npy\", \"valid.npy\", \"test.npy\")\n",
    ")\n",
    "\n",
    "# Report the number of rows in each split\n",
    "n_train, n_valid, n_test = (\n",
    "    split.shape[0] for split in (X_train, X_valid, X_test)\n",
    ")\n",
    "print(f\"Train size: {n_train}\")\n",
    "print(f\"Valid size: {n_valid}\")\n",
    "print(f\"Test size: {n_test}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The metric helpers were used here without ever being imported.\n",
    "from ampligraph.evaluation import hits_at_n_score, mrr_score\n",
    "\n",
    "# NOTE(review): `model` is not created anywhere in this notebook. This cell\n",
    "# assumes a trained AmpliGraph embedding model is already bound to that name.\n",
    "\n",
    "# All known true triples, so that corruptions which are themselves true facts\n",
    "# are filtered out of the ranking. Kept under the name `filter`, which this\n",
    "# cell already referenced; note that it shadows the Python builtin.\n",
    "filter = {\"test\": np.concatenate([X_train, X_valid, X_test])}\n",
    "\n",
    "# Run the evaluation procedure on the test set (with filtering)\n",
    "# To disable filtering: use_filter=None\n",
    "# Usually, we corrupt subject and object sides separately and compute ranks\n",
    "ranks = model.evaluate(X_test, use_filter=filter, corrupt_side=\"s,o\")\n",
    "\n",
    "# compute and print metrics:\n",
    "mrr = mrr_score(ranks)\n",
    "hits_10 = hits_at_n_score(ranks, n=10)\n",
    "print(\"MRR: %f, Hits@10: %f\" % (mrr, hits_10))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ScoringBasedEmbeddingModel was referenced below without being imported.\n",
    "from ampligraph.latent_features import ScoringBasedEmbeddingModel\n",
    "\n",
    "# Store the model\n",
    "# save_weights is looked up through super(), i.e. on the classes that follow\n",
    "# ScoringBasedEmbeddingModel in the method resolution order, so any\n",
    "# save_weights defined on ScoringBasedEmbeddingModel itself is skipped.\n",
    "super(ScoringBasedEmbeddingModel, model).save_weights(\"model/\")\n",
    "model.save_metadata(filedir='model')\n",
    "#from ampligraph.utils import save_model\n",
    "#save_model(model, model_name_path='model.pkl')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Generate the embeddings for entities and relations in the graph\n",
    "# and store them in numpy arrays\n",
    "all_ids_institutions = np.array(\n",
    "    [\n",
    "        (str(x), str(name), int(num_articles))\n",
    "        for (x, name, num_articles) in g.query(\n",
    "            \"\"\"SELECT DISTINCT ?s ?name (COUNT (?article) AS ?num_articles)\n",
    "        WHERE {\n",
    "                ?s a <urn:acmcmc:unis:Institution> .\n",
    "                ?s <urn:acmcmc:unis:name> ?name .\n",
    "                ?article <urn:acmcmc:unis:written_in_institution> ?s .\n",
    "                ?article ?related_to <https://openalex.org/C204321447>\n",
    "        }\n",
    "        GROUP BY ?s ?name\n",
    "        \"\"\"\n",
    "        )\n",
    "    ]\n",
    ")\n",
    "print(all_ids_institutions.shape)\n",
    "print(all_ids_institutions[0])\n",
    "entity_embeddings = model.get_embeddings(entities=all_ids_institutions[:, 0])\n",
    "display(entity_embeddings.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.decomposition import PCA\n",
    "\n",
    "# Fit a 2-component PCA on the institution embeddings, then project them.\n",
    "pca = PCA(n_components=2).fit(entity_embeddings)\n",
    "entity_embeddings_pca = pca.transform(entity_embeddings)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from ampligraph.discovery import find_clusters\n",
    "from sklearn.cluster import KMeans\n",
    "\n",
    "# k-means with 6 clusters and a fixed random_state, applied to the institution\n",
    "# URIs (first column); mode=\"e\" presumably selects entity embeddings - confirm\n",
    "# against the AmpliGraph docs.\n",
    "clustering_algorithm = KMeans(random_state=0, n_clusters=6, n_init=50, max_iter=500)\n",
    "institution_ids = all_ids_institutions[:, 0]\n",
    "clusters = find_clusters(institution_ids, model, clustering_algorithm, mode=\"e\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One row per institution: identifiers, 2-D PCA coordinates, cluster label\n",
    "# and article count.\n",
    "institution_uris = all_ids_institutions[:, 0]\n",
    "institution_names = all_ids_institutions[:, 1]\n",
    "# The count column is stored as strings in the array, hence the cast\n",
    "article_counts = all_ids_institutions[:, 2].astype(int)\n",
    "cluster_labels = \"cluster\" + pd.Series(clusters).astype(str)\n",
    "\n",
    "plot_df = pd.DataFrame(\n",
    "    {\n",
    "        \"institution\": institution_uris,\n",
    "        \"institution_name\": institution_names,\n",
    "        \"embedding1\": entity_embeddings_pca[:, 0],\n",
    "        \"embedding2\": entity_embeddings_pca[:, 1],\n",
    "        \"cluster\": cluster_labels,\n",
    "        \"num_articles\": article_counts,\n",
    "    }\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from adjustText import adjust_text\n",
    "\n",
    "\n",
    "def plot_clusters(parameter, highlight_ids=None):\n",
    "    \"\"\"Scatter-plot the 2-D institution embeddings in ``plot_df``.\n",
    "\n",
    "    Args:\n",
    "        parameter: Name of the ``plot_df`` column used to colour the points\n",
    "            (passed to seaborn as ``hue``); it also appears in the title.\n",
    "        highlight_ids: Institution URIs whose rows are printed and labelled\n",
    "            with the institution name on the plot. Defaults to the three\n",
    "            institutions this function has always highlighted.\n",
    "    \"\"\"\n",
    "    if highlight_ids is None:\n",
    "        highlight_ids = [\n",
    "            \"https://openalex.org/I161318765\",\n",
    "            \"https://openalex.org/I1174212\",\n",
    "            \"https://openalex.org/I95457486\",\n",
    "        ]\n",
    "    # Fixed seed - presumably so that the label placement is repeatable.\n",
    "    np.random.seed(0)\n",
    "    plt.figure(figsize=(12, 12))\n",
    "    plt.title(\"{} embeddings\".format(parameter).capitalize())\n",
    "    ax = sns.scatterplot(\n",
    "        data=plot_df,\n",
    "        x=\"embedding1\",\n",
    "        y=\"embedding2\",\n",
    "        hue=parameter,\n",
    "    )\n",
    "    # Only highlighted institutions get a label, so select their rows up\n",
    "    # front instead of testing every row inside the loop.\n",
    "    highlighted = plot_df[plot_df[\"institution\"].isin(highlight_ids)]\n",
    "    texts = []\n",
    "    for _, point in highlighted.iterrows():\n",
    "        print(point)\n",
    "        # Small offset so the label does not sit on top of its marker\n",
    "        texts.append(\n",
    "            ax.text(\n",
    "                point[\"embedding1\"] + 0.02,\n",
    "                point[\"embedding2\"] + 0.01,\n",
    "                str(point[\"institution_name\"]),\n",
    "            )\n",
    "        )\n",
    "    # Let adjustText reposition the labels.\n",
    "    adjust_text(texts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Colour the institution scatter plot by the num_articles column.\n",
    "plot_clusters(\"num_articles\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from ampligraph.discovery import discover_facts\n",
    "\n",
    "# The complete set of known triples (train + valid + test), built directly\n",
    "# from the splits so that this cell does not depend on a `filter` variable.\n",
    "known_triples = np.concatenate([X_train, X_valid, X_test])\n",
    "\n",
    "# Candidate statements for the related_to_concept relation; the result is the\n",
    "# cell's last expression and is therefore displayed.\n",
    "discover_facts(\n",
    "    known_triples,\n",
    "    model,\n",
    "    top_n=100,\n",
    "    strategy=\"random_uniform\",\n",
    "    max_candidates=100,\n",
    "    target_rel=\"urn:acmcmc:unis:related_to_concept\",\n",
    "    seed=0,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build a table of institution URIs and display names and write it to CSV\n",
    "import pandas as pd\n",
    "institution_name_query = \"\"\"\n",
    "    SELECT DISTINCT ?institution ?name\n",
    "    WHERE {\n",
    "        ?institution a unis:Institution .\n",
    "        ?institution unis:name ?name .\n",
    "    }\n",
    "    \"\"\"\n",
    "query_results = g.query(institution_name_query)\n",
    "institutions = pd.DataFrame(query_results, columns=[\"institution\", \"name\"])\n",
    "# Turn the rdflib terms in both columns into plain strings\n",
    "for column in (\"institution\", \"name\"):\n",
    "    institutions[column] = institutions[column].map(str)\n",
    "# Store the dataframe\n",
    "institutions.to_csv(\"institutions.csv\", index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "universities-kge",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}