aldan.creo committed on
Commit
cdd672b
0 Parent(s):

First commit

.env.template ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:876dccf69e76c840d490e31392e63a465919635c812c2113cbc9c445f8af616b
+ size 9
.gitattributes ADDED
@@ -0,0 +1,21 @@
+ universities.ttl filter=lfs diff=lfs merge=lfs -text
+ model/checkpoint filter=lfs diff=lfs merge=lfs -text
+ model/model_metadata.ampkl filter=lfs diff=lfs merge=lfs -text
+ model/.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
+ model/.index filter=lfs diff=lfs merge=lfs -text
+ model/ filter=lfs diff=lfs merge=lfs -text
+ *.template filter=lfs diff=lfs merge=lfs -text
+ *.gitignore filter=lfs diff=lfs merge=lfs -text
+ *.ttl filter=lfs diff=lfs merge=lfs -text
+ *.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
+ *.index filter=lfs diff=lfs merge=lfs -text
+ /model/checkpoint filter=lfs diff=lfs merge=lfs -text
+ *.ampkl filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ universities_large_1200.ttl filter=lfs diff=lfs merge=lfs -text
+ universities_large_4300.ttl filter=lfs diff=lfs merge=lfs -text
+ universities_large_4200.ttl filter=lfs diff=lfs merge=lfs -text
+ universities_large.ttl filter=lfs diff=lfs merge=lfs -text
+ test.csv filter=lfs diff=lfs merge=lfs -text
+ train.csv filter=lfs diff=lfs merge=lfs -text
+ valid.csv filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:51864a12e4a74f37f6cd4399f8de2d35f8c3b60179ae94d03de408d9366411b6
+ size 35
README.md ADDED
@@ -0,0 +1,10 @@
+ ---
+ title: Universities Explorer
+ emoji: 🏢
+ colorFrom: pink
+ colorTo: blue
+ sdk: gradio
+ sdk_version: 4.8.0
+ app_file: app.py
+ pinned: true
+ ---
app.py ADDED
@@ -0,0 +1,370 @@
1
+ # %%
2
+ import gradio as gr
3
+ import matplotlib.pyplot as plt
4
+ import numpy as np
5
+ import pandas as pd
6
+ import rdflib
7
+ import seaborn as sns
8
+ import tensorflow as tf
9
+ from adjustText import adjust_text
10
+ from ampligraph.latent_features import ScoringBasedEmbeddingModel
11
+ from ampligraph.utils import restore_model
12
+ from sklearn.cluster import KMeans
13
+ from sklearn.decomposition import PCA
14
+ import logging
15
+
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+ # Start timer, count time to load graph
20
+ start_time = tf.timestamp()
21
+
22
+ g = rdflib.Graph()
23
+ uri = "urn:acmcmc:unis:"
24
+ unis = rdflib.Namespace(uri)
25
+ g.bind("unis", unis)
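+ # universities.ttl holds the OpenAlex-derived knowledge graph queried by the app (presumably built along the lines of explore.ipynb)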
26
+ g.parse("universities.ttl", format="turtle")
27
+
28
+ # End timer
29
+ end_time = tf.timestamp()
30
+ logger.info("Graph loaded in {} seconds".format(end_time - start_time))
31
+
32
+ # model = restore_model("model.pkl")
33
+
34
+ # Start timer, count time to load model
35
+ start_time = tf.timestamp()
36
+ model = ScoringBasedEmbeddingModel(k=150, eta=10, scoring_type="ComplEx")
37
+ model.load_metadata("model/model")
38
+ model.build_full_model()
39
+ super(ScoringBasedEmbeddingModel, model).load_weights("model/")
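+ # The weights are loaded through the underlying tf.keras.Model (hence the super() call), matching how train.py saves them, rather than via ampligraph.utils.save_model/restore_model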
40
+ # End timer
41
+ end_time = tf.timestamp()
42
+ logger.info("Model loaded in {} seconds".format(end_time - start_time))
43
+
44
+
45
+ def separate_concepts(concepts):
46
+ concept_list = concepts.split(",")
47
+ # Trim the strings
48
+ concept_list = [x.strip() for x in concept_list]
49
+ return concept_list
50
+
51
+
52
+ def pca(embeddings):
53
+ pca = PCA(n_components=2)
54
+ pca.fit(embeddings)
55
+ entity_embeddings_pca = pca.transform(embeddings)
56
+ return entity_embeddings_pca
57
+
58
+
59
+ def cluster(embeddings):
60
+ clustering_algorithm = KMeans(n_clusters=6, n_init=50, max_iter=500, random_state=0)
61
+ clusters = clustering_algorithm.fit_predict(embeddings)
62
+ return clusters
63
+
64
+
65
+ def get_concept_name(concept_uri):
66
+ """
67
+ Get the name of the concept from the URI
68
+ """
69
+ results = g.query(
70
+ f"""SELECT ?name
71
+ WHERE {{
72
+ <{concept_uri}> <urn:acmcmc:unis:name> ?name .
73
+ }}"""
74
+ )
75
+ return pd.DataFrame(results)[0][0]
76
+
77
+
78
+ def get_similarities_to_node(array_of_triples, model):
79
+ """
80
+ Calculate the similarity between the embeddings of a node and a list of other nodes
81
+ """
82
+ # Score each triple with the trained embedding model; the ComplEx scores serve as similarities
83
+ indexes = model.get_indexes(array_of_triples)
84
+ scores = model(indexes)
85
+ return scores
86
+
87
+
88
+ def process_user_input_concept(concept_chooser):
89
+ """
90
+ The user input is the URI of the concept. Get the similarities between the concept and the institutions
91
+ """
92
+ all_ids_institutions = np.loadtxt(
93
+ "institutions.csv", delimiter=",", skiprows=1, dtype=str, quotechar='"'
94
+ )
95
+ # Remove duplicates based on the first column
96
+ all_ids_institutions = all_ids_institutions[
97
+ ~pd.DataFrame(all_ids_institutions).duplicated(0)
98
+ ]
99
+
100
+ chosen_concepts = separate_concepts(concept_chooser)
101
+ all_similarities = []
102
+ for concept in chosen_concepts:
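+ # Build one (institution, institution_related_to_concept, concept) triple per institution; the model's score for each triple is used as the similarity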
103
+ s = all_ids_institutions[:, 0]
104
+ p = np.array(["urn:acmcmc:unis:institution_related_to_concept"] * len(s))
105
+ o = np.array([concept] * len(s))
106
+
107
+ array_of_triples = np.array([s, p, o]).T
108
+
109
+ scores = get_similarities_to_node(array_of_triples, model)
110
+ all_similarities.append(scores)
111
+
112
+ # Now, average the similarities
113
+ scores = np.stack(all_similarities, axis=0)
+ scores = np.mean(scores, axis=0)
115
+
116
+ table_df = pd.DataFrame(
117
+ {
118
+ "institution": s,
119
+ "similarity": scores.flatten(),
120
+ "institution_name": all_ids_institutions[:, 1],
121
+ # "num_articles": all_ids_institutions[:, 2].astype(int),
122
+ }
123
+ )
124
+ # Sort by similarity, descending
125
+ table_df = table_df.sort_values(by=["similarity"], ascending=False)
126
+ concept_names = [get_concept_name(concept_uri) for concept_uri in chosen_concepts]
127
+ return (
128
+ table_df,
129
+ gr.update(visible=True),
130
+ gr.update(visible=True),
131
+ gr.update(visible=True),
132
+ f'Concept names: {", ".join(concept_names)}',
133
+ )
134
+
135
+
136
+ def calculate_embeddings_and_pca(table):
137
+ gr.Info("Performing PCA and clustering...")
138
+ # Perform PCA
139
+ embeddings_of_institutions = model.get_embeddings(
140
+ entities=np.array(table["institution"])
141
+ )
142
+
143
+ entity_embeddings_pca = pca(embeddings_of_institutions)
144
+
145
+ # Perform clustering
146
+ clusters = cluster(embeddings_of_institutions)
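+ # Note that clustering runs on the full-dimensional embeddings; only the plot coordinates come from the 2-D PCA projection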
147
+
148
+ plot_df = pd.DataFrame(
149
+ {
150
+ "embedding1": entity_embeddings_pca[:, 0],
151
+ "embedding2": entity_embeddings_pca[:, 1],
152
+ "cluster": "cluster" + pd.Series(clusters).astype(str),
153
+ }
154
+ )
155
+
156
+ # Toast message
157
+ gr.Info("PCA and clustering done!")
158
+ return plot_df
159
+
160
+
161
+ def click_on_institution(table, embeddings_var, evt: gr.SelectData):
162
+ institution_id = table["institution"][evt.index[0]]
163
+ try:
164
+ embeddings_df = embeddings_var["embeddings_df"]
165
+ plot_df = pd.DataFrame(
166
+ {
167
+ "institution": table["institution"].values,
168
+ "institution_name": table["institution_name"].values,
169
+ "embedding1": embeddings_df["embedding1"].values,
170
+ "embedding2": embeddings_df["embedding2"].values,
171
+ "cluster": embeddings_df["cluster"].values,
172
+ # "num_articles": table["num_articles"].values,
173
+ }
174
+ )
175
+ return plot_embeddings(plot_df, institution_id)
176
+ except Exception:
+ # The embeddings have not been computed yet (no plot drawn), so leave the plot unchanged
+ pass
178
+
179
+
180
+ def click_on_show_plot(table):
181
+ embeddings_df = calculate_embeddings_and_pca(table)
182
+
183
+ plot_df = pd.DataFrame(
184
+ {
185
+ "institution": table["institution"].values,
186
+ "institution_name": table["institution_name"].values,
187
+ "embedding1": embeddings_df["embedding1"].values,
188
+ "embedding2": embeddings_df["embedding2"].values,
189
+ "cluster": embeddings_df["cluster"].values,
190
+ # "num_articles": table["num_articles"].values,
191
+ }
192
+ )
193
+ fig = plot_embeddings(plot_df, None)
194
+
195
+ return fig, {"embeddings_df": plot_df}
196
+
197
+
198
+ def plot_embeddings(plot_df, institution_id):
199
+ fig = plt.figure(figsize=(12, 12))
200
+ np.random.seed(0)
201
+ # fig.title("{} embeddings".format(parameter).capitalize())
202
+ ax = sns.scatterplot(
203
+ data=plot_df,
204
+ x="embedding1",
205
+ y="embedding2",
206
+ hue="cluster",
207
+ )
208
+
209
+ row_of_institution = plot_df[plot_df["institution"] == institution_id]
210
+ if not row_of_institution.empty:
211
+ ax.text(
212
+ row_of_institution["embedding1"],
213
+ row_of_institution["embedding2"],
214
+ row_of_institution["institution_name"].values[0],
215
+ horizontalalignment="left",
216
+ size="medium",
217
+ color="black",
218
+ weight="normal",
219
+ )
220
+ # Also draw a point for the institution
221
+ ax.scatter(
222
+ row_of_institution["embedding1"],
223
+ row_of_institution["embedding2"],
224
+ color="black",
225
+ s=100,
226
+ marker="x",
227
+ )
228
+ # texts = []
229
+ # for i, point in plot_df.iterrows():
230
+ # if point["institution"] == institution_id:
231
+ # texts.append(
232
+ # fig.text(
233
+ # point["embedding1"] + 0.02,
234
+ # point["embedding2"] + 0.01,
235
+ # str(point["institution_name"]),
236
+ # )
237
+ # )
238
+ # adjust_text(texts)
239
+ return fig
240
+
241
+
242
+ def get_authors_of_institution(institutions_table, concept_chooser, evt: gr.SelectData):
243
+ """
244
+ Get the authors of an institution
245
+ """
246
+ institution = institutions_table["institution"][0]
247
+ number_of_row = evt.index[0]
248
+ institution = institutions_table["institution"][number_of_row]
249
+ concepts = separate_concepts(concept_chooser)
250
+ results_dfs = []
251
+ for concept in concepts:
252
+ # Create a dataframe of the authors and the number of articles they have written for each concept
253
+ result = g.query(
254
+ f"""SELECT ?author ?name (COUNT (?article) AS ?num_articles)
255
+ WHERE {{
256
+ ?author a <urn:acmcmc:unis:Author> .
257
+ ?author <urn:acmcmc:unis:name> ?name .
258
+ ?article <urn:acmcmc:unis:written_in_institution> <{institution}> .
259
+ ?article <urn:acmcmc:unis:has_author> ?author .
260
+ ?article <urn:acmcmc:unis:related_to_concept> <{concept}> .
261
+ }}
262
+ GROUP BY ?author ?name
263
+ ORDER BY DESC(COUNT (?article))
264
+ """
265
+ )
266
+ result_df = pd.DataFrame(result)
267
+ result_df.columns = ["author", "name", "num_articles"]
268
+ results_dfs.append(result_df)
269
+ # Now, aggregate the results into a single dataframe by summing the number of articles
270
+ results_df = pd.concat(results_dfs)
271
+ results_df = results_df.groupby(["author", "name"]).sum().reset_index()
272
+ # Sort by number of articles
273
+ results_df = results_df.sort_values(by=["num_articles"], ascending=False)
274
+ return results_df, gr.update(visible=True)
275
+
276
+
277
+ # %%
278
+ theme = gr.themes.Default(primary_hue="cyan", secondary_hue="fuchsia")
279
+
280
+ with gr.Blocks(theme=theme) as demo:
281
+ embeddings_df = gr.State({})
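+ # Per-session state holding the PCA/cluster dataframe, so clicking a table row can redraw the plot without recomputing the embeddings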
282
+ # App title and description
283
+ title = gr.Markdown(
284
+ """
285
+ # Universities Explorer
286
+ This app allows you to explore the institutions more closely related to a concept.
287
+
288
+ It uses embeddings of institutions and concepts to calculate the similarity between them. The embedding model, [ComplEx](https://doi.org/10.48550/arXiv.1606.06357), was trained using the [AmpliGraph](https://github.com/Accenture/AmpliGraph) library. The data comes from the [OpenAlex](https://openalex.org/) dataset, which contains information about scientific articles, authors, institutions, and concepts.
289
+ """
290
+ )
291
+ with gr.Group() as institution_search:
292
+ concept_chooser = gr.Textbox(
293
+ label="Concept URI",
294
+ info="Using OpenAlex, find the URI of the concept you want to search for. For example, the URI of the concept 'Knowledge Graph' is https://openalex.org/C2987255567, while the URI of the concept 'Natural Language Processing' is https://openalex.org/C204321447. You can find the URI of a concept by searching for it on OpenAlex and copying the URL from the address bar. You can also search for multiple concepts by separating them with a comma.",
295
+ placeholder="https://openalex.org/C2987255567, https://openalex.org/C204321447",
296
+ value="https://openalex.org/C2987255567, https://openalex.org/C204321447",
297
+ )
298
+ concept_name_label = gr.Markdown("Concept name: ", visible=False)
299
+ # Table for name of institution and similarity to concept
300
+ btn_search_institutions = gr.Button("Search institutions", variant="primary")
301
+ table = gr.Dataframe(
302
+ interactive=False, visible=False, elem_classes="institutions", wrap=True
303
+ )
304
+ btn_search_institutions.click(
305
+ lambda: gr.update(visible=True), outputs=[table], queue=True
306
+ )
307
+
308
+ btn_plot_embeddings = gr.Button(
309
+ "Plot embeddings", variant="primary", visible=False, elem_classes="embeddings"
310
+ )
311
+ # Description of what plot embeddings does
312
+ plot_embeddings_info = gr.Markdown(
313
+ """
314
+ This button will plot the embeddings of the institutions related to the concept. The embeddings are calculated using the trained model and then reduced to 2 dimensions using PCA. The institutions are then clustered using KMeans.
315
+
316
+ Running this may take a while, as we need to calculate the embeddings for all institutions and then perform PCA and clustering.
317
+ """,
318
+ visible=False,
319
+ )
320
+ btn_search_institutions.click(
321
+ process_user_input_concept,
322
+ inputs=[concept_chooser],
323
+ outputs=[
324
+ table,
325
+ btn_plot_embeddings,
326
+ plot_embeddings_info,
327
+ concept_name_label,
328
+ concept_name_label,
329
+ ],
330
+ queue=True,
331
+ )
332
+ plot = gr.Plot(visible=False, elem_classes="embeddings")
333
+ btn_plot_embeddings.click(
334
+ lambda: gr.update(visible=True), outputs=[plot], queue=True
335
+ )
336
+ btn_plot_embeddings.click(
337
+ click_on_show_plot,
338
+ inputs=[table],
339
+ outputs=[plot, embeddings_df],
340
+ queue=True,
341
+ )
342
+
343
+ # When the user selects a row in the table, get the authors of that institution and display them in a dataframe
344
+ with gr.Group(visible=False, elem_classes="authors") as authors:
345
+ table_authors = gr.Dataframe(
346
+ interactive=False, label="Authors in institution writing about concept"
347
+ )
348
+ table.select(
349
+ get_authors_of_institution,
350
+ inputs=[table, concept_chooser],
351
+ outputs=[table_authors],
352
+ )
353
+ table.select(
354
+ click_on_institution,
355
+ inputs=[table, embeddings_df],
356
+ outputs=[plot],
357
+ )
358
+
359
+ btn_clear = gr.ClearButton(components=[table, plot, table_authors])
360
+
361
+ # Author information
362
+ author_info = gr.Markdown(
363
+ """
364
+ This demo has been built by [Aldan Creo](https://acmc-website.web.app/).
366
+ """
367
+ )
368
+
369
+ demo.queue()
370
+ demo.launch()
explore.ipynb ADDED
@@ -0,0 +1,536 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import pyalex\n",
10
+ "import dotenv\n",
11
+ "import os\n",
12
+ "from pyalex import Works, Authors, Sources, Institutions, Concepts, Publishers, Funders\n",
13
+ "import pandas as pd\n",
14
+ "from sklearn.model_selection import train_test_split\n",
15
+ "import numpy as np\n",
16
+ "from ampligraph.evaluation import train_test_split_no_unseen\n",
17
+ "\n",
18
+ "dotenv.load_dotenv()\n",
19
+ "\n",
20
+ "pyalex.config.email = os.getenv(\"MY_EMAIL\")"
21
+ ]
22
+ },
23
+ {
24
+ "cell_type": "code",
25
+ "execution_count": null,
26
+ "metadata": {},
27
+ "outputs": [],
28
+ "source": [
29
+ "knowledge_graphs = Concepts().search(\"knowledge graph\").count()"
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "code",
34
+ "execution_count": null,
35
+ "metadata": {},
36
+ "outputs": [],
37
+ "source": [
38
+ "import rdflib\n",
39
+ "\n",
40
+ "g = rdflib.Graph()\n",
41
+ "uri = \"urn:acmcmc:unis:\"\n",
42
+ "unis = rdflib.Namespace(uri)\n",
43
+ "g.bind(\"unis\", unis)\n",
44
+ "# g.parse(\"universities_large_1200.ttl\", format=\"turtle\")"
45
+ ]
46
+ },
47
+ {
48
+ "cell_type": "code",
49
+ "execution_count": null,
50
+ "metadata": {},
51
+ "outputs": [],
52
+ "source": [
53
+ "def store_graph():\n",
54
+ " g.serialize(destination='universities_large.ttl', format='turtle')"
55
+ ]
56
+ },
57
+ {
58
+ "cell_type": "code",
59
+ "execution_count": null,
60
+ "metadata": {},
61
+ "outputs": [],
62
+ "source": [
63
+ "articles = (\n",
64
+ " Works()\n",
65
+ " .search_filter(abstract=\"Large Language Model Knowledge Graph\")\n",
66
+ " .filter(authorships={\"institutions\": {\"country_code\": \"US\", \"type\": \"education\"}})\n",
67
+ ")\n",
68
+ "articles = Works().filter(\n",
69
+ " concepts={\"id\": \"C2987255567|C204321447|C41008148\"},\n",
70
+ " # C2987255567: Knowledge Graph\n",
71
+ " # C204321447: Natural Language Processing\n",
72
+ " # C41008148 : Computer Science\n",
73
+ " authorships={\"institutions\": {\"country_code\": \"US\", \"type\": \"education\"}},\n",
74
+ ").sort(publication_date=\"desc\")\n",
75
+ "print(f\"Found {articles.count()} articles. Fetching...\")\n",
76
+ "\n",
77
+ "if articles.count() > 1000:\n",
78
+ " print(\"Too many articles. Loading from file.\")\n",
79
+ " g.parse(\"universities_large_1200.ttl\", format=\"turtle\")\n",
80
+ "else:\n",
81
+ " all_articles = []\n",
82
+ " num_articles_concepts = {\n",
83
+ " \"https://openalex.org/C2987255567\": 0,\n",
84
+ " \"https://openalex.org/C204321447\": 0,\n",
85
+ " \"https://openalex.org/C41008148\": 0,\n",
86
+ " }\n",
87
+ " # Go through all pages\n",
88
+ " paginator = articles.paginate(per_page=200, n_max=1000000)\n",
89
+ " for i, page in enumerate(paginator):\n",
90
+ " print(f\"Processing page {i}\")\n",
91
+ " if i > 0 and i % 100 == 0:\n",
92
+ " store_graph()\n",
93
+ " for article in page:\n",
94
+ " all_articles.append(article)\n",
95
+ " article_uri = rdflib.URIRef(article[\"id\"])\n",
96
+ " g.add((article_uri, rdflib.RDF.type, unis.Article))\n",
97
+ " g.add((article_uri, unis.title, rdflib.Literal(article[\"title\"])))\n",
98
+ " # Related to is a list of ids\n",
99
+ " for related_to in article[\"related_works\"]:\n",
100
+ " g.add((article_uri, unis.related_to, rdflib.URIRef(related_to)))\n",
101
+ " for reference in article[\"referenced_works\"]:\n",
102
+ " g.add((article_uri, unis.references, rdflib.URIRef(reference)))\n",
103
+ " # Authors is a list of dicts\n",
104
+ " for author in article[\"authorships\"]:\n",
105
+ " author_uri = rdflib.URIRef(author[\"author\"][\"id\"])\n",
106
+ " g.add((author_uri, rdflib.RDF.type, unis.Author))\n",
107
+ " g.add(\n",
108
+ " (\n",
109
+ " author_uri,\n",
110
+ " unis.name,\n",
111
+ " rdflib.Literal(author[\"author\"][\"display_name\"]),\n",
112
+ " )\n",
113
+ " )\n",
114
+ " g.add((article_uri, unis.has_author, author_uri))\n",
115
+ " for institution in author[\"institutions\"]:\n",
116
+ " institution_uri = rdflib.URIRef(institution[\"id\"])\n",
117
+ " g.add((institution_uri, rdflib.RDF.type, unis.Institution))\n",
118
+ " # g.add((author_uri, unis.affiliated_to, institution_uri)) # Do not add this, because the author might be affiliated to multiple institutions at different times\n",
119
+ " g.add(\n",
120
+ " (\n",
121
+ " article_uri,\n",
122
+ " unis.written_in_institution,\n",
123
+ " institution_uri,\n",
124
+ " )\n",
125
+ " )\n",
126
+ " g.add(\n",
127
+ " (\n",
128
+ " institution_uri,\n",
129
+ " unis.country,\n",
130
+ " rdflib.Literal(institution[\"country_code\"]),\n",
131
+ " )\n",
132
+ " )\n",
133
+ " g.add(\n",
134
+ " (\n",
135
+ " institution_uri,\n",
136
+ " unis.name,\n",
137
+ " rdflib.Literal(institution[\"display_name\"]),\n",
138
+ " )\n",
139
+ " )\n",
140
+ " for parent_institution_id in institution[\"lineage\"]:\n",
141
+ " parent_institution_uri = rdflib.URIRef(parent_institution_id)\n",
142
+ " g.add(\n",
143
+ " (parent_institution_uri, rdflib.RDF.type, unis.Institution)\n",
144
+ " )\n",
145
+ " g.add(\n",
146
+ " (institution_uri, unis.is_part_of, parent_institution_uri)\n",
147
+ " )\n",
148
+ " # Concepts is a list of dicts\n",
149
+ " for concept in [c for c in article[\"concepts\"] if c[\"score\"] > 0.4]:\n",
150
+ " concept_uri = rdflib.URIRef(concept[\"id\"])\n",
151
+ " g.add((concept_uri, rdflib.RDF.type, unis.Concept))\n",
152
+ " g.add(\n",
153
+ " (\n",
154
+ " institution_uri,\n",
155
+ " unis.institution_related_to_concept,\n",
156
+ " concept_uri,\n",
157
+ " )\n",
158
+ " )\n",
159
+ " # Count the concepts\n",
160
+ " if concept[\"id\"] in num_articles_concepts:\n",
161
+ " num_articles_concepts[concept[\"id\"]] += 1\n",
162
+ " # Concepts is a list of dicts\n",
163
+ " for concept in [c for c in article[\"concepts\"] if c[\"score\"] > 0.4]:\n",
164
+ " concept_uri = rdflib.URIRef(concept[\"id\"])\n",
165
+ " g.add((concept_uri, rdflib.RDF.type, unis.Concept))\n",
166
+ " g.add((article_uri, unis.related_to_concept, concept_uri))\n",
167
+ " g.add((concept_uri, unis.name, rdflib.Literal(concept[\"display_name\"])))\n",
168
+ " # print the numbers of articles per concept\n",
169
+ " print(num_articles_concepts)"
170
+ ]
171
+ },
172
+ {
173
+ "cell_type": "code",
174
+ "execution_count": null,
175
+ "metadata": {},
176
+ "outputs": [],
177
+ "source": [
+ "# Knowledge Extraction rule: if a paper P written in institution I is related to concept C, then I is related to C\n",
179
+ "# Add triples to the graph for this rule\n",
180
+ "query_results = g.query(\n",
181
+ " \"\"\"\n",
182
+ " SELECT DISTINCT ?institution ?concept\n",
183
+ " WHERE {\n",
184
+ " ?institution a unis:Institution .\n",
185
+ " ?article a unis:Article .\n",
186
+ " ?concept a unis:Concept .\n",
187
+ " ?article unis:written_in_institution ?institution .\n",
188
+ " ?article unis:related_to_concept ?concept .\n",
189
+ " }\n",
190
+ " \"\"\"\n",
191
+ ")\n",
192
+ "# Print the number of results\n",
193
+ "print(f\"Found {len(query_results)} results for the rule.\")\n",
194
+ "for i, row in enumerate(query_results):\n",
195
+ " if i % 1000 == 0:\n",
196
+ " print(f\"Processing rule {i}\")\n",
197
+ " g.add((row[0], unis.institution_related_to_concept, row[1]))"
198
+ ]
199
+ },
200
+ {
201
+ "cell_type": "code",
202
+ "execution_count": null,
203
+ "metadata": {},
204
+ "outputs": [],
205
+ "source": [
206
+ "results = Works().search_filter(abstract=\"Large Language Model Knowledge Graph\").group_by(\n",
207
+ " \"authorships.institutions.id\"\n",
208
+ ")\n",
209
+ "\n",
210
+ "print(f\"Found {results.count()} articles. Fetching...\")\n",
211
+ "\n",
212
+ "df = pd.DataFrame(results.get())\n",
213
+ "\n",
214
+ "display(df)"
215
+ ]
216
+ },
217
+ {
218
+ "cell_type": "code",
219
+ "execution_count": null,
220
+ "metadata": {},
221
+ "outputs": [],
222
+ "source": [
223
+ "store_graph()"
224
+ ]
225
+ },
226
+ {
227
+ "cell_type": "code",
228
+ "execution_count": null,
229
+ "metadata": {},
230
+ "outputs": [],
231
+ "source": [
232
+ "#from rdflib.extras.external_graph_libs import rdflib_to_networkx_multidigraph\n",
233
+ "#import networkx as nx\n",
234
+ "#import matplotlib.pyplot as plt\n",
235
+ "#\n",
236
+ "#G = rdflib_to_networkx_multidigraph(g)\n",
237
+ "#\n",
238
+ "## Plot Networkx instance of RDF Graph\n",
239
+ "#pos = nx.spring_layout(G, scale=0.1)\n",
240
+ "#edge_labels = nx.get_edge_attributes(G, \"r\")\n",
241
+ "#nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels)\n",
242
+ "#nx.draw(G, with_labels=True)\n",
243
+ "#\n",
244
+ "## if not in interactive mode for\n",
245
+ "#plt.show()"
246
+ ]
247
+ },
248
+ {
249
+ "cell_type": "code",
250
+ "execution_count": null,
251
+ "metadata": {},
252
+ "outputs": [],
253
+ "source": [
254
+ "\n",
255
+ "# Get the triples from the graph to a numpy array\n",
256
+ "# Array of size (n_triples, 3)\n",
257
+ "# We just want the triples where the predicate is either:\n",
258
+ "# - related_to\n",
259
+ "# - has_author\n",
260
+ "# - written_in_institution\n",
261
+ "# - related_to_concept\n",
262
+ "# - references\n",
263
+ "# - is_part_of\n",
264
+ "triples_generator = list(g.triples((None, unis.related_to, None)))\n",
265
+ "triples_generator += list(g.triples((None, unis.has_author, None)))\n",
266
+ "triples_generator += list(g.triples((None, unis.written_in_institution, None)))\n",
267
+ "triples_generator += list(g.triples((None, unis.related_to_concept, None)))\n",
268
+ "triples_generator += list(g.triples((None, unis.institution_related_to_concept, None)))\n",
269
+ "triples_generator += list(g.triples((None, unis.references, None)))\n",
270
+ "triples_generator += list(g.triples((None, unis.is_part_of, None)))\n",
271
+ "triples = np.array(\n",
272
+ " [(str(s), str(p), str(o)) for s, p, o in triples_generator]\n",
273
+ ") # (subject, predicate, object) triples\n",
274
+ "\n",
275
+ "# Convert the objects to their string representation\n",
276
+ "# Split the triples into train, valid, and test sets (80%, 10%, 10%)\n",
277
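+ "# train_test_split_no_unseen ensures that every entity and relation in the held-out sets also appears in the training set\n",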
+ "X_train, X_valid = train_test_split_no_unseen(np.array(triples), test_size=0.2)\n",
278
+ "X_valid, X_test = train_test_split_no_unseen(X_valid, test_size=0.5, allow_duplication=True)"
279
+ ]
280
+ },
281
+ {
282
+ "cell_type": "code",
283
+ "execution_count": null,
284
+ "metadata": {},
285
+ "outputs": [],
286
+ "source": []
287
+ },
288
+ {
289
+ "cell_type": "code",
290
+ "execution_count": null,
291
+ "metadata": {},
292
+ "outputs": [],
293
+ "source": [
294
+ "# Store the triples in a file\n",
295
+ "np.save(\"train.npy\", X_train)\n",
296
+ "np.save(\"valid.npy\", X_valid)\n",
297
+ "np.save(\"test.npy\", X_test)"
298
+ ]
299
+ },
300
+ {
301
+ "cell_type": "code",
302
+ "execution_count": null,
303
+ "metadata": {},
304
+ "outputs": [],
305
+ "source": [
306
+ "# Load the triples from the file\n",
307
+ "X_train = np.load(\"train.npy\")\n",
308
+ "X_valid = np.load(\"valid.npy\")\n",
309
+ "X_test = np.load(\"test.npy\")\n",
310
+ "\n",
311
+ "print(f\"Train size: {X_train.shape[0]}\")\n",
312
+ "print(f\"Valid size: {X_valid.shape[0]}\")\n",
313
+ "print(f\"Test size: {X_test.shape[0]}\")"
314
+ ]
315
+ },
316
+ {
317
+ "cell_type": "code",
318
+ "execution_count": null,
319
+ "metadata": {},
320
+ "outputs": [],
321
+ "source": [
322
+ "# Run the evaluation procedure on the test set (with filtering)\n",
323
+ "# To disable filtering: use_filter=None\n",
324
+ "# Usually, we corrupt subject and object sides separately and compute ranks\n",
325
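+ "# (this cell assumes the trained model and the filter dict from train.py are available in the session)\n",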
+ "ranks = model.evaluate(X_test, use_filter=filter, corrupt_side=\"s,o\")\n",
326
+ "\n",
327
+ "# compute and print metrics:\n",
328
+ "mrr = mrr_score(ranks)\n",
329
+ "hits_10 = hits_at_n_score(ranks, n=10)\n",
330
+ "print(\"MRR: %f, Hits@10: %f\" % (mrr, hits_10))"
331
+ ]
332
+ },
333
+ {
334
+ "cell_type": "code",
335
+ "execution_count": null,
336
+ "metadata": {},
337
+ "outputs": [],
338
+ "source": [
339
+ "# Store the model\n",
340
+ "super(ScoringBasedEmbeddingModel, model).save_weights(\"model/\")\n",
341
+ "model.save_metadata(filedir='model')\n",
342
+ "#from ampligraph.utils import save_model\n",
343
+ "#save_model(model, model_name_path='model.pkl')"
344
+ ]
345
+ },
346
+ {
347
+ "cell_type": "code",
348
+ "execution_count": null,
349
+ "metadata": {},
350
+ "outputs": [],
351
+ "source": [
352
+ "# Generate the embeddings for entities and relations in the graph\n",
353
+ "# and store them in numpy arrays\n",
354
+ "all_ids_institutions = np.array(\n",
355
+ " [\n",
356
+ " (str(x), str(name), int(num_articles))\n",
357
+ " for (x, name, num_articles) in g.query(\n",
358
+ " \"\"\"SELECT DISTINCT ?s ?name (COUNT (?article) AS ?num_articles)\n",
359
+ " WHERE {\n",
360
+ " ?s a <urn:acmcmc:unis:Institution> .\n",
361
+ " ?s <urn:acmcmc:unis:name> ?name .\n",
362
+ " ?article <urn:acmcmc:unis:written_in_institution> ?s .\n",
363
+ " ?article ?related_to <https://openalex.org/C204321447>\n",
364
+ " }\n",
365
+ " GROUP BY ?s ?name\n",
366
+ " \"\"\"\n",
367
+ " )\n",
368
+ " ]\n",
369
+ ")\n",
370
+ "print(all_ids_institutions.shape)\n",
371
+ "print(all_ids_institutions[0])\n",
372
+ "entity_embeddings = model.get_embeddings(entities=all_ids_institutions[:, 0])\n",
373
+ "display(entity_embeddings.shape)"
374
+ ]
375
+ },
376
+ {
377
+ "cell_type": "code",
378
+ "execution_count": null,
379
+ "metadata": {},
380
+ "outputs": [],
381
+ "source": [
382
+ "# PCA\n",
383
+ "from sklearn.decomposition import PCA\n",
384
+ "pca = PCA(n_components=2)\n",
385
+ "pca.fit(entity_embeddings)\n",
386
+ "entity_embeddings_pca = pca.transform(entity_embeddings)"
387
+ ]
388
+ },
389
+ {
390
+ "cell_type": "code",
391
+ "execution_count": null,
392
+ "metadata": {},
393
+ "outputs": [],
394
+ "source": [
395
+ "from ampligraph.discovery import find_clusters\n",
396
+ "from sklearn.cluster import KMeans\n",
397
+ "\n",
398
+ "clustering_algorithm = KMeans(n_clusters=6, n_init=50, max_iter=500, random_state=0)\n",
399
+ "clusters = find_clusters(all_ids_institutions[:,0], model, clustering_algorithm, mode=\"e\")"
400
+ ]
401
+ },
402
+ {
403
+ "cell_type": "code",
404
+ "execution_count": null,
405
+ "metadata": {},
406
+ "outputs": [],
407
+ "source": [
408
+ "plot_df = pd.DataFrame(\n",
409
+ " {\n",
410
+ " \"institution\": all_ids_institutions[:, 0],\n",
411
+ " \"institution_name\": all_ids_institutions[:, 1],\n",
412
+ " \"embedding1\": entity_embeddings_pca[:, 0],\n",
413
+ " \"embedding2\": entity_embeddings_pca[:, 1],\n",
414
+ " \"cluster\": \"cluster\" + pd.Series(clusters).astype(str),\n",
415
+ " \"num_articles\": all_ids_institutions[:, 2].astype(int),\n",
416
+ " }\n",
417
+ ")"
418
+ ]
419
+ },
420
+ {
421
+ "cell_type": "code",
422
+ "execution_count": null,
423
+ "metadata": {},
424
+ "outputs": [],
425
+ "source": [
426
+ "import matplotlib.pyplot as plt\n",
427
+ "import seaborn as sns\n",
428
+ "from adjustText import adjust_text\n",
429
+ "\n",
430
+ "\n",
431
+ "def plot_clusters(parameter):\n",
432
+ " np.random.seed(0)\n",
433
+ " plt.figure(figsize=(12, 12))\n",
434
+ " plt.title(\"{} embeddings\".format(parameter).capitalize())\n",
435
+ " ax = sns.scatterplot(\n",
436
+ " data=plot_df,\n",
437
+ " x=\"embedding1\",\n",
438
+ " y=\"embedding2\",\n",
439
+ " hue=parameter,\n",
440
+ " )\n",
441
+ " texts = []\n",
442
+ " for i, point in plot_df.iterrows():\n",
443
+ " if point[\"institution\"] in [\"https://openalex.org/I161318765\", 'https://openalex.org/I1174212', 'https://openalex.org/I95457486']:\n",
444
+ " print(point)\n",
445
+ " texts.append(\n",
446
+ " plt.text(\n",
447
+ " point[\"embedding1\"] + 0.02,\n",
448
+ " point[\"embedding2\"] + 0.01,\n",
449
+ " str(point[\"institution_name\"]),\n",
450
+ " )\n",
451
+ " )\n",
452
+ " # texts.append(\n",
453
+ " # plt.text(\n",
454
+ " # point[\"embedding1\"] + 0.02,\n",
455
+ " # point[\"embedding2\"] + 0.01,\n",
456
+ " # str(point[\"institutions\"]),\n",
457
+ " # )\n",
458
+ " # )\n",
459
+ " adjust_text(texts)"
460
+ ]
461
+ },
462
+ {
463
+ "cell_type": "code",
464
+ "execution_count": null,
465
+ "metadata": {},
466
+ "outputs": [],
467
+ "source": [
468
+ "plot_clusters(\"num_articles\")"
469
+ ]
470
+ },
471
+ {
472
+ "cell_type": "code",
473
+ "execution_count": null,
474
+ "metadata": {},
475
+ "outputs": [],
476
+ "source": [
477
+ "from ampligraph.discovery import discover_facts\n",
478
+ "\n",
479
+ "discover_facts(\n",
480
+ " filter['test'],\n",
481
+ " model,\n",
482
+ " top_n=100,\n",
483
+ " strategy=\"random_uniform\",\n",
484
+ " max_candidates=100,\n",
485
+ " target_rel=\"urn:acmcmc:unis:related_to_concept\",\n",
486
+ " seed=0,\n",
487
+ ")"
488
+ ]
489
+ },
490
+ {
491
+ "cell_type": "code",
492
+ "execution_count": null,
493
+ "metadata": {},
494
+ "outputs": [],
495
+ "source": [
496
+ "# Create a dataframe of the institutions and their names\n",
497
+ "import pandas as pd\n",
498
+ "query_results = g.query(\n",
499
+ " \"\"\"\n",
500
+ " SELECT DISTINCT ?institution ?name\n",
501
+ " WHERE {\n",
502
+ " ?institution a unis:Institution .\n",
503
+ " ?institution unis:name ?name .\n",
504
+ " }\n",
505
+ " \"\"\"\n",
506
+ ")\n",
507
+ "institutions = pd.DataFrame(query_results, columns=[\"institution\", \"name\"])\n",
508
+ "institutions[\"institution\"] = institutions[\"institution\"].apply(lambda x: str(x))\n",
509
+ "institutions[\"name\"] = institutions[\"name\"].apply(lambda x: str(x))\n",
510
+ "# Store the dataframe\n",
511
+ "institutions.to_csv(\"institutions.csv\", index=False)"
512
+ ]
513
+ }
514
+ ],
515
+ "metadata": {
516
+ "kernelspec": {
517
+ "display_name": "universities-kge",
518
+ "language": "python",
519
+ "name": "python3"
520
+ },
521
+ "language_info": {
522
+ "codemirror_mode": {
523
+ "name": "ipython",
524
+ "version": 3
525
+ },
526
+ "file_extension": ".py",
527
+ "mimetype": "text/x-python",
528
+ "name": "python",
529
+ "nbconvert_exporter": "python",
530
+ "pygments_lexer": "ipython3",
531
+ "version": "3.11.5"
532
+ }
533
+ },
534
+ "nbformat": 4,
535
+ "nbformat_minor": 2
536
+ }
institutions.csv ADDED
The diff for this file is too large to render. See raw diff
 
model/.data-00000-of-00001 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d5ded6f0bf7985926646dd021e03e008d0f8779f606e4010f0ab89cf8687e943
+ size 87725277
model/.index ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3d9027e082ae75293bde304a2044fbd0549aa0bd1b43d3483c7c28b0ab7bc72b
+ size 291
model/checkpoint ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ba7ac3757b9a57bdd3e603acb528728d61a9479fe392a7f343330aad23f22c50
+ size 59
model/model_metadata.ampkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e7a052e205b870dba54d5a4b23c54f638d93e880c81b66e14ec1c6ae90d2cd33
+ size 24656298
requirements.txt ADDED
@@ -0,0 +1,131 @@
1
+ absl-py==2.0.0
2
+ adjustText==0.8
3
+ aiofiles==23.2.1
4
+ alabaster==0.7.13
5
+ altair==5.2.0
6
+ ampligraph==2.0.1
7
+ annotated-types==0.6.0
8
+ anyio==3.7.1
9
+ astunparse==1.6.3
10
+ attrs==23.1.0
11
+ Babel==2.13.1
12
+ beautifultable==1.1.0
13
+ cachetools==5.3.2
14
+ certifi==2023.11.17
15
+ charset-normalizer==3.3.2
16
+ click==8.1.7
17
+ colorama==0.4.6
18
+ contextlib2==21.6.0
19
+ contourpy==1.2.0
20
+ cycler==0.12.1
21
+ docopt==0.6.2
22
+ docutils==0.17.1
23
+ fastapi==0.105.0
24
+ ffmpy==0.3.1
25
+ filelock==3.13.1
26
+ flake8==6.1.0
27
+ flatbuffers==23.5.26
28
+ fonttools==4.46.0
29
+ fsspec==2023.12.2
30
+ gast==0.5.4
31
+ google-auth==2.25.2
32
+ google-auth-oauthlib==1.1.0
33
+ google-pasta==0.2.0
34
+ gradio==4.8.0
35
+ gradio_client==0.7.1
36
+ grpcio==1.60.0
37
+ h11==0.14.0
38
+ h5py==3.10.0
39
+ httpcore==1.0.2
40
+ httpx==0.25.2
41
+ huggingface-hub==0.19.4
42
+ idna==3.6
43
+ imagesize==1.4.1
44
+ importlib-resources==6.1.1
45
+ iniconfig==2.0.0
46
+ isodate==0.6.1
47
+ Jinja2==3.1.2
48
+ joblib==1.3.2
49
+ jsonschema==4.20.0
50
+ jsonschema-specifications==2023.11.2
51
+ keras==2.15.0
52
+ kiwisolver==1.4.5
53
+ latexcodec==2.0.1
54
+ libclang==16.0.6
55
+ Markdown==3.5.1
56
+ markdown-it-py==2.2.0
57
+ MarkupSafe==2.1.3
58
+ matplotlib==3.8.2
59
+ mccabe==0.7.0
60
+ mdit-py-plugins==0.3.5
61
+ mdurl==0.1.2
62
+ ml-dtypes==0.2.0
63
+ myst-parser==0.18.0
64
+ networkx==3.2.1
65
+ numpy==1.26.2
66
+ oauthlib==3.2.2
67
+ opt-einsum==3.3.0
68
+ orjson==3.9.10
69
+ pandas==2.1.4
70
+ Pillow==10.1.0
71
+ pluggy==1.3.0
72
+ protobuf==4.23.4
73
+ pyalex==0.13
74
+ pyasn1==0.5.1
75
+ pyasn1-modules==0.3.0
76
+ pybtex==0.24.0
77
+ pybtex-docutils==1.0.3
78
+ pycodestyle==2.11.1
79
+ pydantic==2.5.2
80
+ pydantic_core==2.14.5
81
+ pydub==0.25.1
82
+ pyflakes==3.1.0
83
+ pyparsing==3.1.1
84
+ pytest==7.4.3
85
+ python-dotenv==1.0.0
86
+ python-multipart==0.0.6
87
+ pytz==2023.3.post1
88
+ PyYAML==6.0.1
89
+ rdflib==7.0.0
90
+ referencing==0.32.0
91
+ requests==2.31.0
92
+ requests-oauthlib==1.3.1
93
+ rich==13.7.0
94
+ rpds-py==0.13.2
95
+ rsa==4.9
96
+ schema==0.7.5
97
+ scikit-learn==1.3.2
98
+ scipy==1.10.0
99
+ seaborn==0.13.0
100
+ semantic-version==2.10.0
101
+ shellingham==1.5.4
102
+ sniffio==1.3.0
103
+ snowballstemmer==2.2.0
104
+ SPARQLWrapper==2.0.0
105
+ Sphinx==5.0.2
106
+ sphinx-rtd-theme==1.0.0
107
+ sphinxcontrib-applehelp==1.0.7
108
+ sphinxcontrib-bibtex==2.4.2
109
+ sphinxcontrib-devhelp==1.0.5
110
+ sphinxcontrib-htmlhelp==2.0.4
111
+ sphinxcontrib-jsmath==1.0.1
112
+ sphinxcontrib-qthelp==1.0.6
113
+ sphinxcontrib-serializinghtml==1.1.9
114
+ starlette==0.27.0
115
+ tensorboard==2.15.1
116
+ tensorboard-data-server==0.7.2
117
+ tensorflow==2.15.0
118
+ tensorflow-estimator==2.15.0
119
+ tensorflow-io-gcs-filesystem==0.34.0
120
+ termcolor==2.4.0
121
+ threadpoolctl==3.2.0
122
+ tomlkit==0.12.0
123
+ toolz==0.12.0
124
+ tqdm==4.66.1
125
+ typer==0.9.0
126
+ tzdata==2023.3
127
+ urllib3==2.1.0
128
+ uvicorn==0.24.0.post1
129
+ websockets==11.0.3
130
+ Werkzeug==3.0.1
131
+ wrapt==1.14.1
test.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4853fc51e34ffde1e7d2bfc0b463b41d57b163442e8bd2ad748e038d635bb140
+ size 73705888
test.py ADDED
@@ -0,0 +1,12 @@
+ import gradio as gr
+
+ with gr.Blocks() as demo:
+ t1 = gr.Label("Hello world!")
+ btn = gr.Button("Click me")
+ t2 = gr.Label("Hello world!", visible=False)
+ def update():
+ return "abc", gr.update(visible=True)
+ btn.click(update, inputs=[], outputs=[t2, t2])
+
+ # %%
+ demo.launch()
train.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e3e08f7214b7ab53bc55eff7a07eddaff45202a1975bb3a526c4f7bc9e82f83d
+ size 546683543
train.py ADDED
@@ -0,0 +1,215 @@
1
+ # %%
2
+ # Set logging level to DEBUG
3
+ import logging
4
+ import os
5
+
6
+ import dotenv
7
+ import numpy as np
8
+ import pandas as pd
9
+ import pyalex
10
+ import rdflib
11
+ from ampligraph.datasets import (
12
+ GraphDataLoader,
13
+ SQLiteAdapter,
14
+ DataSourceIdentifier,
15
+ )
16
+ from ampligraph.datasets.graph_partitioner import NaiveGraphPartitioner, BucketGraphPartitioner
17
+ from ampligraph.evaluation import train_test_split_no_unseen
18
+ from ampligraph.latent_features import ScoringBasedEmbeddingModel
19
+ from pyalex import Authors, Concepts, Funders, Institutions, Publishers, Sources, Works
20
+ from sklearn.model_selection import train_test_split
21
+ import tensorflow as tf
22
+ from ampligraph.evaluation import hits_at_n_score, mrr_score
23
+ from ampligraph.latent_features import ScoringBasedEmbeddingModel
24
+ from ampligraph.latent_features.loss_functions import get as get_loss
25
+ from ampligraph.latent_features.regularizers import get as get_regularizer
26
+
27
+ logging.basicConfig(level=logging.DEBUG)
28
+ loggers = [logging.getLogger(name) for name in logging.root.manager.loggerDict]
29
+ for logger in loggers:
30
+ logger.setLevel(logging.INFO)
31
+
32
+ # Load the triples from the file
33
+ X_train = np.load("train.npy")
34
+ X_valid = np.load("valid.npy")
35
+ X_test = np.load("test.npy")
36
+
37
+ ## Store as CSVs. There are commas in the names of some institutions, so we need to use a tab as the delimiter
38
+ #np.savetxt("train.csv", X_train, delimiter="\t", fmt="%s")
39
+ #np.savetxt("valid.csv", X_valid, delimiter="\t", fmt="%s")
40
+ #np.savetxt("test.csv", X_test, delimiter="\t", fmt="%s")
41
+ #
42
+ #print(f"Train size: {X_train.shape[0]}")
43
+ #print(f"Valid size: {X_valid.shape[0]}")
44
+ #print(f"Test size: {X_test.shape[0]}")
45
+
46
+
47
+ # Initialize a ComplEx neural embedding model: the embedding size is k,
48
+ # eta specifies the number of corruptions to generate per each positive,
49
+ # scoring_type determines the scoring function of the embedding model.
50
+ partitioned_model = ScoringBasedEmbeddingModel(k=150, eta=10, scoring_type="ComplEx")
51
+
52
+ # Optimizer, loss and regularizer definition
53
+ optim = tf.keras.optimizers.Adam(learning_rate=1e-3)
54
+ loss = get_loss("pairwise", {"margin": 0.5})
55
+ regularizer = get_regularizer("LP", {"p": 2, "lambda": 1e-5})
56
+
57
+ # Compilation of the model
58
+ partitioned_model.compile(
59
+ optimizer=optim, loss=loss, entity_relation_regularizer=regularizer
60
+ )
61
+
62
+ # For evaluation, we can use a filter which would be used to filter out
63
+ # positives statements created by the corruption procedure.
64
+ # Here we define the filter set by concatenating all the positives
65
+
66
+ filter = {"test": np.concatenate((X_train, X_valid, X_test))}
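+ # The filter holds all known true triples so that corruptions which happen to be genuine facts are not counted as ranking errors during evaluation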
67
+
68
+ # Early Stopping callback
69
+ checkpoint = tf.keras.callbacks.EarlyStopping(
70
+ monitor="val_{}".format("hits10"),
71
+ min_delta=0,
72
+ patience=5,
73
+ verbose=1,
74
+ mode="max",
75
+ restore_best_weights=True,
76
+ )
77
+
78
+
79
+ ######
80
+ use_db = False
81
+ if use_db:
82
+ AMPLIGRAPH_DATA_HOME = os.path.join(os.getcwd(), "data") # + os.sep
83
+
84
+ from ampligraph.datasets.data_indexer import SQLite as SQLiteIndexer, DataIndexer
85
+
86
+ # Initialize GraphDataLoader from .csv file
87
+ sqlite_indexer = SQLiteIndexer(
88
+ data=None,
89
+ db_file="main_partition.db",
90
+ root_directory=AMPLIGRAPH_DATA_HOME,
91
+ name="main_partition",
92
+ )
93
+ indexer = DataIndexer(
94
+ X=None,
95
+ backend_type='sqlite',
96
+ backend=sqlite_indexer,
97
+ )
98
+ dataset_loader = GraphDataLoader(
99
+ "train.csv",
100
+ backend=SQLiteAdapter,
101
+ in_memory=False,
102
+ verbose=True,
103
+ root_directory=AMPLIGRAPH_DATA_HOME,
104
+ db_name="mydb.db",
105
+ use_indexer=indexer,
106
+ )
107
+ # adapter = SQLiteAdapter(
108
+ # "database_25-12-2023_07-28-41_485047_PM_2a11fc49-2337-415e-8672-2bfa48a83745.db",
109
+ # identifier=DataSourceIdentifier,
110
+ # root_directory=AMPLIGRAPH_DATA_HOME,
111
+ # )
112
+ print("Graph data loader initialized")
113
+ # for elem in next(dataset_loader._get_batch_generator()):
114
+ # print(elem)
115
+ # break
116
+ ######
117
+ else:
118
+ X_train = np.load("train.npy")
119
+ dataset_loader = GraphDataLoader(
120
+ X_train,
121
+ verbose=True,
122
+ use_indexer=True,
123
+ in_memory=True,
124
+ )
125
+ print(f'next: {next(dataset_loader)}')
126
+ print(f'next: {next(dataset_loader)}')
127
+ print(f'next: {next(dataset_loader)}')
128
+ #x = np.loadtxt(
129
+ # "train.csv",
130
+ # delimiter="\t",
131
+ # dtype=str,
132
+ #)
133
+ #print(x[0])
134
+
135
+ # Choose the partitioner - in this case the NaiveGraphPartitioner
136
+ partition = False
137
+ if partition:
138
+ print("Will start partitioning now")
139
+ graph_partitioner_train = NaiveGraphPartitioner(dataset_loader, k=6)
140
+ print("Graph partitioner initialized")
141
+ #indexer = (
142
+ # partitioned_model.data_handler.get_mapper()
143
+ #) # get the mapper from the trained model
144
+ # dataset_loader_test = GraphDataLoader(
145
+ # data_source=X_test,
146
+ # backend=SQLiteAdapter, # type of backend to use
147
+ # batch_size=400, # batch size to use while iterating over this dataset
148
+ # dataset_type="test", # dataset type
149
+ # use_indexer=indexer, # mapper to map test concepts to the same indices used during training
150
+ # verbose=True,
151
+ # )
152
+ # graph_partitioner_test = BucketGraphPartitioner(data=partitioner, k=3)
153
+
154
+ print("Will start training now")
155
+ # Fit the model on training and validation set
156
+ partitioned_model.fit(
157
+ #graph_partitioner_train if partition else dataset_loader,
158
+ X_train,
159
+ batch_size=500,
160
+ epochs=45, # Number of training epochs
161
+ validation_freq=20, # Epochs between successive validation
162
+ validation_burn_in=100, # Epoch to start validation
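+ # NOTE: with epochs=45 and a burn-in of 100, validation (and therefore the early-stopping monitor) never actually runs in this configuration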
163
+ validation_data=X_test, # Validation data
164
+ validation_filter=filter, # Filter positives from validation corruptions
165
+ callbacks=[
166
+ checkpoint
167
+ ], # Early stopping callback (more from tf.keras.callbacks are supported)
168
+ verbose=True, # Enable stdout messages
169
+ #partitioning_k=7, # Number of partitions to create
170
+ )
171
+
172
+ # %%
173
+ # Store the model
174
+ super(ScoringBasedEmbeddingModel, partitioned_model).save_weights("model/")
175
+ partitioned_model.save_metadata(filedir="model")
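+ # Weights and metadata are stored separately; app.py restores them with load_metadata + build_full_model + super().load_weights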
176
+ # from ampligraph.utils import save_model
177
+ # save_model(partitioned_model, model_name_path='model.pkl')
178
+
179
+ # %%
180
+ # Create a dataframe of the institutions and their names
181
+ import pandas as pd
182
+
183
+ import rdflib
184
+
185
+ g = rdflib.Graph()
186
+ uri = "urn:acmcmc:unis:"
187
+ unis = rdflib.Namespace(uri)
188
+ g.bind("unis", unis)
189
+ g.parse("universities_large_1200.ttl", format="turtle")
190
+
191
+ query_results = g.query(
192
+ """
193
+ SELECT DISTINCT ?institution ?name
194
+ WHERE {
195
+ ?institution a unis:Institution .
196
+ ?institution unis:name ?name .
197
+ }
198
+ """
199
+ )
200
+ institutions = pd.DataFrame(query_results, columns=["institution", "name"])
201
+ institutions["institution"] = institutions["institution"].apply(lambda x: str(x))
202
+ institutions["name"] = institutions["name"].apply(lambda x: str(x))
203
+ # Store the dataframe
204
+ institutions.to_csv("institutions.csv", index=False)
205
+
206
+ # %%
207
+ # Run the evaluation procedure on the test set (with filtering)
208
+ # To disable filtering: use_filter=None
209
+ # Usually, we corrupt subject and object sides separately and compute ranks
210
+ ranks = partitioned_model.evaluate(X_test, use_filter=filter, corrupt_side="s,o")
211
+
212
+ # compute and print metrics:
213
+ mrr = mrr_score(ranks)
214
+ hits_10 = hits_at_n_score(ranks, n=10)
215
+ print("MRR: %f, Hits@10: %f" % (mrr, hits_10))
universities.ttl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cb26d10e53b756c0a17940946cf06603826eb779847b5943cd35155e4257f636
+ size 209243108
universities_large.ttl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e86756a309bf3e29aceb783f9d02cca057466c92862423501d13d9a08fd2ffa
+ size 807256238
universities_large_1200.ttl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:efd28c5a3d62c211b536439bad446658100ba11c26102478d97b3df5483b0dcb
+ size 807269262
universities_large_4200.ttl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1c1fc0caa955c41cc0ffd1cd8a903b8845de5f0b463e3c1d65ec94cdc3d71e9c
+ size 1757505860
universities_large_4300.ttl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cfadae62d79379f75bf86b7c30bd58753b65cdc97c69aafad796c60faaa84de4
+ size 1818624628
valid.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e0a30e621a7859e186970e2b4bc81e2bc9ffc6ece265d373e95a26733e397314
+ size 71276234