# Hugging Face Space: acmc/Universities-Explorer (status: Running)
# File size: 13,696 Bytes

# %%
import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import rdflib
import seaborn as sns
import tensorflow as tf
from adjustText import adjust_text
from ampligraph.latent_features import ScoringBasedEmbeddingModel
from ampligraph.utils import restore_model
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import logging


# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Start timer, count time to load graph
# NOTE(review): tf.timestamp() is used as the clock, so the elapsed value that
# gets logged is whatever the subtraction of two tf.timestamp() results yields
# -- confirm it formats as a plain number of seconds.
start_time = tf.timestamp()

# Load the universities graph once at import time. `g` is the graph that the
# SPARQL helpers further down in this file query.
g = rdflib.Graph()
uri = "urn:acmcmc:unis:"
unis = rdflib.Namespace(uri)
# Register the "unis" prefix for the namespace above on the graph.
g.bind("unis", unis)
g.parse("universities.ttl", format="turtle")

# End timer
end_time = tf.timestamp()
logger.info("Graph loaded in {} seconds".format(end_time - start_time))

# Earlier loading path, kept for reference:
# model = restore_model("model.pkl")

# Start timer, count time to load model
start_time = tf.timestamp()
# Rebuild the embedding model from its saved metadata and weights. The steps
# below run in a fixed order: construct, load metadata, build, load weights.
model = ScoringBasedEmbeddingModel(k=150, eta=10, scoring_type="ComplEx")
model.load_metadata("model/model")
model.build_full_model()
# The explicit super(...) call skips any load_weights defined on
# ScoringBasedEmbeddingModel itself and uses the next implementation in its
# MRO -- presumably the base Keras one; TODO confirm.
super(ScoringBasedEmbeddingModel, model).load_weights("model/")
# End timer
end_time = tf.timestamp()
logger.info("Model loaded in {} seconds".format(end_time - start_time))


def separate_concepts(concepts):
    """
    Split a comma-separated string of concept URIs into a list.

    Whitespace around every entry is stripped. Entries that are empty after
    stripping (for example from a trailing comma or a blank input) are dropped,
    so callers never receive an empty URI.

    Args:
        concepts: Comma-separated concept URIs.

    Returns:
        The non-empty, stripped entries in their original order.
    """
    stripped_pieces = (piece.strip() for piece in concepts.split(","))
    return [piece for piece in stripped_pieces if piece]


def pca(embeddings):
    """
    Fit a two-component PCA on the embeddings and return their projection.
    """
    # fit() hands back the fitted estimator, so the transform can be chained
    reducer = PCA(n_components=2).fit(embeddings)
    return reducer.transform(embeddings)


def cluster(embeddings):
    """
    Run KMeans with six clusters on the embeddings and return one label per row.
    """
    kmeans_settings = {
        "n_clusters": 6,
        "n_init": 50,
        "max_iter": 500,
        # Fixed seed, so the same input always gets the same labels
        "random_state": 0,
    }
    return KMeans(**kmeans_settings).fit_predict(embeddings)


def get_concept_name(concept_uri):
    """
    Get the name of the concept from the URI

    Args:
        concept_uri: URI of the concept to look up in the graph.

    Returns:
        The first `urn:acmcmc:unis:name` value the graph returns for the
        concept.

    Raises:
        KeyError: If the graph holds no name for the concept.
        Exception: If `concept_uri` contains characters that are not allowed
            in a URI (raised by rdflib while rendering the reference).
    """
    # The URI comes from a user-facing textbox. URIRef.n3() renders it as
    # `<...>` and refuses values containing characters such as `<`, `>`, `{`,
    # `}` or spaces, so the input cannot break out of the IRI position.
    concept_ref = rdflib.URIRef(concept_uri).n3()
    results = g.query(
        f"""SELECT ?name
        WHERE {{
                {concept_ref} <urn:acmcmc:unis:name> ?name .
        }}"""
    )
    for row in results:
        return row[0]
    raise KeyError(f"No name found for concept {concept_ref}")


def get_similarities_to_node(array_of_triples, model):
    """
    Score a batch of triples with the given model.

    The triples are translated with the model's `get_indexes`, the model is
    called on the result, and whatever that call returns is handed back as is.
    """
    triple_indexes = model.get_indexes(array_of_triples)
    return model(triple_indexes)


def process_user_input_concept(concept_chooser):
    """
    The user input is the URI of the concept. Get the similarities between the concept and the institutions

    Every institution listed in `institutions.csv` is scored against each
    chosen concept through the `institution_related_to_concept` relation, and
    the per-concept scores are averaged.

    Args:
        concept_chooser: Comma-separated concept URIs.

    Returns:
        A 3-tuple: the results table sorted by descending mean similarity
        (columns: institution, one `similarity_to_<name>` per concept,
        mean_similarity, institution_name), followed by two
        `gr.update(visible=True)` values for the components wired as outputs.

    Raises:
        gr.Error: If the input contains no concept URI.
    """
    all_ids_institutions = np.loadtxt(
        "institutions.csv", delimiter=",", skiprows=1, dtype=str, quotechar='"'
    )
    # Remove duplicates based on the first column
    is_first_occurrence = ~pd.DataFrame(all_ids_institutions).duplicated(0).to_numpy()
    all_ids_institutions = all_ids_institutions[is_first_occurrence]
    institution_ids = all_ids_institutions[:, 0]

    # Drop blank entries and repeated URIs while keeping the user's order: a
    # blank URI cannot be queried and a repeated one would create a duplicate
    # column.
    chosen_concepts = list(
        dict.fromkeys(c for c in separate_concepts(concept_chooser) if c)
    )
    if not chosen_concepts:
        raise gr.Error("Please enter at least one concept URI.")

    # Resolve the names first so that an unknown concept fails before any
    # scoring work is done.
    chosen_concepts_names = [get_concept_name(concept) for concept in chosen_concepts]

    # The subject and predicate columns are the same for every concept.
    relation = np.array(
        ["urn:acmcmc:unis:institution_related_to_concept"] * len(institution_ids)
    )
    all_similarities = {}
    for concept in chosen_concepts:
        concept_column = np.array([concept] * len(institution_ids))
        array_of_triples = np.array([institution_ids, relation, concept_column]).T

        scores = get_similarities_to_node(array_of_triples, model)
        # Store a flat NumPy array so the values can be used directly as a
        # DataFrame column.
        all_similarities[concept] = np.asarray(scores).flatten()

    # Now, average the similarities
    mean_scores = np.mean(np.stack(list(all_similarities.values()), axis=0), axis=0)

    # Build the columns directly in display order: the individual
    # similarities, then the mean, then the institution name.
    similarity_columns = {
        f"similarity_to_{name}": all_similarities[concept]
        for concept, name in zip(chosen_concepts, chosen_concepts_names)
    }
    table_df = pd.DataFrame(
        {
            "institution": institution_ids,
            **similarity_columns,
            "mean_similarity": mean_scores.flatten(),
            "institution_name": all_ids_institutions[:, 1],
            # "num_articles": all_ids_institutions[:, 2].astype(int),
        }
    )

    # Sort by mean similarity
    table_df = table_df.sort_values(by=["mean_similarity"], ascending=False)

    return (
        table_df,
        gr.update(visible=True),
        gr.update(visible=True),
    )


def calculate_emdeddings_and_pca(table):
    """
    Fetch the embeddings of the institutions in the table, project them with
    `pca` and label them with `cluster`.

    Returns a DataFrame with the columns `embedding_x`, `embedding_y` and
    `cluster` (labels formatted as "cluster<N>"). An info toast is shown
    before and after the computation.
    """
    gr.Info("Performing PCA and clustering...")

    institution_ids = np.array(table["institution"])
    institution_vectors = model.get_embeddings(entities=institution_ids)

    # Both steps receive the embeddings exactly as the model returned them
    projected = pca(institution_vectors)
    labels = cluster(institution_vectors)

    result = pd.DataFrame(
        {
            "embedding_x": projected[:, 0],
            "embedding_y": projected[:, 1],
            "cluster": "cluster" + pd.Series(labels).astype(str),
        }
    )

    # Toast message
    gr.Info("PCA and clustering done!")
    return result


def click_on_institution(table, embeddings_var, evt: gr.SelectData):
    """
    Redraw the embeddings plot with the selected institution highlighted.

    Args:
        table: DataFrame behind the institutions table; its `institution` and
            `institution_name` columns are read.
        embeddings_var: State dictionary; the DataFrame stored under
            `"embeddings_df"` supplies the coordinates and cluster labels.
        evt: Selection event; `evt.index[0]` is used as the row position.

    Returns:
        The figure produced by `plot_embeddings`, or None when no embeddings
        have been stored yet or the plot could not be built.
    """
    # evt.index[0] is a row position, so look it up positionally
    institution_id = table["institution"].iloc[evt.index[0]]

    embeddings_df = embeddings_var.get("embeddings_df") if embeddings_var else None
    if embeddings_df is None:
        # The embeddings have not been plotted yet: nothing to highlight.
        return None

    try:
        plot_df = pd.DataFrame(
            {
                "institution": table["institution"].values,
                "institution_name": table["institution_name"].values,
                "embedding_x": embeddings_df["embedding_x"].values,
                "embedding_y": embeddings_df["embedding_y"].values,
                "cluster": embeddings_df["cluster"].values,
                # "num_articles": table["num_articles"].values,
            }
        )
        return plot_embeddings(plot_df, institution_id)
    except Exception:
        # Highlighting is best effort (for example the stored embeddings may
        # no longer match the table), but the failure should be visible in the
        # logs rather than silently discarded.
        logger.exception("Could not highlight the selected institution")
        return None


def click_on_show_plot(table):
    """
    Compute the embeddings plot for every institution in the table.

    Args:
        table: DataFrame behind the institutions table; its `institution` and
            `institution_name` columns are read.

    Returns:
        A 2-tuple: the figure from `plot_embeddings` (no institution
        highlighted) and a dictionary holding the plotted DataFrame under
        `"embeddings_df"`, which `click_on_institution` reads back later.
    """
    embeddings_df = calculate_emdeddings_and_pca(table)

    plot_df = pd.DataFrame(
        {
            "institution": table["institution"].values,
            # The column is called "institution_name" both in the table and in
            # the functions that consume this DataFrame.
            "institution_name": table["institution_name"].values,
            "embedding_x": embeddings_df["embedding_x"].values,
            "embedding_y": embeddings_df["embedding_y"].values,
            "cluster": embeddings_df["cluster"].values,
            # "num_articles": table["num_articles"].values,
        }
    )
    fig = plot_embeddings(plot_df, None)

    return fig, {"embeddings_df": plot_df}


def plot_embeddings(plot_df, institution_id):
    """
    Draw the 2D embeddings as a scatter plot coloured by cluster.

    Args:
        plot_df: DataFrame with the columns `institution`, `institution_name`,
            `embedding_x`, `embedding_y` and `cluster`.
        institution_id: Value of the `institution` column to highlight. When
            no row matches (for example None), nothing is highlighted.

    Returns:
        The matplotlib figure holding the plot.
    """
    fig, ax = plt.subplots(figsize=(12, 12))
    # NOTE(review): this reseeds the global NumPy RNG, although nothing in
    # this function visibly draws random numbers -- confirm whether it is
    # still needed.
    np.random.seed(0)
    sns.scatterplot(
        data=plot_df,
        x="embedding_x",
        y="embedding_y",
        hue="cluster",
        ax=ax,
    )

    row_of_institution = plot_df[plot_df["institution"] == institution_id]
    if not row_of_institution.empty:
        # Label the first matching row, passing plain scalars to matplotlib
        # instead of one-element Series.
        first_match = row_of_institution.iloc[0]
        ax.text(
            first_match["embedding_x"],
            first_match["embedding_y"],
            first_match["institution_name"],
            horizontalalignment="left",
            size="medium",
            color="black",
            weight="normal",
        )
        # Also draw a point for the institution
        ax.scatter(
            row_of_institution["embedding_x"].to_numpy(),
            row_of_institution["embedding_y"].to_numpy(),
            color="black",
            s=100,
            marker="x",
        )

    # Deregister the figure from pyplot so that repeated calls do not pile up
    # open figures; the Figure object itself stays usable by the caller.
    plt.close(fig)
    return fig


def get_authors_of_institution(institutions_table, concept_chooser, evt: gr.SelectData):
    """
    Get the authors of an institution

    For the institution in the selected row, count per author the articles
    written in that institution and related to each chosen concept, then sum
    the counts over all concepts.

    Args:
        institutions_table: DataFrame behind the institutions table; its
            `institution` column is read.
        concept_chooser: Comma-separated concept URIs.
        evt: Selection event; `evt.index[0]` is used as the row position.

    Returns:
        A 2-tuple: a DataFrame with the columns `author`, `name` and
        `num_articles`, sorted by descending `num_articles` (empty when
        nothing matches), and a `gr.update(visible=True)`.

    Raises:
        Exception: If the institution or a concept contains characters that
            are not allowed in a URI (raised by rdflib while rendering it).
    """
    institution = institutions_table["institution"].iloc[evt.index[0]]
    # URIRef.n3() renders a URI as `<...>` and refuses values containing
    # characters such as `<`, `>`, `{`, `}` or spaces, so user-supplied text
    # cannot break out of the IRI position in the query.
    institution_ref = rdflib.URIRef(institution).n3()

    # Drop blank entries and repeated URIs so that no concept is queried (and
    # counted) twice.
    concepts = list(dict.fromkeys(c for c in separate_concepts(concept_chooser) if c))

    rows = []
    for concept in concepts:
        concept_ref = rdflib.URIRef(concept).n3()
        # Authors and the number of articles they have written for this concept
        result = g.query(
            f"""SELECT ?author ?name (COUNT (?article) AS ?num_articles)
            WHERE {{
                    ?author a <urn:acmcmc:unis:Author> .
                    ?author <urn:acmcmc:unis:name> ?name .
                    ?article <urn:acmcmc:unis:written_in_institution> {institution_ref} .
                    ?article <urn:acmcmc:unis:has_author> ?author .
                    ?article <urn:acmcmc:unis:related_to_concept> {concept_ref} .
            }}
            GROUP BY ?author ?name
            ORDER BY DESC(COUNT (?article))
            """
        )
        # Convert the RDF terms to plain Python values so the counts add up
        # as integers.
        rows.extend(
            (str(author), str(name), int(num_articles))
            for author, name, num_articles in result
        )

    # Naming the columns here also covers the case where nothing matched.
    results_df = pd.DataFrame(rows, columns=["author", "name", "num_articles"])
    # Now, aggregate the results by summing the number of articles
    results_df = results_df.groupby(["author", "name"]).sum().reset_index()
    # Sort by number of articles
    results_df = results_df.sort_values(by=["num_articles"], ascending=False)
    return results_df, gr.update(visible=True)


# %%
# User interface: a concept search box, the institutions table, an optional
# embeddings plot, and a table of authors for the selected institution.
theme = gr.themes.Default(primary_hue="cyan", secondary_hue="fuchsia")

with gr.Blocks(theme=theme) as demo:
    # Holds the DataFrame of plotted embeddings between events
    embeddings_df = gr.State({})
    # App title and description
    title = gr.Markdown(
        """
        # Universities Explorer
        This app allows you to explore the institutions more closely related to a concept.

        It uses embeddings of institutions and concepts to calculate the similarity between them. The embedding model, [ComplEx](https://doi.org/10.48550/arXiv.1606.06357), was trained using the [AmpliGraph](https://github.com/Accenture/AmpliGraph) library. The data comes from the [OpenAlex](https://openalex.org/) dataset, which contains information about scientific articles, authors, institutions, and concepts.
        """
    )
    with gr.Group() as institution_search:
        concept_chooser = gr.Textbox(
            label="Concept URI",
            info="Using OpenAlex, find the URI of the concept you want to search for. For example, the URI of the concept 'Knowledge Graph' is https://openalex.org/C2987255567, while the URI of the concept 'Natural Language Processing' is https://openalex.org/C204321447. You can find the URI of a concept by searching for it on OpenAlex and copying the URL from the address bar. You can also search for multiple concepts by separating them with a comma.",
            placeholder="https://openalex.org/C2987255567, https://openalex.org/C204321447",
            value="https://openalex.org/C2987255567, https://openalex.org/C204321447",
        )
        concept_name_label = gr.Markdown("Concept name: ", visible=False)
        # Table for name of institution and similarity to concept
        btn_search_institutions = gr.Button("Search institutions", variant="primary")
        table = gr.Dataframe(
            interactive=False, visible=False, elem_classes="institutions", wrap=True
        )
        # Reveal the (initially hidden) table as soon as a search is started
        btn_search_institutions.click(
            lambda: gr.update(visible=True), outputs=[table], queue=True
        )

    btn_plot_embeddings = gr.Button(
        "Plot embeddings", variant="primary", visible=False, elem_classes="embeddings"
    )
    # Description of what plot embeddings does
    plot_embeddings_info = gr.Markdown(
        """
        This button will plot the embeddings of the institutions related to the concept. The embeddings are calculated using the trained model and then reduced to 2 dimensions using PCA. The institutions are then clustered using KMeans.
        
        Running this may take a while, as we need to calculate the embeddings for all institutions and then perform PCA and clustering.
        """,
        visible=False,
    )
    # Fill the table and reveal the plot button together with its description
    btn_search_institutions.click(
        process_user_input_concept,
        inputs=[concept_chooser],
        outputs=[
            table,
            btn_plot_embeddings,
            plot_embeddings_info,
            #concept_name_label,
            #concept_name_label,
        ],
        queue=True,
    )
    plot = gr.Plot(visible=False, elem_classes="embeddings")
    # Reveal the (initially hidden) plot area when plotting is requested
    btn_plot_embeddings.click(
        lambda: gr.update(visible=True), outputs=[plot], queue=True
    )
    btn_plot_embeddings.click(
        click_on_show_plot,
        inputs=[table],
        outputs=[plot, embeddings_df],
        queue=True,
    )

    # When the user selects a row in the table, get the authors of that institution and display them in a dataframe
    with gr.Group(visible=False, elem_classes="authors") as authors:
        table_authors = gr.Dataframe(
            interactive=False, label="Authors in institution writing about concept"
        )
        # get_authors_of_institution returns two values: the authors table and
        # a visibility update. The second one targets the enclosing `authors`
        # group, which starts hidden and is revealed by it.
        table.select(
            get_authors_of_institution,
            inputs=[table, concept_chooser],
            outputs=[table_authors, authors],
        )
        table.select(
            click_on_institution,
            inputs=[table, embeddings_df],
            outputs=[plot],
        )

    btn_clear = gr.ClearButton(components=[table, plot, table_authors])

    # Author information
    author_info = gr.Markdown(
        """
        This demo has been built by [Aldan Creo](
        https://acmc-website.web.app/).
        """
    )

demo.queue()
demo.launch()