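# NOTE: the following lines are file-viewer residue captured with the source
# (apparently author / commit message / commit hash / viewer links / file size);
# they are not part of the program.
# acmc
# new model
# 36c5b68
# raw
# history blame
# 13.9 kB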
# %%
import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import rdflib
import seaborn as sns
import tensorflow as tf
from adjustText import adjust_text
from ampligraph.latent_features import ScoringBasedEmbeddingModel
from ampligraph.utils import restore_model
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import logging
# Module-level logger; no handler or level is configured in this file.
logger = logging.getLogger(__name__)
# Start timer, count time to load graph
start_time = tf.timestamp()
# `g` is the module-wide RDF graph that the SPARQL queries in this file run against.
g = rdflib.Graph()
# Namespace prefix used by the predicates and classes queried in this file.
uri = "urn:acmcmc:unis:"
unis = rdflib.Namespace(uri)
g.bind("unis", unis)
# Parse the Turtle file from the current working directory (runs at import time).
g.parse("universities.ttl", format="turtle")
# End timer
end_time = tf.timestamp()
logger.info("Graph loaded in {} seconds".format(end_time - start_time))
# Earlier loading path, kept for reference:
# model = restore_model("model.pkl")
# Start timer, count time to load model
start_time = tf.timestamp()
# The hyper-parameters here presumably have to match the ones the saved model
# was trained with -- TODO confirm against the training script.
model = ScoringBasedEmbeddingModel(k=150, eta=10, scoring_type="ComplEx")
model.load_metadata("model/model")
model.build_full_model()
# Deliberately resolve `load_weights` on the class that follows
# ScoringBasedEmbeddingModel in the MRO, bypassing any override defined on
# ScoringBasedEmbeddingModel itself (presumably the Keras implementation -- confirm).
super(ScoringBasedEmbeddingModel, model).load_weights("model/")
# End timer
end_time = tf.timestamp()
logger.info("Model loaded in {} seconds".format(end_time - start_time))
def separate_concepts(concepts):
    """Split a comma-separated string of concept URIs into a clean list.

    Each fragment is stripped of surrounding whitespace. Empty fragments
    (produced by an empty input, a trailing comma, or two consecutive commas)
    are dropped so callers never receive an empty URI.

    Args:
        concepts: String such as ``"https://a/C1, https://a/C2"``.

    Returns:
        List of non-empty, stripped fragments in their original order.
    """
    stripped = (fragment.strip() for fragment in concepts.split(","))
    return [fragment for fragment in stripped if fragment]
def pca(embeddings):
    """Reduce ``embeddings`` to two dimensions with PCA.

    The PCA is fitted on ``embeddings`` and the same data is then transformed.

    Args:
        embeddings: Array-like accepted by ``sklearn.decomposition.PCA``.

    Returns:
        The transformed data with two components per input row.
    """
    reducer = PCA(n_components=2)
    return reducer.fit(embeddings).transform(embeddings)
def cluster(embeddings):
    """Assign each embedding to one of six KMeans clusters.

    KMeans is run with a fixed ``random_state`` of 0.

    Args:
        embeddings: Array-like accepted by ``sklearn.cluster.KMeans``.

    Returns:
        The cluster label predicted for each input row.
    """
    kmeans = KMeans(n_clusters=6, n_init=50, max_iter=500, random_state=0)
    return kmeans.fit_predict(embeddings)
def get_concept_name(concept_uri):
    """Get the name of the concept from the URI.

    Looks up the ``urn:acmcmc:unis:name`` value of ``concept_uri`` in the
    module-level graph ``g``.

    Args:
        concept_uri: URI of the concept, as a string.

    Returns:
        The first ``?name`` binding returned by the query.

    Raises:
        KeyError: If the graph holds no name for ``concept_uri``.
    """
    # The URI comes from user input, so bind it as a query variable instead of
    # splicing it into the SPARQL text.
    results = g.query(
        "SELECT ?name WHERE { ?concept <urn:acmcmc:unis:name> ?name . }",
        initBindings={"concept": rdflib.URIRef(concept_uri)},
    )
    for row in results:
        return row[0]
    # Same exception type as the original empty-result failure, but with a
    # message that says what was missing.
    raise KeyError(f"No name found for concept <{concept_uri}>")
def get_similarities_to_node(array_of_triples, model):
    """Score a batch of triples with the given model.

    The triples are first mapped to the model's internal indexes with
    ``model.get_indexes`` and the model is then called on those indexes.

    Args:
        array_of_triples: Triples to score, in the form ``model.get_indexes`` accepts.
        model: Object exposing ``get_indexes`` and callable on its result.

    Returns:
        Whatever calling ``model`` on the indexes returns.
    """
    return model(model.get_indexes(array_of_triples))
def process_user_input_concept(concept_chooser):
    """Rank institutions by their model score against the chosen concepts.

    The user input is a comma-separated list of concept URIs. For every
    concept, each institution listed in ``institutions.csv`` is scored with the
    triple ``(institution, institution_related_to_concept, concept)``. The
    per-concept scores are then averaged into a mean similarity.

    Args:
        concept_chooser: Comma-separated concept URIs typed by the user.

    Returns:
        A 3-tuple ``(table_df, update, update)``: the result table sorted by
        descending mean similarity, followed by two ``gr.update(visible=True)``
        objects for the components wired as outputs.

    Raises:
        gr.Error: If no concept URI was provided.
    """
    # ndmin=2 keeps the array two-dimensional even if the file has a single row.
    all_ids_institutions = np.loadtxt(
        "institutions.csv",
        delimiter=",",
        skiprows=1,
        dtype=str,
        quotechar='"',
        ndmin=2,
    )
    # Remove duplicates based on the first column
    is_duplicate = pd.DataFrame(all_ids_institutions).duplicated(0).to_numpy()
    all_ids_institutions = all_ids_institutions[~is_duplicate]

    # Drop empty fragments and repeated URIs (order preserved) so that each
    # concept is scored once and yields exactly one column.
    chosen_concepts = list(
        dict.fromkeys(c for c in separate_concepts(concept_chooser) if c)
    )
    if not chosen_concepts:
        raise gr.Error("Please enter at least one concept URI.")
    chosen_concepts_names = [get_concept_name(concept) for concept in chosen_concepts]

    # Subject and predicate columns are the same for every concept.
    s = all_ids_institutions[:, 0]
    p = np.array(["urn:acmcmc:unis:institution_related_to_concept"] * len(s))

    all_similarities = {}
    for concept in chosen_concepts:
        o = np.array([concept] * len(s))
        array_of_triples = np.array([s, p, o]).T
        scores = get_similarities_to_node(array_of_triples, model)
        # Store a flat NumPy array so it can be stacked and used as a column.
        all_similarities[concept] = np.asarray(scores).reshape(-1)

    # Now, average the similarities (stacked shape: concepts x institutions).
    stacked_scores = np.stack(list(all_similarities.values()), axis=0)
    mean_scores = np.mean(stacked_scores, axis=0)

    # Column order: id, one column per concept, mean similarity, institution name.
    columns = {"Institution": s}
    for concept, concept_name in zip(chosen_concepts, chosen_concepts_names):
        columns[f"Similarity to {concept_name}"] = all_similarities[concept]
    columns["Mean similarity"] = mean_scores
    columns["Institution name"] = all_ids_institutions[:, 1]
    table_df = pd.DataFrame(columns)

    # Sort by mean similarity
    table_df = table_df.sort_values(by=["Mean similarity"], ascending=False)
    return (
        table_df,
        gr.update(visible=True),
        gr.update(visible=True),
    )
def calculate_emdeddings_and_pca(table):
    """Compute 2-D PCA coordinates and cluster labels for the table's institutions.

    Embeddings are fetched from the module-level ``model`` for every value in
    the ``"Institution"`` column. Both the PCA projection and the clustering
    are computed from those embeddings. A toast is shown before and after.

    Args:
        table: DataFrame with an ``"Institution"`` column of entity ids.

    Returns:
        DataFrame with columns ``"Embedding (coord 1)"``,
        ``"Embedding (coord 2)"`` and ``"Cluster"`` (``"Cluster<label>"``).
    """
    gr.Info("Performing PCA and clustering...")

    institution_ids = np.array(table["Institution"])
    institution_embeddings = model.get_embeddings(entities=institution_ids)

    # Projection first, then clustering on the full embeddings.
    coords_2d = pca(institution_embeddings)
    cluster_labels = cluster(institution_embeddings)

    result = pd.DataFrame(
        {
            "Embedding (coord 1)": coords_2d[:, 0],
            "Embedding (coord 2)": coords_2d[:, 1],
            "Cluster": "Cluster" + pd.Series(cluster_labels).astype(str),
        }
    )

    # Toast message
    gr.Info("PCA and clustering done!")
    return result
def click_on_institution(table, embeddings_var, evt: gr.SelectData):
    """Redraw the embeddings plot with the clicked institution highlighted.

    Args:
        table: DataFrame currently shown in the institutions table; must have
            ``"Institution"`` and ``"Institution name"`` columns.
        embeddings_var: State dict; its ``"embeddings_df"`` entry holds the
            DataFrame of coordinates and clusters stored when the plot was made.
        evt: Selection event; ``evt.index[0]`` is the clicked row.

    Returns:
        The figure from ``plot_embeddings``, or ``None`` when the embeddings
        have not been computed yet or the stored data cannot be used.
    """
    institution_id = table["Institution"].iloc[evt.index[0]]

    # Nothing to redraw until the embeddings plot has been generated.
    stored_df = embeddings_var.get("embeddings_df") if embeddings_var else None
    if stored_df is None:
        return None

    try:
        # Take the ids from the stored frame rather than the current table. The
        # table may have been re-sorted by a later search, which would pair
        # coordinates with the wrong institutions.
        stored_ids = stored_df["Institution"].values
        name_by_id = dict(zip(table["Institution"], table["Institution name"]))
        plot_df = pd.DataFrame(
            {
                "Institution": stored_ids,
                "Institution name": [name_by_id.get(i, i) for i in stored_ids],
                "Embedding (coord 1)": stored_df["Embedding (coord 1)"].values,
                "Embedding (coord 2)": stored_df["Embedding (coord 2)"].values,
                "Cluster": stored_df["Cluster"].values,
            }
        )
        return plot_embeddings(plot_df, institution_id)
    except (KeyError, TypeError, ValueError):
        # Stay best-effort (a failed highlight must not break the row click),
        # but log the problem instead of hiding it.
        logger.exception("Could not highlight institution %s", institution_id)
        return None
def click_on_show_plot(table):
    """Compute the embeddings plot for the institutions in ``table``.

    Args:
        table: DataFrame with ``"Institution"`` and ``"Institution name"``
            columns (the names produced by ``process_user_input_concept``).

    Returns:
        A 2-tuple ``(fig, state)``: the scatter plot figure and a dict
        ``{"embeddings_df": plot_df}`` to keep in the session state.
    """
    embeddings_df = calculate_emdeddings_and_pca(table)
    plot_df = pd.DataFrame(
        {
            "Institution": table["Institution"].values,
            # The table column is "Institution name" (lower-case "n"), and
            # plot_embeddings reads that same key.
            "Institution name": table["Institution name"].values,
            "Embedding (coord 1)": embeddings_df["Embedding (coord 1)"].values,
            "Embedding (coord 2)": embeddings_df["Embedding (coord 2)"].values,
            "Cluster": embeddings_df["Cluster"].values,
        }
    )
    # No institution is highlighted on the initial plot.
    fig = plot_embeddings(plot_df, None)
    return fig, {"embeddings_df": plot_df}
def plot_embeddings(plot_df, institution_id):
    """Draw the 2-D embeddings scatter plot, optionally highlighting one institution.

    Args:
        plot_df: DataFrame with columns ``"Institution"``,
            ``"Embedding (coord 1)"``, ``"Embedding (coord 2)"`` and
            ``"Cluster"``. ``"Institution name"`` is read only when an
            institution is highlighted.
        institution_id: Value of ``"Institution"`` to highlight, or ``None``
            (or any id not present) to highlight nothing.

    Returns:
        The matplotlib figure containing the plot.
    """
    # Build the figure without pyplot so it is not kept in pyplot's global
    # figure registry; in a long-running app those figures are never released.
    from matplotlib.figure import Figure

    fig = Figure(figsize=(12, 12))
    ax = fig.subplots()
    # Kept from the original; presumably for reproducible layout -- TODO confirm it is still needed.
    np.random.seed(0)
    sns.scatterplot(
        data=plot_df,
        x="Embedding (coord 1)",
        y="Embedding (coord 2)",
        hue="Cluster",
        ax=ax,
    )

    row_of_institution = plot_df[plot_df["Institution"] == institution_id]
    if not row_of_institution.empty:
        # Pass scalars to matplotlib; implicit conversion of a one-row Series
        # to a number is deprecated in pandas.
        x_coord = row_of_institution["Embedding (coord 1)"].iloc[0]
        y_coord = row_of_institution["Embedding (coord 2)"].iloc[0]
        ax.text(
            x_coord,
            y_coord,
            row_of_institution["Institution name"].values[0],
            horizontalalignment="left",
            size="medium",
            color="black",
            weight="normal",
        )
        # Also draw a point for the institution
        ax.scatter(
            x_coord,
            y_coord,
            color="black",
            s=100,
            marker="x",
        )
    return fig
def get_authors_of_institution(institutions_table, concept_chooser, evt: gr.SelectData):
    """Get the authors of the clicked institution who wrote about the chosen concepts.

    For each concept, the module-level graph ``g`` is queried for authors of
    articles written in the institution and related to that concept. The
    per-concept article counts are summed per author.

    Args:
        institutions_table: DataFrame with an ``"Institution"`` column.
        concept_chooser: Comma-separated concept URIs typed by the user.
        evt: Selection event; ``evt.index[0]`` is the clicked row.

    Returns:
        A 2-tuple ``(results_df, update)``: a DataFrame with columns
        ``author``, ``name`` and ``num_articles`` sorted by descending
        ``num_articles``, and a ``gr.update(visible=True)`` object.
    """
    columns = ["author", "name", "num_articles"]
    number_of_row = evt.index[0]
    institution = institutions_table["Institution"].iloc[number_of_row]
    concepts = separate_concepts(concept_chooser)

    results_dfs = []
    for concept in concepts:
        if not concept:
            # An empty fragment would produce an invalid `<>` IRI in the query.
            continue
        # NOTE(review): the institution and concept URIs are spliced directly
        # into the SPARQL text, and the concept comes from user input.
        # Consider validating the URIs or binding them as query variables.
        # Create a dataframe of the authors and the number of articles they have written for each concept
        result = g.query(
            f"""SELECT ?author ?name (COUNT (?article) AS ?num_articles)
WHERE {{
?author a <urn:acmcmc:unis:Author> .
?author <urn:acmcmc:unis:name> ?name .
?article <urn:acmcmc:unis:written_in_institution> <{institution}> .
?article <urn:acmcmc:unis:has_author> ?author .
?article <urn:acmcmc:unis:related_to_concept> <{concept}> .
}}
GROUP BY ?author ?name
ORDER BY DESC(COUNT (?article))
"""
        )
        # Convert the RDF terms to plain Python values so the counts add up
        # numerically. Passing `columns` keeps an empty result well-formed.
        rows = [(str(author), str(name), int(count)) for author, name, count in result]
        results_dfs.append(pd.DataFrame(rows, columns=columns))

    if results_dfs:
        # Now, aggregate the results into a single dataframe by summing the number of articles
        results_df = pd.concat(results_dfs, ignore_index=True)
        results_df = (
            results_df.groupby(["author", "name"], as_index=False)["num_articles"].sum()
        )
    else:
        results_df = pd.DataFrame(columns=columns)

    # Sort by number of articles
    results_df = results_df.sort_values(by=["num_articles"], ascending=False)
    return results_df, gr.update(visible=True)
# %%
# Gradio user interface.
# NOTE(review): the component nesting below was reconstructed from a source
# whose indentation was lost -- confirm the grouping against the deployed layout.
theme = gr.themes.Default(primary_hue="cyan", secondary_hue="fuchsia")
with gr.Blocks(theme=theme) as demo:
    # Per-session state: holds {"embeddings_df": DataFrame} once the plot is computed.
    embeddings_df = gr.State({})
    # App title and description
    title = gr.Markdown(
        """
# Universities Explorer
This app allows you to explore the institutions more closely related to a concept.
It uses embeddings of institutions and concepts to calculate the similarity between them. The embedding model, [ComplEx](https://doi.org/10.48550/arXiv.1606.06357), was trained using the [AmpliGraph](https://github.com/Accenture/AmpliGraph) library. The data comes from the [OpenAlex](https://openalex.org/) dataset, which contains information about scientific articles, authors, institutions, and concepts.
"""
    )
    with gr.Group() as institution_search:
        concept_chooser = gr.Textbox(
            label="Concept URI",
            info="Using OpenAlex, find the URI of the concept you want to search for. For example, the URI of the concept 'Knowledge Graph' is https://openalex.org/C2987255567, while the URI of the concept 'Natural Language Processing' is https://openalex.org/C204321447. You can find the URI of a concept by searching for it on OpenAlex and copying the URL from the address bar. You can also search for multiple concepts by separating them with a comma.",
            placeholder="https://openalex.org/C2987255567, https://openalex.org/C204321447",
            value="https://openalex.org/C2987255567, https://openalex.org/C204321447",
        )
        concept_name_label = gr.Markdown("Concept name: ", visible=False)
        # Table for name of institution and similarity to concept
        btn_search_institutions = gr.Button("Search institutions", variant="primary")
        table = gr.Dataframe(
            interactive=False, visible=False, elem_classes="institutions", wrap=True
        )
        # Reveal the table as soon as a search is started.
        btn_search_institutions.click(
            lambda: gr.update(visible=True), outputs=[table], queue=True
        )
    btn_plot_embeddings = gr.Button(
        "Plot embeddings", variant="primary", visible=False, elem_classes="embeddings"
    )
    # Description of what plot embeddings does
    plot_embeddings_info = gr.Markdown(
        """
This button will plot the embeddings of the institutions related to the concept. The embeddings are calculated using the trained model and then reduced to 2 dimensions using PCA. The institutions are then clustered using KMeans.
Running this may take a while, as we need to calculate the embeddings for all institutions and then perform PCA and clustering.
""",
        visible=False,
    )
    # Fill the table and reveal the plot button and its description.
    btn_search_institutions.click(
        process_user_input_concept,
        inputs=[concept_chooser],
        outputs=[
            table,
            btn_plot_embeddings,
            plot_embeddings_info,
        ],
        queue=True,
    )
    plot = gr.Plot(visible=False, elem_classes="embeddings")
    btn_plot_embeddings.click(
        lambda: gr.update(visible=True), outputs=[plot], queue=True
    )
    btn_plot_embeddings.click(
        click_on_show_plot,
        inputs=[table],
        outputs=[plot, embeddings_df],
        queue=True,
    )
    # When the user selects a row in the table, get the authors of that institution and display them in a dataframe
    with gr.Group(visible=False, elem_classes="authors") as authors:
        table_authors = gr.Dataframe(
            interactive=False, label="Authors in institution writing about concept"
        )
    # get_authors_of_institution returns (dataframe, visibility update). The
    # second value targets the hidden `authors` group, which would otherwise
    # never be revealed.
    table.select(
        get_authors_of_institution,
        inputs=[table, concept_chooser],
        outputs=[table_authors, authors],
    )
    table.select(
        click_on_institution,
        inputs=[table, embeddings_df],
        outputs=[plot],
    )
    btn_clear = gr.ClearButton(components=[table, plot, table_authors])
    # Author information
    author_info = gr.Markdown(
        """
This demo has been built by [Aldan Creo](
https://acmc-website.web.app/).
"""
    )
demo.queue()
demo.launch()