# Spaces: Running
# %%
import logging

import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import rdflib
import seaborn as sns
import tensorflow as tf
from adjustText import adjust_text
from ampligraph.latent_features import ScoringBasedEmbeddingModel
from ampligraph.utils import restore_model
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

logger = logging.getLogger(__name__)
# --- Knowledge graph ---------------------------------------------------------
# Namespace prefix bound to "unis" in the graph below.
uri = "urn:acmcmc:unis:"
unis = rdflib.Namespace(uri)

# Time how long it takes to create the graph and parse the Turtle file.
start_time = tf.timestamp()
g = rdflib.Graph()
g.bind("unis", unis)
g.parse("universities.ttl", format="turtle")
end_time = tf.timestamp()
logger.info("Graph loaded in {} seconds".format(end_time - start_time))

# --- Embedding model ---------------------------------------------------------
# Alternative loading path kept for reference: restore_model("model.pkl")
# Time how long it takes to rebuild the model from its metadata and weights.
start_time = tf.timestamp()
model = ScoringBasedEmbeddingModel(k=150, eta=10, scoring_type="ComplEx")
model.load_metadata("model/model")
model.build_full_model()
# Resolve load_weights on the class that follows ScoringBasedEmbeddingModel in
# the MRO, skipping any override defined on ScoringBasedEmbeddingModel itself.
super(ScoringBasedEmbeddingModel, model).load_weights("model/")
end_time = tf.timestamp()
logger.info("Model loaded in {} seconds".format(end_time - start_time))
def separate_concepts(concepts):
    """Split a comma-separated string of concepts into a list.

    Each entry is stripped of surrounding whitespace. Entries that are empty
    after stripping (from a trailing comma, a doubled comma, or an empty
    input string) are dropped so callers never receive an empty concept.

    Args:
        concepts: Comma-separated concept identifiers.

    Returns:
        The non-empty, stripped entries in their original order.
    """
    stripped_parts = (part.strip() for part in concepts.split(","))
    return [part for part in stripped_parts if part]
def pca(embeddings):
    """Fit a 2-component PCA on ``embeddings`` and return their projection."""
    reducer = PCA(n_components=2)
    # fit() returns the fitted estimator, so the transform can be chained.
    return reducer.fit(embeddings).transform(embeddings)
def cluster(embeddings, n_clusters=6):
    """Assign each embedding to a KMeans cluster.

    Args:
        embeddings: Sequence of embedding vectors, one per sample.
        n_clusters: Number of clusters requested (defaults to 6, the value
            previously hard-coded). It is reduced to the number of samples
            when fewer samples than clusters are supplied.

    Returns:
        The cluster label of each sample, as returned by
        ``KMeans.fit_predict``.
    """
    # KMeans cannot be fitted with more clusters than samples, so clamp the
    # request instead of failing on small inputs.
    effective_clusters = min(n_clusters, len(embeddings))
    clustering_algorithm = KMeans(
        n_clusters=effective_clusters, n_init=50, max_iter=500, random_state=0
    )
    return clustering_algorithm.fit_predict(embeddings)
def get_concept_name(concept_uri):
    """Get the name of the concept from the URI.

    Looks up the ``urn:acmcmc:unis:name`` value of ``concept_uri`` in the
    module-level graph ``g``.

    Args:
        concept_uri: URI of the concept, as plain text.

    Returns:
        The ``?name`` value of the first result row.

    Raises:
        KeyError: If the graph holds no name for ``concept_uri``.
        Exception: Raised by rdflib's ``URIRef.n3()`` when ``concept_uri``
            contains characters that are not valid inside a URI.
    """
    # Serialise through URIRef.n3() rather than pasting the raw text between
    # angle brackets: rdflib rejects URIs containing characters such as
    # '<', '>' or spaces, so the input cannot change the shape of the query.
    concept_term = rdflib.URIRef(concept_uri).n3()
    results = g.query(
        f"""SELECT ?name
        WHERE {{
            {concept_term} <urn:acmcmc:unis:name> ?name .
        }}"""
    )
    for row in results:
        return row[0]
    raise KeyError(f"No name found in the graph for concept <{concept_uri}>")
def get_similarities_to_node(array_of_triples, model):
    """
    Score a batch of triples with the given model.

    The triples are first translated with ``model.get_indexes`` and the
    result is then passed to the model by calling it directly. Whatever the
    model call returns is handed back unchanged (presumably one score per
    triple -- confirm against the model's documentation).
    """
    return model(model.get_indexes(array_of_triples))
def process_user_input_concept(concept_chooser):
    """Rank institutions by their model score against the chosen concepts.

    The user input is a comma-separated list of concept URIs. For every
    concept, each institution listed in ``institutions.csv`` is scored with
    the triple ``(institution, institution_related_to_concept, concept)``,
    and the per-concept scores are averaged into a mean similarity.

    Args:
        concept_chooser: Comma-separated concept URIs typed by the user.

    Returns:
        A tuple ``(table_df, update, update)``: the results table sorted by
        descending mean similarity, followed by two
        ``gr.update(visible=True)`` objects for the components wired as
        additional outputs of this handler.

    Raises:
        gr.Error: If the input contains no concept URI.
    """
    chosen_concepts = [c for c in separate_concepts(concept_chooser) if c]
    if not chosen_concepts:
        raise gr.Error("Please enter at least one concept URI.")

    # ndmin=2 keeps the array two-dimensional even if the file has one row.
    all_ids_institutions = np.loadtxt(
        "institutions.csv",
        delimiter=",",
        skiprows=1,
        dtype=str,
        quotechar='"',
        ndmin=2,
    )
    # Remove duplicates based on the first column
    is_duplicate = pd.DataFrame(all_ids_institutions).duplicated(0).to_numpy()
    all_ids_institutions = all_ids_institutions[~is_duplicate]
    institution_ids = all_ids_institutions[:, 0]

    chosen_concepts_names = [get_concept_name(concept) for concept in chosen_concepts]

    predicate = np.array(
        ["urn:acmcmc:unis:institution_related_to_concept"] * len(institution_ids)
    )
    all_similarities = {}
    for concept in chosen_concepts:
        objects = np.array([concept] * len(institution_ids))
        array_of_triples = np.array([institution_ids, predicate, objects]).T
        scores = get_similarities_to_node(array_of_triples, model)
        # Store plain 1-D NumPy arrays so they can be stacked and used as
        # DataFrame columns regardless of the type the model returns.
        all_similarities[concept] = np.asarray(scores).reshape(-1)

    # Average the per-concept scores (one row per concept, then mean over rows)
    stacked_scores = np.stack(list(all_similarities.values()), axis=0)
    mean_scores = np.mean(stacked_scores, axis=0)

    # Build the columns directly in display order: institution id, individual
    # similarities, mean similarity, institution name.
    columns = {"Institution": institution_ids}
    for concept, concept_name in zip(chosen_concepts, chosen_concepts_names):
        columns[f"Similarity to {concept_name}"] = all_similarities[concept]
    columns["Mean similarity"] = mean_scores
    columns["Institution name"] = all_ids_institutions[:, 1]
    # "num_articles" (third CSV column) is currently not shown.

    table_df = pd.DataFrame(columns).sort_values(
        by=["Mean similarity"], ascending=False
    )
    return (
        table_df,
        gr.update(visible=True),
        gr.update(visible=True),
    )
def calculate_emdeddings_and_pca(table):
    """Build the 2-D plotting frame for the institutions listed in ``table``.

    Fetches the model embeddings of every entry in ``table["Institution"]``,
    projects them with ``pca`` and labels them with ``cluster``. Toast
    messages are shown before and after the computation.

    Returns:
        A DataFrame with the columns "Embedding (coord 1)",
        "Embedding (coord 2)" and "Cluster" (labels formatted "Cluster<n>").
    """
    gr.Info("Performing PCA and clustering...")

    institution_ids = np.array(table["Institution"])
    institution_vectors = model.get_embeddings(entities=institution_ids)

    # Dimensionality reduction first, then clustering on the full embeddings.
    projected = pca(institution_vectors)
    labels = cluster(institution_vectors)
    cluster_names = "Cluster" + pd.Series(labels).astype(str)

    plot_df = pd.DataFrame(
        {
            "Embedding (coord 1)": projected[:, 0],
            "Embedding (coord 2)": projected[:, 1],
            "Cluster": cluster_names,
        }
    )

    gr.Info("PCA and clustering done!")
    return plot_df
def click_on_institution(table, embeddings_var, evt: gr.SelectData):
    """Redraw the embeddings plot with the clicked institution highlighted.

    Args:
        table: The institutions table currently displayed.
        embeddings_var: State dictionary; holds the plotted frame under the
            key "embeddings_df" once the embeddings have been computed.
        evt: Selection event; ``evt.index[0]`` is used as the row position.

    Returns:
        The figure produced by ``plot_embeddings``, or ``None`` when the
        embeddings have not been computed yet or the plot cannot be rebuilt.
    """
    # Positional lookup: the event carries a row position, not an index label.
    institution_id = table["Institution"].iloc[evt.index[0]]

    embeddings_df = (embeddings_var or {}).get("embeddings_df")
    if embeddings_df is None:
        # Nothing has been plotted yet, so there is nothing to highlight.
        return None

    try:
        plot_df = pd.DataFrame(
            {
                "Institution": table["Institution"].values,
                "Institution name": table["Institution name"].values,
                "Embedding (coord 1)": embeddings_df["Embedding (coord 1)"].values,
                "Embedding (coord 2)": embeddings_df["Embedding (coord 2)"].values,
                "Cluster": embeddings_df["Cluster"].values,
            }
        )
        return plot_embeddings(plot_df, institution_id)
    except Exception:
        # Stay best-effort for the UI, but record the failure instead of
        # silently discarding it.
        logger.exception("Could not highlight institution %s", institution_id)
        return None
def click_on_show_plot(table):
    """Compute the embeddings plot for the institutions in ``table``.

    Args:
        table: The institutions table; its "Institution" and
            "Institution name" columns are read.

    Returns:
        A tuple ``(fig, state)`` where ``fig`` is the figure with no
        institution highlighted and ``state`` is ``{"embeddings_df": plot_df}``
        holding the frame that was plotted.
    """
    embeddings_df = calculate_emdeddings_and_pca(table)
    plot_df = pd.DataFrame(
        {
            "Institution": table["Institution"].values,
            # The table column is spelled "Institution name" (lower-case n);
            # the same key is what plot_embeddings reads when labelling.
            "Institution name": table["Institution name"].values,
            "Embedding (coord 1)": embeddings_df["Embedding (coord 1)"].values,
            "Embedding (coord 2)": embeddings_df["Embedding (coord 2)"].values,
            "Cluster": embeddings_df["Cluster"].values,
        }
    )
    fig = plot_embeddings(plot_df, None)
    return fig, {"embeddings_df": plot_df}
def plot_embeddings(plot_df, institution_id):
    """Draw the 2-D embeddings scatter plot, optionally highlighting one row.

    Args:
        plot_df: Frame with the columns "Institution", "Embedding (coord 1)",
            "Embedding (coord 2)" and "Cluster"; "Institution name" is also
            read when an institution is highlighted.
        institution_id: Value of "Institution" to highlight, or ``None`` to
            draw the plain scatter plot.

    Returns:
        The matplotlib figure containing the plot.
    """
    # Create the axes explicitly and hand them to seaborn, rather than
    # relying on pyplot's implicit "current axes".
    fig, ax = plt.subplots(figsize=(12, 12))
    sns.scatterplot(
        data=plot_df,
        x="Embedding (coord 1)",
        y="Embedding (coord 2)",
        hue="Cluster",
        ax=ax,
    )

    row_of_institution = plot_df[plot_df["Institution"] == institution_id]
    if not row_of_institution.empty:
        # Take scalar coordinates from the first matching row instead of
        # passing one-element Series to matplotlib.
        selected = row_of_institution.iloc[0]
        x_coord = selected["Embedding (coord 1)"]
        y_coord = selected["Embedding (coord 2)"]
        ax.text(
            x_coord,
            y_coord,
            selected["Institution name"],
            horizontalalignment="left",
            size="medium",
            color="black",
            weight="normal",
        )
        # Also draw a point for the institution
        ax.scatter(x_coord, y_coord, color="black", s=100, marker="x")

    # Unregister the figure from pyplot so repeated calls do not accumulate
    # open figures; the returned Figure object itself remains usable.
    plt.close(fig)
    return fig
def get_authors_of_institution(institutions_table, concept_chooser, evt: gr.SelectData):
    """Get the authors of the selected institution for the chosen concepts.

    For each concept, the module-level graph ``g`` is queried for the authors
    of articles written in the selected institution and related to that
    concept. The per-concept article counts are then summed per author.

    Args:
        institutions_table: The institutions table; the "Institution" value
            at row position ``evt.index[0]`` is the selected institution.
        concept_chooser: Comma-separated concept URIs typed by the user.
        evt: Selection event of the table.

    Returns:
        A tuple ``(results_df, update)``: a DataFrame with the columns
        "author", "name" and "num_articles" sorted by descending number of
        articles, and a ``gr.update(visible=True)`` object.
    """
    columns = ["author", "name", "num_articles"]
    number_of_row = evt.index[0]
    institution = institutions_table["Institution"].iloc[number_of_row]
    # URIRef.n3() renders "<uri>" and rejects characters that are not valid
    # inside a URI, so neither value can change the shape of the query.
    institution_term = rdflib.URIRef(str(institution)).n3()

    results_dfs = []
    for concept in separate_concepts(concept_chooser):
        if not concept:
            continue
        concept_term = rdflib.URIRef(concept).n3()
        # Authors and the number of articles they have written for this concept
        result = g.query(
            f"""SELECT ?author ?name (COUNT (?article) AS ?num_articles)
            WHERE {{
                ?author a <urn:acmcmc:unis:Author> .
                ?author <urn:acmcmc:unis:name> ?name .
                ?article <urn:acmcmc:unis:written_in_institution> {institution_term} .
                ?article <urn:acmcmc:unis:has_author> ?author .
                ?article <urn:acmcmc:unis:related_to_concept> {concept_term} .
            }}
            GROUP BY ?author ?name
            ORDER BY DESC(COUNT (?article))
            """
        )
        # Convert to plain Python values so the counts can be summed as ints.
        rows = [(str(row[0]), str(row[1]), int(row[2])) for row in result]
        # Passing the column names explicitly keeps empty results well-formed.
        results_dfs.append(pd.DataFrame(rows, columns=columns))

    if results_dfs:
        results_df = pd.concat(results_dfs, ignore_index=True)
    else:
        results_df = pd.DataFrame(columns=columns)

    # Aggregate into a single dataframe by summing the number of articles
    results_df = (
        results_df.groupby(["author", "name"], as_index=False)["num_articles"]
        .sum()
        .sort_values(by=["num_articles"], ascending=False)
    )
    return results_df, gr.update(visible=True)
# %%
theme = gr.themes.Default(primary_hue="cyan", secondary_hue="fuchsia")

# NOTE(review): the nesting of the layout blocks below is reconstructed; confirm
# which components belong inside each gr.Group.
with gr.Blocks(theme=theme) as demo:
    # Holds {"embeddings_df": <DataFrame>} once the embeddings plot is computed.
    embeddings_df = gr.State({})

    # App title and description
    title = gr.Markdown(
        """
        # Universities Explorer
        This app allows you to explore the institutions more closely related to a concept.
        It uses embeddings of institutions and concepts to calculate the similarity between them. The embedding model, [ComplEx](https://doi.org/10.48550/arXiv.1606.06357), was trained using the [AmpliGraph](https://github.com/Accenture/AmpliGraph) library. The data comes from the [OpenAlex](https://openalex.org/) dataset, which contains information about scientific articles, authors, institutions, and concepts.
        """
    )

    with gr.Group() as institution_search:
        concept_chooser = gr.Textbox(
            label="Concept URI",
            info="Using OpenAlex, find the URI of the concept you want to search for. For example, the URI of the concept 'Knowledge Graph' is https://openalex.org/C2987255567, while the URI of the concept 'Natural Language Processing' is https://openalex.org/C204321447. You can find the URI of a concept by searching for it on OpenAlex and copying the URL from the address bar. You can also search for multiple concepts by separating them with a comma.",
            placeholder="https://openalex.org/C2987255567, https://openalex.org/C204321447",
            value="https://openalex.org/C2987255567, https://openalex.org/C204321447",
        )
        concept_name_label = gr.Markdown("Concept name: ", visible=False)

        # Table for name of institution and similarity to concept
        btn_search_institutions = gr.Button("Search institutions", variant="primary")
        table = gr.Dataframe(
            interactive=False, visible=False, elem_classes="institutions", wrap=True
        )
        # Reveal the table as soon as a search is started.
        btn_search_institutions.click(
            lambda: gr.update(visible=True), outputs=[table], queue=True
        )

        btn_plot_embeddings = gr.Button(
            "Plot embeddings", variant="primary", visible=False, elem_classes="embeddings"
        )
        # Description of what plot embeddings does
        plot_embeddings_info = gr.Markdown(
            """
            This button will plot the embeddings of the institutions related to the concept. The embeddings are calculated using the trained model and then reduced to 2 dimensions using PCA. The institutions are then clustered using KMeans.
            Running this may take a while, as we need to calculate the embeddings for all institutions and then perform PCA and clustering.
            """,
            visible=False,
        )
        # The handler returns three values: the table and two visibility updates.
        btn_search_institutions.click(
            process_user_input_concept,
            inputs=[concept_chooser],
            outputs=[
                table,
                btn_plot_embeddings,
                plot_embeddings_info,
                # concept_name_label,
            ],
            queue=True,
        )

        plot = gr.Plot(visible=False, elem_classes="embeddings")
        btn_plot_embeddings.click(
            lambda: gr.update(visible=True), outputs=[plot], queue=True
        )
        btn_plot_embeddings.click(
            click_on_show_plot,
            inputs=[table],
            outputs=[plot, embeddings_df],
            queue=True,
        )

    # When the user selects a row in the table, get the authors of that institution and display them in a dataframe
    with gr.Group(visible=False, elem_classes="authors") as authors:
        table_authors = gr.Dataframe(
            interactive=False, label="Authors in institution writing about concept"
        )
    # get_authors_of_institution returns (dataframe, visibility update): the
    # second value reveals the initially hidden authors group.
    table.select(
        get_authors_of_institution,
        inputs=[table, concept_chooser],
        outputs=[table_authors, authors],
    )
    table.select(
        click_on_institution,
        inputs=[table, embeddings_df],
        outputs=[plot],
    )

    btn_clear = gr.ClearButton(components=[table, plot, table_authors])

    # Author information
    author_info = gr.Markdown(
        """
        This demo has been built by [Aldan Creo](
        https://acmc-website.web.app/).
        """
    )

demo.queue()
demo.launch()