# %%
"""Universities Explorer: a Gradio app ranking institutions by relatedness to concepts.

The app loads an RDF graph (``universities.ttl``) and a trained ComplEx
embedding model, scores ``(institution, related_to_concept, concept)`` triples
for every institution listed in ``institutions.csv``, and lets the user plot
the institution embeddings and list an institution's authors.
"""
import logging
import re
import time

import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import rdflib
import seaborn as sns
import tensorflow as tf
from adjustText import adjust_text
from ampligraph.latent_features import ScoringBasedEmbeddingModel
from ampligraph.utils import restore_model
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

logger = logging.getLogger(__name__)

# Predicate of the triples scored to rank institutions against a concept.
RELATED_TO_CONCEPT_PREDICATE = "urn:acmcmc:unis:institution_related_to_concept"

# Characters that may not appear inside a SPARQL IRIREF (``<...>``). URIs that
# come from the UI are checked against this before being interpolated into a
# query, so a crafted value cannot break out of the angle brackets.
_INVALID_IRI_CHARS = re.compile(r'[<>"{}|^`\\\x00-\x20]')

# Load the RDF graph, timing how long it takes.
start_time = time.perf_counter()
g = rdflib.Graph()
uri = "urn:acmcmc:unis:"
unis = rdflib.Namespace(uri)
g.bind("unis", unis)
g.parse("universities.ttl", format="turtle")
end_time = time.perf_counter()
logger.info("Graph loaded in %.2f seconds", end_time - start_time)

# model = restore_model("model.pkl")
# Load the embedding model, timing how long it takes.
start_time = time.perf_counter()
model = ScoringBasedEmbeddingModel(k=150, eta=10, scoring_type="ComplEx")
model.load_metadata("model/model")
model.build_full_model()
# Deliberately skips ScoringBasedEmbeddingModel.load_weights and calls the
# next implementation in the MRO instead.
# NOTE(review): presumably the plain Keras weight loader - confirm.
super(ScoringBasedEmbeddingModel, model).load_weights("model/")
end_time = time.perf_counter()
logger.info("Model loaded in %.2f seconds", end_time - start_time)


def _require_valid_iri(value):
    """Return *value* as a string if it is safe to embed in ``<...>`` in SPARQL.

    Raises:
        gr.Error: if the value is empty or contains a character that is not
            allowed inside a SPARQL IRI reference.
    """
    text = str(value)
    if not text or _INVALID_IRI_CHARS.search(text):
        raise gr.Error(f"Invalid URI: {text!r}")
    return text


def separate_concepts(concepts):
    """Split a comma-separated string of concept URIs into a list.

    Surrounding whitespace is stripped from every item and empty items (for
    example the one produced by a trailing comma) are dropped.
    """
    stripped = (piece.strip() for piece in concepts.split(","))
    return [piece for piece in stripped if piece]


def pca(embeddings):
    """Project *embeddings* onto their first two principal components."""
    reducer = PCA(n_components=2)
    return reducer.fit_transform(embeddings)


def cluster(embeddings, n_clusters=6):
    """Assign a KMeans cluster label to every row of *embeddings*.

    Args:
        embeddings: 2-D array with one embedding per row.
        n_clusters: desired number of clusters; reduced to the number of rows
            when there are fewer rows than clusters, since KMeans rejects that.

    Returns:
        Array of integer cluster labels, one per row.
    """
    n_clusters = max(1, min(n_clusters, len(embeddings)))
    clustering_algorithm = KMeans(
        n_clusters=n_clusters, n_init=50, max_iter=500, random_state=0
    )
    return clustering_algorithm.fit_predict(embeddings)


def get_concept_name(concept_uri):
    """Return the name of the concept identified by *concept_uri*.

    The first binding of ``?name`` returned by the graph query is used. When
    the query yields no rows the URI itself is returned, so callers always get
    a usable label.

    Raises:
        gr.Error: if *concept_uri* is not a valid IRI.
    """
    concept_uri = _require_valid_iri(concept_uri)
    # NOTE(review): the triple pattern below has no predicate between the
    # subject and ?name; it looks like it was lost at some point - confirm the
    # intended predicate and restore it.
    results = g.query(
        f"""SELECT ?name WHERE {{
            <{concept_uri}> ?name .
        }}"""
    )
    for row in results:
        return row[0]
    return concept_uri


def get_similarities_to_node(array_of_triples, model):
    """Score an array of ``(subject, predicate, object)`` triples with *model*.

    The triples are mapped to the model's internal indexes and passed to the
    model, whose output (one score per triple) is returned unchanged.
    """
    indexes = model.get_indexes(array_of_triples)
    return model(indexes)


def process_user_input_concept(concept_chooser):
    """Rank all institutions by their similarity to the chosen concepts.

    Args:
        concept_chooser: comma-separated concept URIs typed by the user.

    Returns:
        A 3-tuple: the ranking table (institution id, one similarity column
        per concept, the mean similarity, and the institution name; sorted by
        mean similarity, highest first) followed by two ``gr.update`` objects
        that make the plot button and its description visible.

    Raises:
        gr.Error: if no concept URI was provided or a URI is invalid.
    """
    chosen_concepts = separate_concepts(concept_chooser)
    if not chosen_concepts:
        raise gr.Error("Please enter at least one concept URI.")
    chosen_concepts_names = [get_concept_name(concept) for concept in chosen_concepts]

    # ndmin=2 keeps the array two-dimensional even for a single-row file.
    all_ids_institutions = np.loadtxt(
        "institutions.csv",
        delimiter=",",
        skiprows=1,
        dtype=str,
        quotechar='"',
        ndmin=2,
    )
    # Remove duplicates based on the first column (the institution id).
    is_duplicate = pd.DataFrame(all_ids_institutions).duplicated(0).to_numpy()
    all_ids_institutions = all_ids_institutions[~is_duplicate]

    institution_ids = all_ids_institutions[:, 0]
    predicates = np.full(len(institution_ids), RELATED_TO_CONCEPT_PREDICATE)

    all_similarities = {}
    for concept in chosen_concepts:
        objects = np.full(len(institution_ids), concept)
        array_of_triples = np.array([institution_ids, predicates, objects]).T
        scores = get_similarities_to_node(array_of_triples, model)
        # Store a flat NumPy array so it can be used as a DataFrame column.
        all_similarities[concept] = np.asarray(scores).reshape(-1)

    # Average the per-concept similarities for every institution.
    mean_scores = np.mean(np.stack(list(all_similarities.values()), axis=0), axis=0)

    # Column order: id, individual similarities, mean similarity, name.
    table_df = pd.DataFrame({"Institution": institution_ids})
    for concept_name, concept in zip(chosen_concepts_names, chosen_concepts):
        table_df[f"Similarity to {concept_name}"] = all_similarities[concept]
    table_df["Mean similarity"] = mean_scores
    table_df["Institution name"] = all_ids_institutions[:, 1]

    table_df = table_df.sort_values(by=["Mean similarity"], ascending=False)

    return (
        table_df,
        gr.update(visible=True),
        gr.update(visible=True),
    )


def calculate_emdeddings_and_pca(table):
    """Compute 2-D PCA coordinates and cluster labels for the table's institutions.

    Args:
        table: DataFrame with an ``"Institution"`` column of entity ids.

    Returns:
        DataFrame with columns ``"Embedding (coord 1)"``,
        ``"Embedding (coord 2)"`` and ``"Cluster"``, one row per institution in
        the same order as *table*.
    """
    gr.Info("Performing PCA and clustering...")

    embeddings_of_institutions = model.get_embeddings(
        entities=np.array(table["Institution"])
    )
    entity_embeddings_pca = pca(embeddings_of_institutions)
    # Clustering runs on the full embeddings, not on the 2-D projection.
    clusters = cluster(embeddings_of_institutions)

    plot_df = pd.DataFrame(
        {
            "Embedding (coord 1)": entity_embeddings_pca[:, 0],
            "Embedding (coord 2)": entity_embeddings_pca[:, 1],
            "Cluster": "Cluster" + pd.Series(clusters).astype(str),
        }
    )

    gr.Info("PCA and clustering done!")
    return plot_df


def click_on_institution(table, embeddings_var, evt: gr.SelectData):
    """Redraw the embeddings plot with the clicked institution highlighted.

    Args:
        table: the institutions ranking table currently displayed.
        embeddings_var: state dict; holds the plotted data under
            ``"embeddings_df"`` once the plot has been generated.
        evt: selection event; ``evt.index[0]`` is the clicked row.

    Returns:
        The new figure, or ``None`` when the embeddings have not been computed
        yet or no longer match the table.
    """
    embeddings_df = (embeddings_var or {}).get("embeddings_df")
    if embeddings_df is None:
        # The plot has not been generated yet, so there is nothing to highlight.
        return None

    institution_id = table["Institution"].iloc[evt.index[0]]
    try:
        plot_df = pd.DataFrame(
            {
                "Institution": table["Institution"].values,
                "Institution name": table["Institution name"].values,
                "Embedding (coord 1)": embeddings_df["Embedding (coord 1)"].values,
                "Embedding (coord 2)": embeddings_df["Embedding (coord 2)"].values,
                "Cluster": embeddings_df["Cluster"].values,
            }
        )
    except (KeyError, ValueError):
        # A missing column, or a table whose length differs from the stored
        # embeddings (e.g. a new search was run after plotting).
        logger.exception("Stored embeddings do not match the institutions table")
        return None
    return plot_embeddings(plot_df, institution_id)


def click_on_show_plot(table):
    """Compute the embeddings plot for every institution in *table*.

    Returns:
        A 2-tuple: the figure, and the state dict ``{"embeddings_df": plot_df}``
        that ``click_on_institution`` reads later.
    """
    embeddings_df = calculate_emdeddings_and_pca(table)
    plot_df = pd.DataFrame(
        {
            "Institution": table["Institution"].values,
            "Institution name": table["Institution name"].values,
            "Embedding (coord 1)": embeddings_df["Embedding (coord 1)"].values,
            "Embedding (coord 2)": embeddings_df["Embedding (coord 2)"].values,
            "Cluster": embeddings_df["Cluster"].values,
        }
    )
    fig = plot_embeddings(plot_df, None)
    return fig, {"embeddings_df": plot_df}


def plot_embeddings(plot_df, institution_id):
    """Scatter-plot the 2-D embeddings coloured by cluster.

    Args:
        plot_df: DataFrame with ``"Institution"``, ``"Institution name"``,
            ``"Embedding (coord 1)"``, ``"Embedding (coord 2)"`` and
            ``"Cluster"`` columns.
        institution_id: id of the institution to label and mark with a cross,
            or ``None`` to highlight nothing.

    Returns:
        The matplotlib figure.
    """
    fig = plt.figure(figsize=(12, 12))
    np.random.seed(0)
    ax = sns.scatterplot(
        data=plot_df,
        x="Embedding (coord 1)",
        y="Embedding (coord 2)",
        hue="Cluster",
    )

    row_of_institution = plot_df[plot_df["Institution"] == institution_id]
    if not row_of_institution.empty:
        first_match = row_of_institution.iloc[0]
        # Text coordinates must be scalars, not one-element Series.
        ax.text(
            first_match["Embedding (coord 1)"],
            first_match["Embedding (coord 2)"],
            first_match["Institution name"],
            horizontalalignment="left",
            size="medium",
            color="black",
            weight="normal",
        )
        # Also draw a point for the institution.
        ax.scatter(
            row_of_institution["Embedding (coord 1)"],
            row_of_institution["Embedding (coord 2)"],
            color="black",
            s=100,
            marker="x",
        )

    return fig


def get_authors_of_institution(institutions_table, concept_chooser, evt: gr.SelectData):
    """List the authors of the clicked institution who wrote about the concepts.

    Args:
        institutions_table: the institutions ranking table currently displayed.
        concept_chooser: comma-separated concept URIs typed by the user.
        evt: selection event; ``evt.index[0]`` is the clicked row.

    Returns:
        A 2-tuple: a DataFrame with columns ``author``, ``name`` and
        ``num_articles`` (article counts summed over all concepts, sorted
        descending), and a ``gr.update`` that makes the authors group visible.

    Raises:
        gr.Error: if the institution id or a concept URI is not a valid IRI.
    """
    institution = _require_valid_iri(
        institutions_table["Institution"].iloc[evt.index[0]]
    )
    concepts = [
        _require_valid_iri(concept) for concept in separate_concepts(concept_chooser)
    ]

    columns = ["author", "name", "num_articles"]
    results_dfs = []
    for concept in concepts:
        # Authors and the number of articles they have written for this concept.
        # NOTE(review): several triple patterns below have no predicate (and
        # `?author a .` has no class); they look like they were lost at some
        # point - confirm the intended terms and restore them.
        result = g.query(
            f"""SELECT ?author ?name (COUNT (?article) AS ?num_articles) WHERE {{
                ?author a .
                ?author ?name .
                ?article <{institution}> .
                ?article ?author .
                ?article <{concept}> .
            }}
            GROUP BY ?author ?name
            ORDER BY DESC(COUNT (?article))
            """
        )
        # Passing the column names up front keeps an empty result well-formed.
        rows = [tuple(row) for row in result]
        results_dfs.append(pd.DataFrame(rows, columns=columns))

    if not results_dfs:
        return pd.DataFrame(columns=columns), gr.update(visible=True)

    # Aggregate into a single dataframe by summing the number of articles.
    results_df = pd.concat(results_dfs)
    results_df = results_df.groupby(["author", "name"]).sum().reset_index()
    results_df = results_df.sort_values(by=["num_articles"], ascending=False)

    return results_df, gr.update(visible=True)


# %%
theme = gr.themes.Default(primary_hue="cyan", secondary_hue="fuchsia")

with gr.Blocks(theme=theme) as demo:
    # Holds {"embeddings_df": DataFrame} once the plot has been generated.
    embeddings_df = gr.State({})

    # App title and description
    title = gr.Markdown(
        """
        # Universities Explorer

        This app allows you to explore the institutions more closely related to a concept. It uses embeddings of institutions and concepts to calculate the similarity between them. The embedding model, [ComplEx](https://doi.org/10.48550/arXiv.1606.06357), was trained using the [AmpliGraph](https://github.com/Accenture/AmpliGraph) library. The data comes from the [OpenAlex](https://openalex.org/) dataset, which contains information about scientific articles, authors, institutions, and concepts.
        """
    )

    with gr.Group() as institution_search:
        concept_chooser = gr.Textbox(
            label="Concept URI",
            info=(
                "Using OpenAlex, find the URI of the concept you want to search for. "
                "For example, the URI of the concept 'Knowledge Graph' is "
                "https://openalex.org/C2987255567, while the URI of the concept "
                "'Natural Language Processing' is https://openalex.org/C204321447. "
                "You can find the URI of a concept by searching for it on OpenAlex "
                "and copying the URL from the address bar. "
                "You can also search for multiple concepts by separating them with a comma."
            ),
            placeholder="https://openalex.org/C2987255567, https://openalex.org/C204321447",
            value="https://openalex.org/C2987255567, https://openalex.org/C204321447",
        )
        concept_name_label = gr.Markdown("Concept name: ", visible=False)

        # Table for name of institution and similarity to concept
        btn_search_institutions = gr.Button("Search institutions", variant="primary")
        table = gr.Dataframe(
            interactive=False, visible=False, elem_classes="institutions", wrap=True
        )
        # Reveal the (still empty) table as soon as the search starts.
        btn_search_institutions.click(
            lambda: gr.update(visible=True), outputs=[table], queue=True
        )

        btn_plot_embeddings = gr.Button(
            "Plot embeddings", variant="primary", visible=False, elem_classes="embeddings"
        )
        # Description of what plot embeddings does
        plot_embeddings_info = gr.Markdown(
            """
            This button will plot the embeddings of the institutions related to the concept. The embeddings are calculated using the trained model and then reduced to 2 dimensions using PCA. The institutions are then clustered using KMeans. Running this may take a while, as we need to calculate the embeddings for all institutions and then perform PCA and clustering.
            """,
            visible=False,
        )

        btn_search_institutions.click(
            process_user_input_concept,
            inputs=[concept_chooser],
            outputs=[
                table,
                btn_plot_embeddings,
                plot_embeddings_info,
            ],
            queue=True,
        )

        plot = gr.Plot(visible=False, elem_classes="embeddings")
        # Reveal the plot area immediately; the figure arrives from the next handler.
        btn_plot_embeddings.click(
            lambda: gr.update(visible=True), outputs=[plot], queue=True
        )
        btn_plot_embeddings.click(
            click_on_show_plot,
            inputs=[table],
            outputs=[plot, embeddings_df],
            queue=True,
        )

    # When the user selects a row in the table, get the authors of that
    # institution and display them in a dataframe.
    with gr.Group(visible=False, elem_classes="authors") as authors:
        table_authors = gr.Dataframe(
            interactive=False, label="Authors in institution writing about concept"
        )

    # The handler returns the authors table plus a visibility update, which is
    # routed to the initially hidden `authors` group.
    table.select(
        get_authors_of_institution,
        inputs=[table, concept_chooser],
        outputs=[table_authors, authors],
    )
    table.select(
        click_on_institution,
        inputs=[table, embeddings_df],
        outputs=[plot],
    )

    btn_clear = gr.ClearButton(components=[table, plot, table_authors])

    # Author information
    author_info = gr.Markdown(
        """
        This demo has been built by [Aldan Creo]( https://acmc-website.web.app/).
        """
    )

demo.queue()
demo.launch()