acmc committed on
Commit
36c5b68
1 Parent(s): cdd672b
app.py CHANGED
@@ -98,7 +98,8 @@ def process_user_input_concept(concept_chooser):
98
  ]
99
 
100
  chosen_concepts = separate_concepts(concept_chooser)
101
- all_similarities = []
 
102
  for concept in chosen_concepts:
103
  s = all_ids_institutions[:, 0]
104
  p = np.array(["urn:acmcmc:unis:institution_related_to_concept"] * len(s))
@@ -107,29 +108,42 @@ def process_user_input_concept(concept_chooser):
107
  array_of_triples = np.array([s, p, o]).T
108
 
109
  scores = get_similarities_to_node(array_of_triples, model)
110
- all_similarities.append(scores)
111
 
112
  # Now, average the similarities
113
- scores = np.stack(all_similarities, axis=0)
114
  scores = np.mean(all_similarities, axis=0)
115
 
116
  table_df = pd.DataFrame(
117
  {
118
- "institution": s,
119
- "similarity": scores.flatten(),
120
- "institution_name": all_ids_institutions[:, 1],
121
  # "num_articles": all_ids_institutions[:, 2].astype(int),
122
  }
123
  )
124
- # Sort by number of articles
125
- table_df = table_df.sort_values(by=["similarity"], ascending=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  concept_names = [get_concept_name(concept_uri) for concept_uri in chosen_concepts]
127
  return (
128
  table_df,
129
  gr.update(visible=True),
130
  gr.update(visible=True),
131
- gr.update(visible=True),
132
- f'Concept names: {", ".join(concept_names)}',
133
  )
134
 
135
 
@@ -137,7 +151,7 @@ def calculate_emdeddings_and_pca(table):
137
  gr.Info("Performing PCA and clustering...")
138
  # Perform PCA
139
  embeddings_of_institutions = model.get_embeddings(
140
- entities=np.array(table["institution"])
141
  )
142
 
143
  entity_embeddings_pca = pca(embeddings_of_institutions)
@@ -147,9 +161,9 @@ def calculate_emdeddings_and_pca(table):
147
 
148
  plot_df = pd.DataFrame(
149
  {
150
- "embedding1": entity_embeddings_pca[:, 0],
151
- "embedding2": entity_embeddings_pca[:, 1],
152
- "cluster": "cluster" + pd.Series(clusters).astype(str),
153
  }
154
  )
155
 
@@ -159,16 +173,16 @@ def calculate_emdeddings_and_pca(table):
159
 
160
 
161
  def click_on_institution(table, embeddings_var, evt: gr.SelectData):
162
- institution_id = table["institution"][evt.index[0]]
163
  try:
164
  embeddings_df = embeddings_var["embeddings_df"]
165
  plot_df = pd.DataFrame(
166
  {
167
- "institution": table["institution"].values,
168
- "institution_name": table["institution_name"].values,
169
- "embedding1": embeddings_df["embedding1"].values,
170
- "embedding2": embeddings_df["embedding2"].values,
171
- "cluster": embeddings_df["cluster"].values,
172
  # "num_articles": table["num_articles"].values,
173
  }
174
  )
@@ -182,11 +196,11 @@ def click_on_show_plot(table):
182
 
183
  plot_df = pd.DataFrame(
184
  {
185
- "institution": table["institution"].values,
186
- "institution_name": table["institution_name"].values,
187
- "embedding1": embeddings_df["embedding1"].values,
188
- "embedding2": embeddings_df["embedding2"].values,
189
- "cluster": embeddings_df["cluster"].values,
190
  # "num_articles": table["num_articles"].values,
191
  }
192
  )
@@ -201,17 +215,17 @@ def plot_embeddings(plot_df, institution_id):
201
  # fig.title("{} embeddings".format(parameter).capitalize())
202
  ax = sns.scatterplot(
203
  data=plot_df,
204
- x="embedding1",
205
- y="embedding2",
206
- hue="cluster",
207
  )
208
 
209
- row_of_institution = plot_df[plot_df["institution"] == institution_id]
210
  if not row_of_institution.empty:
211
  ax.text(
212
- row_of_institution["embedding1"],
213
- row_of_institution["embedding2"],
214
- row_of_institution["institution_name"].values[0],
215
  horizontalalignment="left",
216
  size="medium",
217
  color="black",
@@ -219,20 +233,20 @@ def plot_embeddings(plot_df, institution_id):
219
  )
220
  # Also draw a point for the institution
221
  ax.scatter(
222
- row_of_institution["embedding1"],
223
- row_of_institution["embedding2"],
224
  color="black",
225
  s=100,
226
  marker="x",
227
  )
228
  # texts = []
229
  # for i, point in plot_df.iterrows():
230
- # if point["institution"] == institution_id:
231
  # texts.append(
232
  # fig.text(
233
- # point["embedding1"] + 0.02,
234
- # point["embedding2"] + 0.01,
235
- # str(point["institution_name"]),
236
  # )
237
  # )
238
  # adjust_text(texts)
@@ -243,9 +257,9 @@ def get_authors_of_institution(institutions_table, concept_chooser, evt: gr.Sele
243
  """
244
  Get the authors of an institution
245
  """
246
- institution = institutions_table["institution"][0]
247
  number_of_row = evt.index[0]
248
- institution = institutions_table["institution"][number_of_row]
249
  concepts = separate_concepts(concept_chooser)
250
  results_dfs = []
251
  for concept in concepts:
@@ -255,7 +269,7 @@ def get_authors_of_institution(institutions_table, concept_chooser, evt: gr.Sele
255
  WHERE {{
256
  ?author a <urn:acmcmc:unis:Author> .
257
  ?author <urn:acmcmc:unis:name> ?name .
258
- ?article <urn:acmcmc:unis:written_in_institution> <{institution}> .
259
  ?article <urn:acmcmc:unis:has_author> ?author .
260
  ?article <urn:acmcmc:unis:related_to_concept> <{concept}> .
261
  }}
@@ -324,8 +338,8 @@ with gr.Blocks(theme=theme) as demo:
324
  table,
325
  btn_plot_embeddings,
326
  plot_embeddings_info,
327
- concept_name_label,
328
- concept_name_label,
329
  ],
330
  queue=True,
331
  )
 
98
  ]
99
 
100
  chosen_concepts = separate_concepts(concept_chooser)
101
+ chosen_concepts_names = [get_concept_name(concept) for concept in chosen_concepts]
102
+ all_similarities = {}
103
  for concept in chosen_concepts:
104
  s = all_ids_institutions[:, 0]
105
  p = np.array(["urn:acmcmc:unis:institution_related_to_concept"] * len(s))
 
108
  array_of_triples = np.array([s, p, o]).T
109
 
110
  scores = get_similarities_to_node(array_of_triples, model)
111
+ all_similarities[concept] = scores
112
 
113
  # Now, average the similarities
114
+ scores = np.stack(list(all_similarities.values()), axis=0)
115
  scores = np.mean(all_similarities, axis=0)
116
 
117
  table_df = pd.DataFrame(
118
  {
119
+ "Institution": s,
120
+ "Mean similarity": scores.flatten(),
121
+ "Institution name": all_ids_institutions[:, 1],
122
  # "num_articles": all_ids_institutions[:, 2].astype(int),
123
  }
124
  )
125
+
126
+ # Add the individual similarities
127
+ for i, concept in enumerate(chosen_concepts):
128
+ table_df[f"Similarity to {chosen_concepts_names[i]}"] = all_similarities[concept]
129
+
130
+ # Reorder the columns so that the mean similarity is after the individual similarities and before the institution name
131
+ table_df = table_df[
132
+ ["Institution"]
133
+ + [f"Similarity to {chosen_concepts_names[i]}" for i in range(len(chosen_concepts))]
134
+ + ["Mean similarity", "Institution name"]
135
+ ]
136
+
137
+ # Sort by mean similarity
138
+ table_df = table_df.sort_values(by=["Mean similarity"], ascending=False)
139
+
140
  concept_names = [get_concept_name(concept_uri) for concept_uri in chosen_concepts]
141
  return (
142
  table_df,
143
  gr.update(visible=True),
144
  gr.update(visible=True),
145
+ #gr.update(visible=True),
146
+ #f'Concept names: {", ".join(concept_names)}',
147
  )
148
 
149
 
 
151
  gr.Info("Performing PCA and clustering...")
152
  # Perform PCA
153
  embeddings_of_institutions = model.get_embeddings(
154
+ entities=np.array(table["Institution"])
155
  )
156
 
157
  entity_embeddings_pca = pca(embeddings_of_institutions)
 
161
 
162
  plot_df = pd.DataFrame(
163
  {
164
+ "Embedding (coord 1)": entity_embeddings_pca[:, 0],
165
+ "Embedding (coord 2)": entity_embeddings_pca[:, 1],
166
+ "Cluster": "Cluster" + pd.Series(clusters).astype(str),
167
  }
168
  )
169
 
 
173
 
174
 
175
  def click_on_institution(table, embeddings_var, evt: gr.SelectData):
176
+ institution_id = table["Institution"][evt.index[0]]
177
  try:
178
  embeddings_df = embeddings_var["embeddings_df"]
179
  plot_df = pd.DataFrame(
180
  {
181
+ "Institution": table["Institution"].values,
182
+ "Institution name": table["Institution name"].values,
183
+ "Embedding (coord 1)": embeddings_df["Embedding (coord 1)"].values,
184
+ "Embedding (coord 2)": embeddings_df["Embedding (coord 2)"].values,
185
+ "Cluster": embeddings_df["Cluster"].values,
186
  # "num_articles": table["num_articles"].values,
187
  }
188
  )
 
196
 
197
  plot_df = pd.DataFrame(
198
  {
199
+ "Institution": table["Institution"].values,
200
+ "Institution_name": table["Institution Name"].values,
201
+ "Embedding (coord 1)": embeddings_df["Embedding (coord 1)"].values,
202
+ "Embedding (coord 2)": embeddings_df["Embedding (coord 2)"].values,
203
+ "Cluster": embeddings_df["Cluster"].values,
204
  # "num_articles": table["num_articles"].values,
205
  }
206
  )
 
215
  # fig.title("{} embeddings".format(parameter).capitalize())
216
  ax = sns.scatterplot(
217
  data=plot_df,
218
+ x="Embedding (coord 1)",
219
+ y="Embedding (coord 2)",
220
+ hue="Cluster",
221
  )
222
 
223
+ row_of_institution = plot_df[plot_df["Institution"] == institution_id]
224
  if not row_of_institution.empty:
225
  ax.text(
226
+ row_of_institution["Embedding (coord 1)"],
227
+ row_of_institution["Embedding (coord 2)"],
228
+ row_of_institution["Institution name"].values[0],
229
  horizontalalignment="left",
230
  size="medium",
231
  color="black",
 
233
  )
234
  # Also draw a point for the institution
235
  ax.scatter(
236
+ row_of_institution["Embedding (coord 1)"],
237
+ row_of_institution["Embedding (coord 2)"],
238
  color="black",
239
  s=100,
240
  marker="x",
241
  )
242
  # texts = []
243
  # for i, point in plot_df.iterrows():
244
+ # if point["Institution"] == institution_id:
245
  # texts.append(
246
  # fig.text(
247
+ # point["Embedding (coord 1)"] + 0.02,
248
+ # point["Embedding (coord 2)"] + 0.01,
249
+ # str(point["Institution name"]),
250
  # )
251
  # )
252
  # adjust_text(texts)
 
257
  """
258
  Get the authors of an institution
259
  """
260
+ institution = institutions_table["Institution"][0]
261
  number_of_row = evt.index[0]
262
+ institution = institutions_table["Institution"][number_of_row]
263
  concepts = separate_concepts(concept_chooser)
264
  results_dfs = []
265
  for concept in concepts:
 
269
  WHERE {{
270
  ?author a <urn:acmcmc:unis:Author> .
271
  ?author <urn:acmcmc:unis:name> ?name .
272
+ ?article <urn:acmcmc:unis:written_in_institution> <{Institution}> .
273
  ?article <urn:acmcmc:unis:has_author> ?author .
274
  ?article <urn:acmcmc:unis:related_to_concept> <{concept}> .
275
  }}
 
338
  table,
339
  btn_plot_embeddings,
340
  plot_embeddings_info,
341
+ #concept_name_label,
342
+ #concept_name_label,
343
  ],
344
  queue=True,
345
  )
institutions.csv CHANGED
The diff for this file is too large to render. See raw diff
 
model/.data-00000-of-00001 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d5ded6f0bf7985926646dd021e03e008d0f8779f606e4010f0ab89cf8687e943
3
- size 87725277
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa8f3d8bd8f7a741cfe1ef560e5d2f894314342b51ec9a60844d5fc796b8e0c5
3
+ size 2350332477
model/.index CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3d9027e082ae75293bde304a2044fbd0549aa0bd1b43d3483c7c28b0ab7bc72b
3
- size 291
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:364d14e1bb0830e861ef9c87ee188e8b00f90eea93ea07f828d69c3daa0a4139
3
+ size 294
model/model_metadata.ampkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e7a052e205b870dba54d5a4b23c54f638d93e880c81b66e14ec1c6ae90d2cd33
3
- size 24656298
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:95e4a9f0906a1e60acbe7771e223dae8fa88859afb65066cef0541c1cbc78378
3
+ size 676909665