charlieoneill committed on
Commit
5df6c06
1 Parent(s): 3311ec5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +274 -270
app.py CHANGED
@@ -40,7 +40,11 @@ def download_all_files():
40
  "astroPH_abstract_texts.json",
41
  "astroPH_feature_analysis_results_64.json",
42
  "csLG_topk_indices_64_9216_int32.npy",
43
- "astroPH_abstract_embeddings_float16.npy"
 
 
 
 
44
  ]
45
 
46
  for file in files_to_download:
@@ -66,12 +70,6 @@ client = OpenAI(api_key=api_key)
66
 
67
  # Function to load data for a specific subject
68
  def load_subject_data(subject):
69
- # embeddings_path = f"data/{subject}_abstract_embeddings.npy"
70
- # texts_path = f"data/{subject}_abstract_texts.json"
71
- # feature_analysis_path = f"data/{subject}_feature_analysis_results_{k}.json"
72
- # metadata_path = f'data/{subject}_paper_metadata.csv'
73
- # topk_indices_path = f"data/{subject}_topk_indices_{k}_{n_dirs}.npy"
74
- # topk_values_path = f"data/{subject}_topk_values_{k}_{n_dirs}.npy"
75
 
76
  embeddings_path = f"data/{subject}_abstract_embeddings_float16.npy"
77
  texts_path = f"data/{subject}_abstract_texts.json"
@@ -79,15 +77,8 @@ def load_subject_data(subject):
79
  metadata_path = f'data/{subject}_paper_metadata.csv'
80
  topk_indices_path = f"data/{subject}_topk_indices_{k}_{n_dirs}_int32.npy"
81
  topk_values_path = f"data/{subject}_topk_values_{k}_{n_dirs}_float16.npy"
82
-
83
- # abstract_embeddings = np.load(embeddings_path)
84
- # with open(texts_path, 'r') as f:
85
- # abstract_texts = json.load(f)
86
- # with open(feature_analysis_path, 'r') as f:
87
- # feature_analysis = json.load(f)
88
- # df_metadata = pd.read_csv(metadata_path)
89
- # topk_indices = np.load(topk_indices_path)
90
- # topk_values = np.load(topk_values_path)
91
 
92
  abstract_embeddings = np.load(embeddings_path).astype(np.float32) # Load float16 and convert to float32
93
  with open(texts_path, 'r') as f:
@@ -109,6 +100,14 @@ def load_subject_data(subject):
109
  decoder = weights['decoder.weight'].cpu().numpy()
110
  del weights
111
 
 
 
 
 
 
 
 
 
112
  return {
113
  'abstract_embeddings': abstract_embeddings,
114
  'abstract_texts': abstract_texts,
@@ -117,7 +116,9 @@ def load_subject_data(subject):
117
  'topk_indices': topk_indices,
118
  'topk_values': topk_values,
119
  'ae': ae,
120
- 'decoder': decoder
 
 
121
  }
122
 
123
  # Load data for both subjects
@@ -206,25 +207,6 @@ def get_feature_from_index(subject, index):
206
  feature = next((f for f in subject_data[subject]['feature_analysis'] if f['index'] == index), None)
207
  return feature
208
 
209
- # def visualize_feature(subject, index):
210
- # feature = next((f for f in subject_data[subject]['feature_analysis'] if f['index'] == index), None)
211
- # if feature is None:
212
- # return "Invalid feature index", None, None, None, None, None, None
213
-
214
- # output = f"# {feature['label']}\n\n"
215
- # output += f"* Pearson correlation: {feature['pearson_correlation']:.4f}\n\n"
216
- # output += f"* Density: {feature['density']:.4f}\n\n"
217
-
218
- # # Top m abstracts
219
- # top_m_abstracts = get_feature_activations(subject, index)
220
-
221
- # # Create dataframe for top abstracts
222
- # df_data = [
223
- # {"Title": m[1].split('\n\n')[0], "Activation value": f"{m[2]:.4f}"}
224
- # for m in top_m_abstracts
225
- # ]
226
- # df_top_abstracts = pd.DataFrame(df_data)
227
-
228
  def visualize_feature(subject, index):
229
  feature = next((f for f in subject_data[subject]['feature_analysis'] if f['index'] == index), None)
230
  if feature is None:
@@ -286,62 +268,22 @@ def visualize_feature(subject, index):
286
  topk_indices_cosine = np.argsort(-cosine_similarities)[:topk]
287
  topk_values_cosine = cosine_similarities[topk_indices_cosine]
288
 
289
- # # Create dataframe for top 5 correlated features
290
- # df_top_correlated = pd.DataFrame({
291
- # "Feature": [get_feature_from_index(subject, i)['label'] for i in topk_indices_cosine],
292
- # "Cosine similarity": topk_values_cosine
293
- # })
294
- # df_top_correlated_styled = df_top_correlated.style.format({
295
- # "Cosine similarity": "{:.4f}"
296
- # })
297
-
298
  bottomk = 5
299
  bottomk_indices_cosine = np.argsort(cosine_similarities)[:bottomk]
300
  bottomk_values_cosine = cosine_similarities[bottomk_indices_cosine]
301
 
302
- # # Create dataframe for bottom 5 correlated features
303
- # df_bottom_correlated = pd.DataFrame({
304
- # "Feature": [get_feature_from_index(subject, i)['label'] for i in bottomk_indices_cosine],
305
- # "Cosine similarity": bottomk_values_cosine
306
- # })
307
- # df_bottom_correlated_styled = df_bottom_correlated.style.format({
308
- # "Cosine similarity": "{:.4f}"
309
- # })
310
-
311
- # # Co-occurrences
312
- # co_occurrences = calculate_co_occurrences(subject, index)
313
- # topk = 5
314
- # topk_indices_co_occurrence = np.argsort(-co_occurrences)[:topk]
315
- # topk_values_co_occurrence = co_occurrences[topk_indices_co_occurrence]
316
-
317
- # # Create dataframe for top 5 co-occurring features
318
- # df_co_occurrences = pd.DataFrame({
319
- # "Feature": [get_feature_from_index(subject, i)['label'] for i in topk_indices_co_occurrence],
320
- # "Co-occurrences": topk_values_co_occurrence
321
- # })
322
- # df_co_occurrences_styled = df_co_occurrences.style.format({
323
- # "Co-occurrences": "{:.4f}"
324
- # })
325
-
326
- # return output, styled_top_abstracts, df_top_correlated_styled, df_bottom_correlated_styled, df_co_occurrences_styled, fig2
327
-
328
- # Create dataframe for top 5 correlated features
329
  df_top_correlated = pd.DataFrame({
330
  "Feature": [get_feature_from_index(subject, i)['label'] for i in topk_indices_cosine],
331
  "Cosine similarity": topk_values_cosine
332
  })
333
- df_top_correlated_styled = df_top_correlated.style.format({
334
- "Cosine similarity": "{:.4f}"
335
- })
336
 
337
  # Create dataframe for bottom 5 correlated features
338
  df_bottom_correlated = pd.DataFrame({
339
  "Feature": [get_feature_from_index(subject, i)['label'] for i in bottomk_indices_cosine],
340
  "Cosine similarity": bottomk_values_cosine
341
  })
342
- df_bottom_correlated_styled = df_bottom_correlated.style.format({
343
- "Cosine similarity": "{:.4f}"
344
- })
345
 
346
  # Co-occurrences
347
  co_occurrences = calculate_co_occurrences(subject, index)
@@ -360,97 +302,6 @@ def visualize_feature(subject, index):
360
 
361
  return output, styled_top_abstracts, df_top_correlated_styled, df_bottom_correlated_styled, df_co_occurrences_styled, fig2
362
 
363
- # def visualize_feature(subject, index):
364
- # feature = next((f for f in subject_data[subject]['feature_analysis'] if f['index'] == index), None)
365
- # if feature is None:
366
- # return "Invalid feature index", None, None, None, None, None, None
367
-
368
- # output = f"# {feature['label']}\n\n"
369
- # output += f"* Pearson correlation: {feature['pearson_correlation']:.4f}\n\n"
370
- # output += f"* Density: {feature['density']:.4f}\n\n"
371
-
372
- # # Top m abstracts
373
- # top_m_abstracts = get_feature_activations(subject, index)
374
-
375
- # # Create dataframe for top abstracts with clickable links
376
- # df_data = []
377
- # for doc_id, abstract, activation_value in top_m_abstracts:
378
- # title = abstract.split('\n\n')[0]
379
- # title = title.replace('[', '').replace(']', '')
380
- # title = title.replace("'", "")
381
- # title = title.replace('"', '')
382
- # url_id = doc_id.replace('_arXiv.txt', '')
383
- # if 'astro-ph' in url_id:
384
- # url_id = url_id.split('astro-ph')[1]
385
- # url = f"https://arxiv.org/abs/astro-ph/{url_id}"
386
- # else:
387
- # if '.' in doc_id:
388
- # url = f"https://arxiv.org/abs/{url_id}"
389
- # else:
390
- # url = f"https://arxiv.org/abs/hep-ph/{url_id}"
391
-
392
- # linked_title = f"[{title}]({url})"
393
- # df_data.append({"Title": linked_title, "Activation value": activation_value})
394
-
395
- # df_top_abstracts = pd.DataFrame(df_data)
396
-
397
- # # Activation value distribution
398
- # topk_indices = subject_data[subject]['topk_indices']
399
- # topk_values = subject_data[subject]['topk_values']
400
-
401
- # activation_values = np.where(topk_indices == index, topk_values, 0).max(axis=1)
402
- # fig2 = px.histogram(x=activation_values, nbins=50)
403
- # fig2.update_layout(
404
- # #title=f'{feature["label"]}',
405
- # xaxis_title='Activation value',
406
- # yaxis_title=None,
407
- # yaxis_type='log',
408
- # height=220,
409
- # )
410
-
411
- # # Correlated features
412
- # decoder = subject_data[subject]['decoder']
413
- # feature_vector = decoder[:, index]
414
- # decoder_without_feature = np.delete(decoder, index, axis=1)
415
- # cosine_similarities = np.dot(feature_vector, decoder_without_feature) / (np.linalg.norm(decoder_without_feature, axis=0) * np.linalg.norm(feature_vector))
416
-
417
- # topk = 5
418
- # topk_indices_cosine = np.argsort(-cosine_similarities)[:topk]
419
- # topk_values_cosine = cosine_similarities[topk_indices_cosine]
420
-
421
- # # Create dataframe for top 5 correlated features
422
- # df_top_correlated = pd.DataFrame({
423
- # "Feature": [get_feature_from_index(subject, i)['label'] for i in topk_indices_cosine],
424
- # "Cosine similarity": [f"{v:.4f}" for v in topk_values_cosine]
425
- # })
426
- # df_top_correlated_styled = style_dataframe(df_top_correlated, is_top=True)
427
-
428
- # bottomk = 5
429
- # bottomk_indices_cosine = np.argsort(cosine_similarities)[:bottomk]
430
- # bottomk_values_cosine = cosine_similarities[bottomk_indices_cosine]
431
-
432
- # # Create dataframe for bottom 5 correlated features
433
- # df_bottom_correlated = pd.DataFrame({
434
- # "Feature": [get_feature_from_index(subject, i)['label'] for i in bottomk_indices_cosine],
435
- # "Cosine similarity": [f"{v:.4f}" for v in bottomk_values_cosine]
436
- # })
437
- # df_bottom_correlated_styled = style_dataframe(df_bottom_correlated, is_top=False)
438
-
439
- # # Co-occurrences
440
- # co_occurrences = calculate_co_occurrences(subject, index)
441
- # topk = 5
442
- # topk_indices_co_occurrence = np.argsort(-co_occurrences)[:topk]
443
- # topk_values_co_occurrence = co_occurrences[topk_indices_co_occurrence]
444
-
445
- # # Create dataframe for top 5 co-occurring features
446
- # df_co_occurrences = pd.DataFrame({
447
- # "Feature": [get_feature_from_index(subject, i)['label'] for i in topk_indices_co_occurrence],
448
- # "Co-occurrences": topk_values_co_occurrence
449
- # })
450
-
451
- # #return output, df_top_abstracts, df_top_correlated_styled, df_bottom_correlated_styled, df_co_occurrences, fig2
452
- # return output, df_top_abstracts, df_top_correlated_styled, df_bottom_correlated_styled, df_co_occurrences, fig2
453
-
454
  # Modify the main interface function
455
  def create_interface():
456
  custom_css = """
@@ -504,16 +355,11 @@ def create_interface():
504
  manually_added_features_state = gr.State([])
505
 
506
  def update_search_results(feature_values, feature_indices, manually_added_features, current_subject):
507
- # ae = subject_data[current_subject]['ae']
508
- # abstract_embeddings = subject_data[current_subject]['abstract_embeddings']
509
- # abstract_texts = subject_data[current_subject]['abstract_texts']
510
- # df_metadata = subject_data[current_subject]['df_metadata']
511
  ae = subject_data[current_subject]['ae']
512
  abstract_embeddings = subject_data[current_subject]['abstract_embeddings']
513
  abstract_texts = subject_data[current_subject]['abstract_texts']
514
  df_metadata = subject_data[current_subject]['df_metadata']
515
 
516
-
517
  # Combine manually added features with query-generated features
518
  all_indices = []
519
  all_values = []
@@ -543,35 +389,6 @@ def create_interface():
543
  doc_ids = abstract_texts['doc_ids']
544
  topk_doc_ids = [doc_ids[i] for i in topk_indices_search]
545
 
546
- # # Prepare search results
547
- # search_results = []
548
- # for doc_id in topk_doc_ids:
549
- # metadata = df_metadata[df_metadata['arxiv_id'] == doc_id].iloc[0]
550
- # title = metadata['title'].replace('[', '').replace(']', '')
551
- # # Remove single quotes from title
552
- # title = title.replace("'", "")
553
-
554
- # url_id = doc_id.replace('_arXiv.txt', '')
555
- # if 'astro-ph' in url_id:
556
- # url_id = url_id.split('astro-ph')[1]
557
- # url = f"https://arxiv.org/abs/astro-ph/{url_id}"
558
- # else:
559
- # # Create the clickable link based on the doc_id
560
- # if '.' in doc_id:
561
- # url = f"https://arxiv.org/abs/{doc_id.replace('_arXiv.txt', '')}"
562
- # else:
563
- # url = f"https://arxiv.org/abs/hep-ph/{doc_id.replace('_arXiv.txt', '')}"
564
-
565
- # linked_title = f"[{title}]({url})"
566
-
567
- # search_results.append([
568
- # linked_title,
569
- # int(metadata['citation_count']),
570
- # int(metadata['year'])
571
- # ])
572
-
573
- # return search_results, all_values, all_indices
574
-
575
  # Prepare search results
576
  search_results = []
577
  for doc_id in topk_doc_ids:
@@ -704,79 +521,267 @@ def create_interface():
704
  )
705
 
706
  return [df, feature_search, feature_matches, add_button, update_button] + sliders
707
-
708
  with gr.Tab("Feature Visualisation"):
709
  gr.Markdown("# Feature Visualiser")
710
- with gr.Row():
711
- feature_search = gr.Textbox(label="Search Feature Labels")
712
- feature_matches = gr.CheckboxGroup(label="Matching Features", choices=[])
713
- visualize_button = gr.Button("Visualize Feature")
714
-
715
- feature_info = gr.Markdown()
716
- # abstracts_heading = gr.Markdown("## Top 5 Abstracts")
717
- # top_abstracts = gr.Dataframe(
718
- # headers=["Title", "Activation value"],
719
- # interactive=False
720
- # )
721
-
722
- abstracts_heading = gr.Markdown("## Top 5 Abstracts")
723
- top_abstracts = gr.Dataframe(
724
- headers=["Title", "Activation value"],
725
- datatype=["markdown", "number"],
726
- interactive=False,
727
- wrap=True
728
- )
729
 
730
- gr.Markdown("## Correlated Features")
731
- with gr.Row():
732
- with gr.Column(scale=1):
733
- gr.Markdown("### Top 5 Correlated Features")
734
- top_correlated = gr.Dataframe(
735
- headers=["Feature", "Cosine similarity"],
736
- interactive=False
 
 
 
 
 
 
 
 
 
 
 
 
 
737
  )
738
- with gr.Column(scale=1):
739
- gr.Markdown("### Bottom 5 Correlated Features")
740
- bottom_correlated = gr.Dataframe(
741
- headers=["Feature", "Cosine similarity"],
742
- interactive=False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
743
  )
744
-
745
- with gr.Row():
746
- with gr.Column(scale=1):
747
- gr.Markdown("## Top 5 Co-occurring Features")
748
- co_occurring_features = gr.Dataframe(
749
- headers=["Feature", "Co-occurrences"],
750
- interactive=False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
751
  )
752
- with gr.Column(scale=1):
753
- gr.Markdown(f"## Activation Value Distribution")
754
- activation_dist = gr.Plot()
755
-
756
- def search_feature_labels(search_text, current_subject):
757
- if not search_text:
758
- return gr.CheckboxGroup(choices=[])
759
- matches = [f"{f['label']} ({f['index']})" for f in subject_data[current_subject]['feature_analysis'] if search_text.lower() in f['label'].lower()]
760
- return gr.CheckboxGroup(choices=matches[:10])
761
-
762
- feature_search.change(search_feature_labels, inputs=[feature_search, subject], outputs=[feature_matches])
763
-
764
- def on_visualize(selected_features, current_subject):
765
- if not selected_features:
766
- return "Please select a feature to visualize.", None, None, None, None, None, "", []
767
-
768
- # Extract the feature index from the selected feature string
769
- feature_index = int(selected_features[0].split('(')[-1].strip(')'))
770
- feature_info, top_abstracts, top_correlated, bottom_correlated, co_occurring_features, activation_dist = visualize_feature(current_subject, feature_index)
771
-
772
- # Return the visualization results along with empty values for search box and checkbox
773
- return feature_info, top_abstracts, top_correlated, bottom_correlated, co_occurring_features, activation_dist, "", []
774
 
775
- visualize_button.click(
776
- on_visualize,
777
- inputs=[feature_matches, subject],
778
- outputs=[feature_info, top_abstracts, top_correlated, bottom_correlated, co_occurring_features, activation_dist, feature_search, feature_matches]
779
- )
780
 
781
  # Add logic to update components when subject changes
782
  def on_subject_change(new_subject):
@@ -797,4 +802,3 @@ def create_interface():
797
  if __name__ == "__main__":
798
  demo = create_interface()
799
  demo.launch()
800
-
 
40
  "astroPH_abstract_texts.json",
41
  "astroPH_feature_analysis_results_64.json",
42
  "csLG_topk_indices_64_9216_int32.npy",
43
+ "astroPH_abstract_embeddings_float16.npy",
44
+ # "csLG_clean_families_64_9216.json",
45
+ # "astroPH_clean_families_64_9216.json",
46
+ "astroPH_family_analysis_64_9216.json",
47
+ "csLG_family_analysis_64_9216.json"
48
  ]
49
 
50
  for file in files_to_download:
 
70
 
71
  # Function to load data for a specific subject
72
  def load_subject_data(subject):
 
 
 
 
 
 
73
 
74
  embeddings_path = f"data/{subject}_abstract_embeddings_float16.npy"
75
  texts_path = f"data/{subject}_abstract_texts.json"
 
77
  metadata_path = f'data/{subject}_paper_metadata.csv'
78
  topk_indices_path = f"data/{subject}_topk_indices_{k}_{n_dirs}_int32.npy"
79
  topk_values_path = f"data/{subject}_topk_values_{k}_{n_dirs}_float16.npy"
80
+ families_path = f"data/{subject}_clean_families_{k}_{n_dirs}.json"
81
+ family_analysis_path = f"data/{subject}_family_analysis_{k}_{n_dirs}.json"
 
 
 
 
 
 
 
82
 
83
  abstract_embeddings = np.load(embeddings_path).astype(np.float32) # Load float16 and convert to float32
84
  with open(texts_path, 'r') as f:
 
100
  decoder = weights['decoder.weight'].cpu().numpy()
101
  del weights
102
 
103
+ # # Load feature families
104
+ # with open(families_path, 'r') as f:
105
+ # feature_families = json.load(f)
106
+
107
+ with open(family_analysis_path, 'r') as f:
108
+ family_analysis = json.load(f)
109
+
110
+
111
  return {
112
  'abstract_embeddings': abstract_embeddings,
113
  'abstract_texts': abstract_texts,
 
116
  'topk_indices': topk_indices,
117
  'topk_values': topk_values,
118
  'ae': ae,
119
+ 'decoder': decoder,
120
+ # 'feature_families': feature_families,
121
+ 'family_analysis': family_analysis
122
  }
123
 
124
  # Load data for both subjects
 
207
  feature = next((f for f in subject_data[subject]['feature_analysis'] if f['index'] == index), None)
208
  return feature
209
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
210
  def visualize_feature(subject, index):
211
  feature = next((f for f in subject_data[subject]['feature_analysis'] if f['index'] == index), None)
212
  if feature is None:
 
268
  topk_indices_cosine = np.argsort(-cosine_similarities)[:topk]
269
  topk_values_cosine = cosine_similarities[topk_indices_cosine]
270
 
 
 
 
 
 
 
 
 
 
271
  bottomk = 5
272
  bottomk_indices_cosine = np.argsort(cosine_similarities)[:bottomk]
273
  bottomk_values_cosine = cosine_similarities[bottomk_indices_cosine]
274
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
275
  df_top_correlated = pd.DataFrame({
276
  "Feature": [get_feature_from_index(subject, i)['label'] for i in topk_indices_cosine],
277
  "Cosine similarity": topk_values_cosine
278
  })
279
+ df_top_correlated_styled = style_dataframe(df_top_correlated, is_top=True)
 
 
280
 
281
  # Create dataframe for bottom 5 correlated features
282
  df_bottom_correlated = pd.DataFrame({
283
  "Feature": [get_feature_from_index(subject, i)['label'] for i in bottomk_indices_cosine],
284
  "Cosine similarity": bottomk_values_cosine
285
  })
286
+ df_bottom_correlated_styled = style_dataframe(df_bottom_correlated, is_top=False)
 
 
287
 
288
  # Co-occurrences
289
  co_occurrences = calculate_co_occurrences(subject, index)
 
302
 
303
  return output, styled_top_abstracts, df_top_correlated_styled, df_bottom_correlated_styled, df_co_occurrences_styled, fig2
304
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
305
  # Modify the main interface function
306
  def create_interface():
307
  custom_css = """
 
355
  manually_added_features_state = gr.State([])
356
 
357
  def update_search_results(feature_values, feature_indices, manually_added_features, current_subject):
 
 
 
 
358
  ae = subject_data[current_subject]['ae']
359
  abstract_embeddings = subject_data[current_subject]['abstract_embeddings']
360
  abstract_texts = subject_data[current_subject]['abstract_texts']
361
  df_metadata = subject_data[current_subject]['df_metadata']
362
 
 
363
  # Combine manually added features with query-generated features
364
  all_indices = []
365
  all_values = []
 
389
  doc_ids = abstract_texts['doc_ids']
390
  topk_doc_ids = [doc_ids[i] for i in topk_indices_search]
391
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
392
  # Prepare search results
393
  search_results = []
394
  for doc_id in topk_doc_ids:
 
521
  )
522
 
523
  return [df, feature_search, feature_matches, add_button, update_button] + sliders
524
+
525
  with gr.Tab("Feature Visualisation"):
526
  gr.Markdown("# Feature Visualiser")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
527
 
528
+ with gr.Tabs():
529
+ with gr.Tab("Individual Features"):
530
+ with gr.Row():
531
+ feature_search = gr.Textbox(label="Search Feature Labels")
532
+ feature_matches = gr.CheckboxGroup(label="Matching Features", choices=[])
533
+ visualize_button = gr.Button("Visualize Feature")
534
+
535
+ feature_info = gr.Markdown()
536
+ # abstracts_heading = gr.Markdown("## Top 5 Abstracts")
537
+ # top_abstracts = gr.Dataframe(
538
+ # headers=["Title", "Activation value"],
539
+ # interactive=False
540
+ # )
541
+
542
+ abstracts_heading = gr.Markdown("## Top 5 Abstracts")
543
+ top_abstracts = gr.Dataframe(
544
+ headers=["Title", "Activation value"],
545
+ datatype=["markdown", "number"],
546
+ interactive=False,
547
+ wrap=True
548
  )
549
+
550
+ gr.Markdown("## Correlated Features")
551
+ with gr.Row():
552
+ with gr.Column(scale=1):
553
+ gr.Markdown("### Top 5 Correlated Features")
554
+ top_correlated = gr.Dataframe(
555
+ headers=["Feature", "Cosine similarity"],
556
+ interactive=False
557
+ )
558
+ with gr.Column(scale=1):
559
+ gr.Markdown("### Bottom 5 Correlated Features")
560
+ bottom_correlated = gr.Dataframe(
561
+ headers=["Feature", "Cosine similarity"],
562
+ interactive=False
563
+ )
564
+
565
+ with gr.Row():
566
+ with gr.Column(scale=1):
567
+ gr.Markdown("## Top 5 Co-occurring Features")
568
+ co_occurring_features = gr.Dataframe(
569
+ headers=["Feature", "Co-occurrences"],
570
+ interactive=False
571
+ )
572
+ with gr.Column(scale=1):
573
+ gr.Markdown(f"## Activation Value Distribution")
574
+ activation_dist = gr.Plot()
575
+
576
+ def search_feature_labels(search_text, current_subject):
577
+ if not search_text:
578
+ return gr.CheckboxGroup(choices=[])
579
+ matches = [f"{f['label']} ({f['index']})" for f in subject_data[current_subject]['feature_analysis'] if search_text.lower() in f['label'].lower()]
580
+ return gr.CheckboxGroup(choices=matches[:10])
581
+
582
+ feature_search.change(search_feature_labels, inputs=[feature_search, subject], outputs=[feature_matches])
583
+
584
+ def on_visualize(selected_features, current_subject):
585
+ if not selected_features:
586
+ return "Please select a feature to visualize.", None, None, None, None, None, "", []
587
+
588
+ # Extract the feature index from the selected feature string
589
+ feature_index = int(selected_features[0].split('(')[-1].strip(')'))
590
+ feature_info, top_abstracts, top_correlated, bottom_correlated, co_occurring_features, activation_dist = visualize_feature(current_subject, feature_index)
591
+
592
+ # Return the visualization results along with empty values for search box and checkbox
593
+ return feature_info, top_abstracts, top_correlated, bottom_correlated, co_occurring_features, activation_dist, "", []
594
+
595
+ visualize_button.click(
596
+ on_visualize,
597
+ inputs=[feature_matches, subject],
598
+ outputs=[feature_info, top_abstracts, top_correlated, bottom_correlated, co_occurring_features, activation_dist, feature_search, feature_matches]
599
  )
600
+ # with gr.Row():
601
+ # feature_search = gr.Textbox(label="Search Feature Labels")
602
+ # feature_matches = gr.CheckboxGroup(label="Matching Features", choices=[])
603
+ # visualize_button = gr.Button("Visualize Feature")
604
+
605
+ # feature_info = gr.Markdown()
606
+
607
+ # abstracts_heading = gr.Markdown("## Top 5 Abstracts")
608
+ # top_abstracts = gr.Dataframe(
609
+ # headers=["Title", "Activation value"],
610
+ # datatype=["markdown", "number"],
611
+ # interactive=False,
612
+ # wrap=True
613
+ # )
614
+
615
+ # gr.Markdown("## Correlated Features")
616
+ # with gr.Row():
617
+ # with gr.Column(scale=1):
618
+ # gr.Markdown("### Top 5 Correlated Features")
619
+ # top_correlated = gr.Dataframe(
620
+ # headers=["Feature", "Cosine similarity"],
621
+ # interactive=False
622
+ # )
623
+ # with gr.Column(scale=1):
624
+ # gr.Markdown("### Bottom 5 Correlated Features")
625
+ # bottom_correlated = gr.Dataframe(
626
+ # headers=["Feature", "Cosine similarity"],
627
+ # interactive=False
628
+ # )
629
+
630
+ # with gr.Row():
631
+ # with gr.Column(scale=1):
632
+ # gr.Markdown("## Top 5 Co-occurring Features")
633
+ # co_occurring_features = gr.Dataframe(
634
+ # headers=["Feature", "Co-occurrences"],
635
+ # interactive=False
636
+ # )
637
+ # with gr.Column(scale=1):
638
+ # gr.Markdown(f"## Activation Value Distribution")
639
+ # activation_dist = gr.Plot()
640
+
641
+ with gr.Tab("Feature Families"):
642
+ gr.Markdown("# Feature Families")
643
+
644
+ with gr.Row():
645
+ family_search = gr.Textbox(label="Search Feature Families")
646
+ family_matches = gr.CheckboxGroup(label="Matching Feature Families", choices=[])
647
+ visualize_family_button = gr.Button("Visualize Feature Family")
648
+
649
+ family_info = gr.Markdown()
650
+ family_dataframe = gr.Dataframe(
651
+ headers=["Feature", "F1 Score", "Pearson Correlation"],
652
+ datatype=["markdown", "number", "number"],
653
+ label="Family and Child Features"
654
+ )
655
+ # family_dataframe = gr.Dataframe(
656
+ # headers=["Feature", "F1 Score", "Pearson Correlation"],
657
+ # datatype=["str", "number", "number"],
658
+ # label="Family and Child Features"
659
+ # )
660
+
661
+ def search_feature_families(search_text, current_subject):
662
+ family_analysis = subject_data[current_subject]['family_analysis']
663
+ if not search_text:
664
+ return gr.CheckboxGroup(choices=[])
665
+ matches = [family['superfeature'] for family in family_analysis if search_text.lower() in family['superfeature'].lower()]
666
+ return gr.CheckboxGroup(choices=matches[:10]) # Limit to top 10 matches
667
+
668
+ # def visualize_feature_family(selected_families, current_subject):
669
+ # if not selected_families:
670
+ # return "Please select a feature family to visualize.", None
671
+
672
+ # selected_family = selected_families[0] # Take the first selected family
673
+ # family_analysis = subject_data[current_subject]['family_analysis']
674
+
675
+ # family_data = next((family for family in family_analysis if family['superfeature'] == selected_family), None)
676
+ # if not family_data:
677
+ # return "Invalid feature family selected.", None
678
+
679
+ # output = f"# {family_data['superfeature']}\n\n"
680
+ # output += f"## Super Reasoning\n{family_data['super_reasoning']}\n\n"
681
+
682
+ # # Create DataFrame
683
+ # df_data = [
684
+ # {
685
+ # "Feature": family_data['superfeature'],
686
+ # "F1 Score": family_data['family_f1'],
687
+ # "Pearson Correlation": family_data['family_pearson']
688
+ # }
689
+ # ]
690
+
691
+ # for name, f1, pearson in zip(family_data['feature_names'], family_data['feature_f1'], family_data['feature_pearson']):
692
+ # df_data.append({
693
+ # "Feature": name,
694
+ # "F1 Score": f1,
695
+ # "Pearson Correlation": pearson
696
+ # })
697
+
698
+ # df = pd.DataFrame(df_data)
699
+
700
+ # return output, df
701
+
702
+ # def visualize_feature_family(selected_families, current_subject):
703
+ # if not selected_families:
704
+ # return "Please select a feature family to visualize.", None, "", []
705
+
706
+ # selected_family = selected_families[0] # Take the first selected family
707
+ # family_analysis = subject_data[current_subject]['family_analysis']
708
+
709
+ # family_data = next((family for family in family_analysis if family['superfeature'] == selected_family), None)
710
+ # if not family_data:
711
+ # return "Invalid feature family selected.", None, "", []
712
+
713
+ # output = f"# {family_data['superfeature']}\n\n"
714
+ # output += f"## Super Reasoning\n{family_data['super_reasoning']}\n\n"
715
+
716
+ # # Create DataFrame
717
+ # df_data = [
718
+ # {
719
+ # "Feature": family_data['superfeature'],
720
+ # "F1 Score": family_data['family_f1'],
721
+ # "Pearson Correlation": family_data['family_pearson']
722
+ # }
723
+ # ]
724
+
725
+ # for name, f1, pearson in zip(family_data['feature_names'], family_data['feature_f1'], family_data['feature_pearson']):
726
+ # df_data.append({
727
+ # "Feature": name,
728
+ # "F1 Score": f1,
729
+ # "Pearson Correlation": pearson
730
+ # })
731
+
732
+ # df = pd.DataFrame(df_data)
733
+
734
+ # return output, df, "", [] # Return empty string for search box and empty list for checkbox
735
+
736
+ def visualize_feature_family(selected_families, current_subject):
737
+ if not selected_families:
738
+ return "Please select a feature family to visualize.", None, "", []
739
+
740
+ selected_family = selected_families[0] # Take the first selected family
741
+ family_analysis = subject_data[current_subject]['family_analysis']
742
+
743
+ family_data = next((family for family in family_analysis if family['superfeature'] == selected_family), None)
744
+ if not family_data:
745
+ return "Invalid feature family selected.", None, "", []
746
+
747
+ output = f"# {family_data['superfeature']}\n\n"
748
+
749
+ # Create DataFrame
750
+ df_data = [
751
+ {
752
+ "Feature": f"## {family_data['superfeature']}",
753
+ "F1 Score": round(family_data['family_f1'], 2),
754
+ "Pearson Correlation": round(family_data['family_pearson'], 4)
755
+ },
756
+ # {
757
+ # "Feature": "## Child Features",
758
+ # "F1 Score": None,
759
+ # "Pearson Correlation": None
760
+ # }
761
+ ]
762
+
763
+ for name, f1, pearson in zip(family_data['feature_names'], family_data['feature_f1'], family_data['feature_pearson']):
764
+ df_data.append({
765
+ "Feature": name,
766
+ "F1 Score": round(f1, 2),
767
+ "Pearson Correlation": round(pearson, 4)
768
+ })
769
+
770
+ df = pd.DataFrame(df_data)
771
+
772
+ # Add super reasoning below the dataframe
773
+ output += "## Super Reasoning\n"
774
+ output += f"{family_data['super_reasoning']}\n\n"
775
+
776
+ return output, df, "", [] # Return empty string for search box and empty list for checkbox
777
+
778
+ family_search.change(search_feature_families, inputs=[family_search, subject], outputs=[family_matches])
779
+ visualize_family_button.click(
780
+ visualize_feature_family,
781
+ inputs=[family_matches, subject],
782
+ outputs=[family_info, family_dataframe, family_search, family_matches]
783
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
784
 
 
 
 
 
 
785
 
786
  # Add logic to update components when subject changes
787
  def on_subject_change(new_subject):
 
802
  if __name__ == "__main__":
803
  demo = create_interface()
804
  demo.launch()