Spaces:

charlieoneill
/

saerch.ai

Running

App Files Community

charlieoneill committed on Jul 30

Commit

5df6c06

•

1 Parent(s): 3311ec5

Update app.py

Browse files

Files changed (1) hide show

app.py +274 -270

app.py CHANGED Viewed

@@ -40,7 +40,11 @@ def download_all_files():
         "astroPH_abstract_texts.json",
         "astroPH_feature_analysis_results_64.json",
         "csLG_topk_indices_64_9216_int32.npy",
-        "astroPH_abstract_embeddings_float16.npy"
     ]
     for file in files_to_download:
@@ -66,12 +70,6 @@ client = OpenAI(api_key=api_key)
 # Function to load data for a specific subject
 def load_subject_data(subject):
-    # embeddings_path = f"data/{subject}_abstract_embeddings.npy"
-    # texts_path = f"data/{subject}_abstract_texts.json"
-    # feature_analysis_path = f"data/{subject}_feature_analysis_results_{k}.json"
-    # metadata_path = f'data/{subject}_paper_metadata.csv'
-    # topk_indices_path = f"data/{subject}_topk_indices_{k}_{n_dirs}.npy"
-    # topk_values_path = f"data/{subject}_topk_values_{k}_{n_dirs}.npy"
     embeddings_path = f"data/{subject}_abstract_embeddings_float16.npy"
     texts_path = f"data/{subject}_abstract_texts.json"
@@ -79,15 +77,8 @@ def load_subject_data(subject):
     metadata_path = f'data/{subject}_paper_metadata.csv'
     topk_indices_path = f"data/{subject}_topk_indices_{k}_{n_dirs}_int32.npy"
     topk_values_path = f"data/{subject}_topk_values_{k}_{n_dirs}_float16.npy"
-    # abstract_embeddings = np.load(embeddings_path)
-    # with open(texts_path, 'r') as f:
-    #     abstract_texts = json.load(f)
-    # with open(feature_analysis_path, 'r') as f:
-    #     feature_analysis = json.load(f)
-    # df_metadata = pd.read_csv(metadata_path)
-    # topk_indices = np.load(topk_indices_path)
-    # topk_values = np.load(topk_values_path)
     abstract_embeddings = np.load(embeddings_path).astype(np.float32)  # Load float16 and convert to float32
     with open(texts_path, 'r') as f:
@@ -109,6 +100,14 @@ def load_subject_data(subject):
     decoder = weights['decoder.weight'].cpu().numpy()
     del weights
     return {
         'abstract_embeddings': abstract_embeddings,
         'abstract_texts': abstract_texts,
@@ -117,7 +116,9 @@ def load_subject_data(subject):
         'topk_indices': topk_indices,
         'topk_values': topk_values,
         'ae': ae,
-        'decoder': decoder
     }
 # Load data for both subjects
@@ -206,25 +207,6 @@ def get_feature_from_index(subject, index):
     feature = next((f for f in subject_data[subject]['feature_analysis'] if f['index'] == index), None)
     return feature
-# def visualize_feature(subject, index):
-#     feature = next((f for f in subject_data[subject]['feature_analysis'] if f['index'] == index), None)
-#     if feature is None:
-#         return "Invalid feature index", None, None, None, None, None, None
-#     output = f"# {feature['label']}\n\n"
-#     output += f"* Pearson correlation: {feature['pearson_correlation']:.4f}\n\n"
-#     output += f"* Density: {feature['density']:.4f}\n\n"
-#     # Top m abstracts
-#     top_m_abstracts = get_feature_activations(subject, index)
-#     # Create dataframe for top abstracts
-#     df_data = [
-#         {"Title": m[1].split('\n\n')[0], "Activation value": f"{m[2]:.4f}"}
-#         for m in top_m_abstracts
-#     ]
-#     df_top_abstracts = pd.DataFrame(df_data)
 def visualize_feature(subject, index):
     feature = next((f for f in subject_data[subject]['feature_analysis'] if f['index'] == index), None)
     if feature is None:
@@ -286,62 +268,22 @@ def visualize_feature(subject, index):
     topk_indices_cosine = np.argsort(-cosine_similarities)[:topk]
     topk_values_cosine = cosine_similarities[topk_indices_cosine]
-    # # Create dataframe for top 5 correlated features
-    # df_top_correlated = pd.DataFrame({
-    #     "Feature": [get_feature_from_index(subject, i)['label'] for i in topk_indices_cosine],
-    #     "Cosine similarity": topk_values_cosine
-    # })
-    # df_top_correlated_styled = df_top_correlated.style.format({
-    #     "Cosine similarity": "{:.4f}"
-    # })
     bottomk = 5
     bottomk_indices_cosine = np.argsort(cosine_similarities)[:bottomk]
     bottomk_values_cosine = cosine_similarities[bottomk_indices_cosine]
-    # # Create dataframe for bottom 5 correlated features
-    # df_bottom_correlated = pd.DataFrame({
-    #     "Feature": [get_feature_from_index(subject, i)['label'] for i in bottomk_indices_cosine],
-    #     "Cosine similarity": bottomk_values_cosine
-    # })
-    # df_bottom_correlated_styled = df_bottom_correlated.style.format({
-    #     "Cosine similarity": "{:.4f}"
-    # })
-    # # Co-occurrences
-    # co_occurrences = calculate_co_occurrences(subject, index)
-    # topk = 5
-    # topk_indices_co_occurrence = np.argsort(-co_occurrences)[:topk]
-    # topk_values_co_occurrence = co_occurrences[topk_indices_co_occurrence]
-    # # Create dataframe for top 5 co-occurring features
-    # df_co_occurrences = pd.DataFrame({
-    #     "Feature": [get_feature_from_index(subject, i)['label'] for i in topk_indices_co_occurrence],
-    #     "Co-occurrences": topk_values_co_occurrence
-    # })
-    # df_co_occurrences_styled = df_co_occurrences.style.format({
-    #     "Co-occurrences": "{:.4f}"
-    # })
-    # return output, styled_top_abstracts, df_top_correlated_styled, df_bottom_correlated_styled, df_co_occurrences_styled, fig2
-     # Create dataframe for top 5 correlated features
     df_top_correlated = pd.DataFrame({
         "Feature": [get_feature_from_index(subject, i)['label'] for i in topk_indices_cosine],
         "Cosine similarity": topk_values_cosine
     })
-    df_top_correlated_styled = df_top_correlated.style.format({
-        "Cosine similarity": "{:.4f}"
-    })
     # Create dataframe for bottom 5 correlated features
     df_bottom_correlated = pd.DataFrame({
         "Feature": [get_feature_from_index(subject, i)['label'] for i in bottomk_indices_cosine],
         "Cosine similarity": bottomk_values_cosine
     })
-    df_bottom_correlated_styled = df_bottom_correlated.style.format({
-        "Cosine similarity": "{:.4f}"
-    })
     # Co-occurrences
     co_occurrences = calculate_co_occurrences(subject, index)
@@ -360,97 +302,6 @@ def visualize_feature(subject, index):
     return output, styled_top_abstracts, df_top_correlated_styled, df_bottom_correlated_styled, df_co_occurrences_styled, fig2
-# def visualize_feature(subject, index):
-#     feature = next((f for f in subject_data[subject]['feature_analysis'] if f['index'] == index), None)
-#     if feature is None:
-#         return "Invalid feature index", None, None, None, None, None, None
-#     output = f"# {feature['label']}\n\n"
-#     output += f"* Pearson correlation: {feature['pearson_correlation']:.4f}\n\n"
-#     output += f"* Density: {feature['density']:.4f}\n\n"
-#     # Top m abstracts
-#     top_m_abstracts = get_feature_activations(subject, index)
-#     # Create dataframe for top abstracts with clickable links
-#     df_data = []
-#     for doc_id, abstract, activation_value in top_m_abstracts:
-#         title = abstract.split('\n\n')[0]
-#         title = title.replace('[', '').replace(']', '')
-#         title = title.replace("'", "")
-#         title = title.replace('"', '')
-#         url_id = doc_id.replace('_arXiv.txt', '')
-#         if 'astro-ph' in url_id:
-#             url_id = url_id.split('astro-ph')[1]
-#             url = f"https://arxiv.org/abs/astro-ph/{url_id}"
-#         else:
-#             if '.' in doc_id:
-#                 url = f"https://arxiv.org/abs/{url_id}"
-#             else:
-#                 url = f"https://arxiv.org/abs/hep-ph/{url_id}"
-#         linked_title = f"[{title}]({url})"
-#         df_data.append({"Title": linked_title, "Activation value": activation_value})
-#     df_top_abstracts = pd.DataFrame(df_data)
-#     # Activation value distribution
-#     topk_indices = subject_data[subject]['topk_indices']
-#     topk_values = subject_data[subject]['topk_values']
-#     activation_values = np.where(topk_indices == index, topk_values, 0).max(axis=1)
-#     fig2 = px.histogram(x=activation_values, nbins=50)
-#     fig2.update_layout(
-#         #title=f'{feature["label"]}',
-#         xaxis_title='Activation value',
-#         yaxis_title=None,
-#         yaxis_type='log',
-#         height=220,
-#     )
-#     # Correlated features
-#     decoder = subject_data[subject]['decoder']
-#     feature_vector = decoder[:, index]
-#     decoder_without_feature = np.delete(decoder, index, axis=1)
-#     cosine_similarities = np.dot(feature_vector, decoder_without_feature) / (np.linalg.norm(decoder_without_feature, axis=0) * np.linalg.norm(feature_vector))
-#     topk = 5
-#     topk_indices_cosine = np.argsort(-cosine_similarities)[:topk]
-#     topk_values_cosine = cosine_similarities[topk_indices_cosine]
-#     # Create dataframe for top 5 correlated features
-#     df_top_correlated = pd.DataFrame({
-#         "Feature": [get_feature_from_index(subject, i)['label'] for i in topk_indices_cosine],
-#         "Cosine similarity": [f"{v:.4f}" for v in topk_values_cosine]
-#     })
-#     df_top_correlated_styled = style_dataframe(df_top_correlated, is_top=True)
-#     bottomk = 5
-#     bottomk_indices_cosine = np.argsort(cosine_similarities)[:bottomk]
-#     bottomk_values_cosine = cosine_similarities[bottomk_indices_cosine]
-#     # Create dataframe for bottom 5 correlated features
-#     df_bottom_correlated = pd.DataFrame({
-#         "Feature": [get_feature_from_index(subject, i)['label'] for i in bottomk_indices_cosine],
-#         "Cosine similarity": [f"{v:.4f}" for v in bottomk_values_cosine]
-#     })
-#     df_bottom_correlated_styled = style_dataframe(df_bottom_correlated, is_top=False)
-#     # Co-occurrences
-#     co_occurrences = calculate_co_occurrences(subject, index)
-#     topk = 5
-#     topk_indices_co_occurrence = np.argsort(-co_occurrences)[:topk]
-#     topk_values_co_occurrence = co_occurrences[topk_indices_co_occurrence]
-#     # Create dataframe for top 5 co-occurring features
-#     df_co_occurrences = pd.DataFrame({
-#         "Feature": [get_feature_from_index(subject, i)['label'] for i in topk_indices_co_occurrence],
-#         "Co-occurrences": topk_values_co_occurrence
-#     })
-#     #return output, df_top_abstracts, df_top_correlated_styled, df_bottom_correlated_styled, df_co_occurrences, fig2
-#     return output, df_top_abstracts, df_top_correlated_styled, df_bottom_correlated_styled, df_co_occurrences, fig2
 # Modify the main interface function
 def create_interface():
     custom_css = """
@@ -504,16 +355,11 @@ def create_interface():
                 manually_added_features_state = gr.State([])
                 def update_search_results(feature_values, feature_indices, manually_added_features, current_subject):
-                    # ae = subject_data[current_subject]['ae']
-                    # abstract_embeddings = subject_data[current_subject]['abstract_embeddings']
-                    # abstract_texts = subject_data[current_subject]['abstract_texts']
-                    # df_metadata = subject_data[current_subject]['df_metadata']
                     ae = subject_data[current_subject]['ae']
                     abstract_embeddings = subject_data[current_subject]['abstract_embeddings']
                     abstract_texts = subject_data[current_subject]['abstract_texts']
                     df_metadata = subject_data[current_subject]['df_metadata']
                     # Combine manually added features with query-generated features
                     all_indices = []
                     all_values = []
@@ -543,35 +389,6 @@ def create_interface():
                     doc_ids = abstract_texts['doc_ids']
                     topk_doc_ids = [doc_ids[i] for i in topk_indices_search]
-                    # # Prepare search results
-                    # search_results = []
-                    # for doc_id in topk_doc_ids:
-                    #     metadata = df_metadata[df_metadata['arxiv_id'] == doc_id].iloc[0]
-                    #     title = metadata['title'].replace('[', '').replace(']', '')
-                    #     # Remove single quotes from title
-                    #     title = title.replace("'", "")
-                    #     url_id = doc_id.replace('_arXiv.txt', '')
-                    #     if 'astro-ph' in url_id:
-                    #         url_id = url_id.split('astro-ph')[1]
-                    #         url = f"https://arxiv.org/abs/astro-ph/{url_id}"
-                    #     else:
-                    #         # Create the clickable link based on the doc_id
-                    #         if '.' in doc_id:
-                    #             url = f"https://arxiv.org/abs/{doc_id.replace('_arXiv.txt', '')}"
-                    #         else:
-                    #             url = f"https://arxiv.org/abs/hep-ph/{doc_id.replace('_arXiv.txt', '')}"
-                    #     linked_title = f"[{title}]({url})"
-                    #     search_results.append([
-                    #         linked_title,
-                    #         int(metadata['citation_count']),
-                    #         int(metadata['year'])
-                    #     ])
-                    # return search_results, all_values, all_indices
                     # Prepare search results
                     search_results = []
                     for doc_id in topk_doc_ids:
@@ -704,79 +521,267 @@ def create_interface():
                     )
                     return [df, feature_search, feature_matches, add_button, update_button] + sliders
             with gr.Tab("Feature Visualisation"):
                 gr.Markdown("# Feature Visualiser")
-                with gr.Row():
-                    feature_search = gr.Textbox(label="Search Feature Labels")
-                    feature_matches = gr.CheckboxGroup(label="Matching Features", choices=[])
-                    visualize_button = gr.Button("Visualize Feature")
-                feature_info = gr.Markdown()
-                # abstracts_heading = gr.Markdown("## Top 5 Abstracts")
-                # top_abstracts = gr.Dataframe(
-                #     headers=["Title", "Activation value"],
-                #     interactive=False
-                # )
-                abstracts_heading = gr.Markdown("## Top 5 Abstracts")
-                top_abstracts = gr.Dataframe(
-                    headers=["Title", "Activation value"],
-                    datatype=["markdown", "number"],
-                    interactive=False,
-                    wrap=True
-                )
-                gr.Markdown("## Correlated Features")
-                with gr.Row():
-                    with gr.Column(scale=1):
-                        gr.Markdown("### Top 5 Correlated Features")
-                        top_correlated = gr.Dataframe(
-                            headers=["Feature", "Cosine similarity"],
-                            interactive=False
                         )
-                    with gr.Column(scale=1):
-                        gr.Markdown("### Bottom 5 Correlated Features")
-                        bottom_correlated = gr.Dataframe(
-                            headers=["Feature", "Cosine similarity"],
-                            interactive=False
                         )
-                with gr.Row():
-                    with gr.Column(scale=1):
-                        gr.Markdown("## Top 5 Co-occurring Features")
-                        co_occurring_features = gr.Dataframe(
-                            headers=["Feature", "Co-occurrences"],
-                            interactive=False
                         )
-                    with gr.Column(scale=1):
-                        gr.Markdown(f"## Activation Value Distribution")
-                        activation_dist = gr.Plot()
-                def search_feature_labels(search_text, current_subject):
-                    if not search_text:
-                        return gr.CheckboxGroup(choices=[])
-                    matches = [f"{f['label']} ({f['index']})" for f in subject_data[current_subject]['feature_analysis'] if search_text.lower() in f['label'].lower()]
-                    return gr.CheckboxGroup(choices=matches[:10])
-                feature_search.change(search_feature_labels, inputs=[feature_search, subject], outputs=[feature_matches])
-                def on_visualize(selected_features, current_subject):
-                    if not selected_features:
-                        return "Please select a feature to visualize.", None, None, None, None, None, "", []
-                    # Extract the feature index from the selected feature string
-                    feature_index = int(selected_features[0].split('(')[-1].strip(')'))
-                    feature_info, top_abstracts, top_correlated, bottom_correlated, co_occurring_features, activation_dist = visualize_feature(current_subject, feature_index)
-                    # Return the visualization results along with empty values for search box and checkbox
-                    return feature_info, top_abstracts, top_correlated, bottom_correlated, co_occurring_features, activation_dist, "", []
-                visualize_button.click(
-                    on_visualize,
-                    inputs=[feature_matches, subject],
-                    outputs=[feature_info, top_abstracts, top_correlated, bottom_correlated, co_occurring_features, activation_dist, feature_search, feature_matches]
-                )
         # Add logic to update components when subject changes
         def on_subject_change(new_subject):
@@ -797,4 +802,3 @@ def create_interface():
 if __name__ == "__main__":
     demo = create_interface()
     demo.launch()

         "astroPH_abstract_texts.json",
         "astroPH_feature_analysis_results_64.json",
         "csLG_topk_indices_64_9216_int32.npy",
+        "astroPH_abstract_embeddings_float16.npy",
+        # "csLG_clean_families_64_9216.json",
+        # "astroPH_clean_families_64_9216.json",
+        "astroPH_family_analysis_64_9216.json",
+        "csLG_family_analysis_64_9216.json"
     ]
     for file in files_to_download:
 # Function to load data for a specific subject
 def load_subject_data(subject):
     embeddings_path = f"data/{subject}_abstract_embeddings_float16.npy"
     texts_path = f"data/{subject}_abstract_texts.json"
     metadata_path = f'data/{subject}_paper_metadata.csv'
     topk_indices_path = f"data/{subject}_topk_indices_{k}_{n_dirs}_int32.npy"
     topk_values_path = f"data/{subject}_topk_values_{k}_{n_dirs}_float16.npy"
+    families_path = f"data/{subject}_clean_families_{k}_{n_dirs}.json"
+    family_analysis_path = f"data/{subject}_family_analysis_{k}_{n_dirs}.json"
     abstract_embeddings = np.load(embeddings_path).astype(np.float32)  # Load float16 and convert to float32
     with open(texts_path, 'r') as f:
     decoder = weights['decoder.weight'].cpu().numpy()
     del weights
+    # # Load feature families
+    # with open(families_path, 'r') as f:
+    #     feature_families = json.load(f)
+    with open(family_analysis_path, 'r') as f:
+        family_analysis = json.load(f)
     return {
         'abstract_embeddings': abstract_embeddings,
         'abstract_texts': abstract_texts,
         'topk_indices': topk_indices,
         'topk_values': topk_values,
         'ae': ae,
+        'decoder': decoder,
+        # 'feature_families': feature_families,
+        'family_analysis': family_analysis
     }
 # Load data for both subjects
     feature = next((f for f in subject_data[subject]['feature_analysis'] if f['index'] == index), None)
     return feature
 def visualize_feature(subject, index):
     feature = next((f for f in subject_data[subject]['feature_analysis'] if f['index'] == index), None)
     if feature is None:
     topk_indices_cosine = np.argsort(-cosine_similarities)[:topk]
     topk_values_cosine = cosine_similarities[topk_indices_cosine]
     bottomk = 5
     bottomk_indices_cosine = np.argsort(cosine_similarities)[:bottomk]
     bottomk_values_cosine = cosine_similarities[bottomk_indices_cosine]
     df_top_correlated = pd.DataFrame({
         "Feature": [get_feature_from_index(subject, i)['label'] for i in topk_indices_cosine],
         "Cosine similarity": topk_values_cosine
     })
+    df_top_correlated_styled = style_dataframe(df_top_correlated, is_top=True)
     # Create dataframe for bottom 5 correlated features
     df_bottom_correlated = pd.DataFrame({
         "Feature": [get_feature_from_index(subject, i)['label'] for i in bottomk_indices_cosine],
         "Cosine similarity": bottomk_values_cosine
     })
+    df_bottom_correlated_styled = style_dataframe(df_bottom_correlated, is_top=False)
     # Co-occurrences
     co_occurrences = calculate_co_occurrences(subject, index)
     return output, styled_top_abstracts, df_top_correlated_styled, df_bottom_correlated_styled, df_co_occurrences_styled, fig2
 # Modify the main interface function
 def create_interface():
     custom_css = """
                 manually_added_features_state = gr.State([])
                 def update_search_results(feature_values, feature_indices, manually_added_features, current_subject):
                     ae = subject_data[current_subject]['ae']
                     abstract_embeddings = subject_data[current_subject]['abstract_embeddings']
                     abstract_texts = subject_data[current_subject]['abstract_texts']
                     df_metadata = subject_data[current_subject]['df_metadata']
                     # Combine manually added features with query-generated features
                     all_indices = []
                     all_values = []
                     doc_ids = abstract_texts['doc_ids']
                     topk_doc_ids = [doc_ids[i] for i in topk_indices_search]
                     # Prepare search results
                     search_results = []
                     for doc_id in topk_doc_ids:
                     )
                     return [df, feature_search, feature_matches, add_button, update_button] + sliders
             with gr.Tab("Feature Visualisation"):
                 gr.Markdown("# Feature Visualiser")
+                with gr.Tabs():
+                    with gr.Tab("Individual Features"):
+                        with gr.Row():
+                            feature_search = gr.Textbox(label="Search Feature Labels")
+                            feature_matches = gr.CheckboxGroup(label="Matching Features", choices=[])
+                            visualize_button = gr.Button("Visualize Feature")
+                        feature_info = gr.Markdown()
+                        # abstracts_heading = gr.Markdown("## Top 5 Abstracts")
+                        # top_abstracts = gr.Dataframe(
+                        #     headers=["Title", "Activation value"],
+                        #     interactive=False
+                        # )
+                        abstracts_heading = gr.Markdown("## Top 5 Abstracts")
+                        top_abstracts = gr.Dataframe(
+                            headers=["Title", "Activation value"],
+                            datatype=["markdown", "number"],
+                            interactive=False,
+                            wrap=True
                         )
+                        gr.Markdown("## Correlated Features")
+                        with gr.Row():
+                            with gr.Column(scale=1):
+                                gr.Markdown("### Top 5 Correlated Features")
+                                top_correlated = gr.Dataframe(
+                                    headers=["Feature", "Cosine similarity"],
+                                    interactive=False
+                                )
+                            with gr.Column(scale=1):
+                                gr.Markdown("### Bottom 5 Correlated Features")
+                                bottom_correlated = gr.Dataframe(
+                                    headers=["Feature", "Cosine similarity"],
+                                    interactive=False
+                                )
+                        with gr.Row():
+                            with gr.Column(scale=1):
+                                gr.Markdown("## Top 5 Co-occurring Features")
+                                co_occurring_features = gr.Dataframe(
+                                    headers=["Feature", "Co-occurrences"],
+                                    interactive=False
+                                )
+                            with gr.Column(scale=1):
+                                gr.Markdown(f"## Activation Value Distribution")
+                                activation_dist = gr.Plot()
+                        def search_feature_labels(search_text, current_subject):
+                            if not search_text:
+                                return gr.CheckboxGroup(choices=[])
+                            matches = [f"{f['label']} ({f['index']})" for f in subject_data[current_subject]['feature_analysis'] if search_text.lower() in f['label'].lower()]
+                            return gr.CheckboxGroup(choices=matches[:10])
+                        feature_search.change(search_feature_labels, inputs=[feature_search, subject], outputs=[feature_matches])
+                        def on_visualize(selected_features, current_subject):
+                            if not selected_features:
+                                return "Please select a feature to visualize.", None, None, None, None, None, "", []
+                            # Extract the feature index from the selected feature string
+                            feature_index = int(selected_features[0].split('(')[-1].strip(')'))
+                            feature_info, top_abstracts, top_correlated, bottom_correlated, co_occurring_features, activation_dist = visualize_feature(current_subject, feature_index)
+                            # Return the visualization results along with empty values for search box and checkbox
+                            return feature_info, top_abstracts, top_correlated, bottom_correlated, co_occurring_features, activation_dist, "", []
+                        visualize_button.click(
+                            on_visualize,
+                            inputs=[feature_matches, subject],
+                            outputs=[feature_info, top_abstracts, top_correlated, bottom_correlated, co_occurring_features, activation_dist, feature_search, feature_matches]
                         )
+                        # with gr.Row():
+                        #     feature_search = gr.Textbox(label="Search Feature Labels")
+                        #     feature_matches = gr.CheckboxGroup(label="Matching Features", choices=[])
+                        #     visualize_button = gr.Button("Visualize Feature")
+                        # feature_info = gr.Markdown()
+                        # abstracts_heading = gr.Markdown("## Top 5 Abstracts")
+                        # top_abstracts = gr.Dataframe(
+                        #     headers=["Title", "Activation value"],
+                        #     datatype=["markdown", "number"],
+                        #     interactive=False,
+                        #     wrap=True
+                        # )
+                        # gr.Markdown("## Correlated Features")
+                        # with gr.Row():
+                        #     with gr.Column(scale=1):
+                        #         gr.Markdown("### Top 5 Correlated Features")
+                        #         top_correlated = gr.Dataframe(
+                        #             headers=["Feature", "Cosine similarity"],
+                        #             interactive=False
+                        #         )
+                        #     with gr.Column(scale=1):
+                        #         gr.Markdown("### Bottom 5 Correlated Features")
+                        #         bottom_correlated = gr.Dataframe(
+                        #             headers=["Feature", "Cosine similarity"],
+                        #             interactive=False
+                        #         )
+                        # with gr.Row():
+                        #     with gr.Column(scale=1):
+                        #         gr.Markdown("## Top 5 Co-occurring Features")
+                        #         co_occurring_features = gr.Dataframe(
+                        #             headers=["Feature", "Co-occurrences"],
+                        #             interactive=False
+                        #         )
+                        #     with gr.Column(scale=1):
+                        #         gr.Markdown(f"## Activation Value Distribution")
+                        #         activation_dist = gr.Plot()
+                    with gr.Tab("Feature Families"):
+                        gr.Markdown("# Feature Families")
+                        with gr.Row():
+                            family_search = gr.Textbox(label="Search Feature Families")
+                            family_matches = gr.CheckboxGroup(label="Matching Feature Families", choices=[])
+                            visualize_family_button = gr.Button("Visualize Feature Family")
+                        family_info = gr.Markdown()
+                        family_dataframe = gr.Dataframe(
+                            headers=["Feature", "F1 Score", "Pearson Correlation"],
+                            datatype=["markdown", "number", "number"],
+                            label="Family and Child Features"
+                        )
+                        # family_dataframe = gr.Dataframe(
+                        #     headers=["Feature", "F1 Score", "Pearson Correlation"],
+                        #     datatype=["str", "number", "number"],
+                        #     label="Family and Child Features"
+                        # )
+                        def search_feature_families(search_text, current_subject):
+                            family_analysis = subject_data[current_subject]['family_analysis']
+                            if not search_text:
+                                return gr.CheckboxGroup(choices=[])
+                            matches = [family['superfeature'] for family in family_analysis if search_text.lower() in family['superfeature'].lower()]
+                            return gr.CheckboxGroup(choices=matches[:10])  # Limit to top 10 matches
+                        # def visualize_feature_family(selected_families, current_subject):
+                        #     if not selected_families:
+                        #         return "Please select a feature family to visualize.", None
+                        #     selected_family = selected_families[0]  # Take the first selected family
+                        #     family_analysis = subject_data[current_subject]['family_analysis']
+                        #     family_data = next((family for family in family_analysis if family['superfeature'] == selected_family), None)
+                        #     if not family_data:
+                        #         return "Invalid feature family selected.", None
+                        #     output = f"# {family_data['superfeature']}\n\n"
+                        #     output += f"## Super Reasoning\n{family_data['super_reasoning']}\n\n"
+                        #     # Create DataFrame
+                        #     df_data = [
+                        #         {
+                        #             "Feature": family_data['superfeature'],
+                        #             "F1 Score": family_data['family_f1'],
+                        #             "Pearson Correlation": family_data['family_pearson']
+                        #         }
+                        #     ]
+                        #     for name, f1, pearson in zip(family_data['feature_names'], family_data['feature_f1'], family_data['feature_pearson']):
+                        #         df_data.append({
+                        #             "Feature": name,
+                        #             "F1 Score": f1,
+                        #             "Pearson Correlation": pearson
+                        #         })
+                        #     df = pd.DataFrame(df_data)
+                        #     return output, df
+                        # def visualize_feature_family(selected_families, current_subject):
+                        #     if not selected_families:
+                        #         return "Please select a feature family to visualize.", None, "", []
+                        #     selected_family = selected_families[0]  # Take the first selected family
+                        #     family_analysis = subject_data[current_subject]['family_analysis']
+                        #     family_data = next((family for family in family_analysis if family['superfeature'] == selected_family), None)
+                        #     if not family_data:
+                        #         return "Invalid feature family selected.", None, "", []
+                        #     output = f"# {family_data['superfeature']}\n\n"
+                        #     output += f"## Super Reasoning\n{family_data['super_reasoning']}\n\n"
+                        #     # Create DataFrame
+                        #     df_data = [
+                        #         {
+                        #             "Feature": family_data['superfeature'],
+                        #             "F1 Score": family_data['family_f1'],
+                        #             "Pearson Correlation": family_data['family_pearson']
+                        #         }
+                        #     ]
+                        #     for name, f1, pearson in zip(family_data['feature_names'], family_data['feature_f1'], family_data['feature_pearson']):
+                        #         df_data.append({
+                        #             "Feature": name,
+                        #             "F1 Score": f1,
+                        #             "Pearson Correlation": pearson
+                        #         })
+                        #     df = pd.DataFrame(df_data)
+                        #     return output, df, "", []  # Return empty string for search box and empty list for checkbox
+                        def visualize_feature_family(selected_families, current_subject):
+                            if not selected_families:
+                                return "Please select a feature family to visualize.", None, "", []
+                            selected_family = selected_families[0]  # Take the first selected family
+                            family_analysis = subject_data[current_subject]['family_analysis']
+                            family_data = next((family for family in family_analysis if family['superfeature'] == selected_family), None)
+                            if not family_data:
+                                return "Invalid feature family selected.", None, "", []
+                            output = f"# {family_data['superfeature']}\n\n"
+                            # Create DataFrame
+                            df_data = [
+                                {
+                                    "Feature": f"## {family_data['superfeature']}",
+                                    "F1 Score": round(family_data['family_f1'], 2),
+                                    "Pearson Correlation": round(family_data['family_pearson'], 4)
+                                },
+                                # {
+                                #     "Feature": "## Child Features",
+                                #     "F1 Score": None,
+                                #     "Pearson Correlation": None
+                                # }
+                            ]
+                            for name, f1, pearson in zip(family_data['feature_names'], family_data['feature_f1'], family_data['feature_pearson']):
+                                df_data.append({
+                                    "Feature": name,
+                                    "F1 Score": round(f1, 2),
+                                    "Pearson Correlation": round(pearson, 4)
+                                })
+                            df = pd.DataFrame(df_data)
+                            # Add super reasoning below the dataframe
+                            output += "## Super Reasoning\n"
+                            output += f"{family_data['super_reasoning']}\n\n"
+                            return output, df, "", []  # Return empty string for search box and empty list for checkbox
+                        family_search.change(search_feature_families, inputs=[family_search, subject], outputs=[family_matches])
+                        visualize_family_button.click(
+                            visualize_feature_family,
+                            inputs=[family_matches, subject],
+                            outputs=[family_info, family_dataframe, family_search, family_matches]
                         )
         # Add logic to update components when subject changes
         def on_subject_change(new_subject):
 # Script entry point: build the interface and launch it only when this file is run directly.
 if __name__ == "__main__":
     demo = create_interface()
     demo.launch()