Spaces:

JetBrains-Research
/

commit-message-editing-visualization

Sleeping

App Files Files Community

saridormi committed 27 days ago

Commit

216d66f

•

1 Parent(s): 6ed6ff5

render readme in the app, fix examples sliders, return plots (one for now)

Browse files

Files changed (4) hide show

README.md +9 -2
change_visualizer.py +129 -46
generate_annotated_diffs.py +2 -10
generation_steps/for_labeling.py +4 -4

README.md CHANGED Viewed

@@ -9,10 +9,17 @@ app_file: change_visualizer.py
 # Commit Message Editing Visualisation ✍️🔍📊
 This space provides a visualization app for exploring the commit message edits datasets (🤗[expert-labeled](https://huggingface.co/datasets/JetBrains-Research/commit-msg-edits) and 🤗[synthetic](https://huggingface.co/datasets/JetBrains-Research/synthetic-commit-msg-edits))
-from [Towards Realistic Evaluation of Commit Message Generation by Matching Online and Offline Settings](https://arxiv.org/abs/2410.12046) paper as well as some important artifacts from our work.
 ## Artifacts
 * [`metrics_analysis.ipynb`](metrics_analysis.ipynb) contains the code for metrics calculation and analysis;
 * [`chart.ipynb`](chart.ipynb) contains the code for Figure 4 with edit distance distribution;
-* [`data_stats.ipynb`](data_stats.ipynb) contains the code for obtaining the dataset statistics from Table 1.

 # Commit Message Editing Visualisation ✍️🔍📊
 This space provides a visualization app for exploring the commit message edits datasets (🤗[expert-labeled](https://huggingface.co/datasets/JetBrains-Research/commit-msg-edits) and 🤗[synthetic](https://huggingface.co/datasets/JetBrains-Research/synthetic-commit-msg-edits))
+from the [Towards Realistic Evaluation of Commit Message Generation by Matching Online and Offline Settings](https://arxiv.org/abs/2410.12046) paper and also hosts important artifacts from our work.
 ## Artifacts
 * [`metrics_analysis.ipynb`](metrics_analysis.ipynb) contains the code for metrics calculation and analysis;
 * [`chart.ipynb`](chart.ipynb) contains the code for Figure 4 with edit distance distribution;
+* [`data_stats.ipynb`](data_stats.ipynb) contains the code for obtaining the dataset statistics from Table 1;
+* [`generation_steps/synthetic_backward.py`](generation_steps/synthetic_backward.py) contains the code for *Synthetic Backward* generation proposed in our paper;
+* [`generation_steps/synthetic_forward.py`](generation_steps/synthetic_forward.py) contains the code for *Synthetic Forward* generation proposed in our paper.
+## Visualization
+* 🔍 Click on the `Examples Exploration` tab to browse through nicely formatted examples from our dataset.
+* 📊 Click on the `Dataset Statistics` tab to see the major statistics for our dataset.

change_visualizer.py CHANGED Viewed

@@ -1,54 +1,117 @@
 import gradio as gr
 import generate_annotated_diffs
-df_manual = generate_annotated_diffs.manual_data_with_annotated_diffs()
-df_manual["end_to_start"] = False
-df_manual["start_to_end"] = False
-n_diffs_manual = len(df_manual)
-df_synthetic = generate_annotated_diffs.synthetic_data_with_annotated_diffs()
-n_diffs_synthetic = len(df_synthetic)
 def golden():
-    return df_synthetic.loc[df_synthetic.is_related].loc[(df_synthetic['G_type'] == "initial") & (df_synthetic['E_type'] == "expert_labeled")]
 def backward():
-    return df_synthetic.loc[df_synthetic.is_related].loc[(df_synthetic['G_type'] == "backward") & (df_synthetic['E_type'].isin(["synthetic_forward", "synthtetic_forward_from_backward"]))]
 def forward():
-    return df_synthetic.loc[df_synthetic.is_related].loc[(df_synthetic['G_type'] == "initial") & (df_synthetic['E_type'] == "synthetic_forward")]
 def forward_from_backward():
-    return df_synthetic.loc[df_synthetic.is_related].loc[(df_synthetic['G_type'] == "synthetic_backward") & (df_synthetic['E_type'] == "synthetic_forward_from_backward")]
-def synthetic():
-    return df_synthetic[(df_synthetic['end_to_start'] == True) | (df_synthetic['start_to_end'] == True)]
 def update_dataset_view(diff_idx, df):
     diff_idx -= 1
     return (df.iloc[diff_idx]['annotated_diff'],
             df.iloc[diff_idx]['commit_msg_start'] if "commit_msg_start" in df.columns else df.iloc[diff_idx]['G_text'],
             df.iloc[diff_idx]['commit_msg_end'] if "commit_msg_end" in df.columns else df.iloc[diff_idx]['E_text'],
-            df.iloc[diff_idx]['session'] if "session" in df.columns else "",
-            str(df.iloc[diff_idx]['end_to_start']) if "end_to_start" in df.columns else "",
-            str(df.iloc[diff_idx]['start_to_end']) if "start_to_end" in df.columns else "",
             f"https://github.com/{df.iloc[diff_idx]['repo']}/commit/{df.iloc[diff_idx]['hash']}",)
 def update_dataset_view_manual(diff_idx):
-    return update_dataset_view(diff_idx, df_manual)
-def update_dataset_view_synthetic(diff_idx):
-    return update_dataset_view(diff_idx, df_synthetic)
 force_light_theme_js_func = """
 function refresh() {
@@ -63,53 +126,73 @@ function refresh() {
 if __name__ == '__main__':
     with gr.Blocks(theme=gr.themes.Soft(), js=force_light_theme_js_func) as application:
         def dataset_view_tab(n_items):
             slider = gr.Slider(minimum=1, maximum=n_items, step=1, value=1,
                                label=f"Sample number (total: {n_items})")
             diff_view = gr.Highlightedtext(combine_adjacent=True, color_map={'+': "green", '-': "red"})
-            start_view = gr.Textbox(interactive=False, label="Start message", container=True)
-            end_view = gr.Textbox(interactive=False, label="End message", container=True)
-            session_view = gr.Textbox(interactive=False, label="Session", container=True)
-            is_end_to_start_view = gr.Textbox(interactive=False,
-                                              label="Is generated via backward synthetic generation?",
-                                              container=True)
-            is_start_to_end_view = gr.Textbox(interactive=False,
-                                              label="Is generated via forward synthetic generation?",
-                                              container=True)
             link_view = gr.Markdown()
             view = [
                 diff_view,
                 start_view,
                 end_view,
-                session_view,
-                is_end_to_start_view,
-                is_start_to_end_view,
                 link_view
             ]
             return slider, view
-        with gr.Tab("Manual"):
-            slider_manual, view_manual = dataset_view_tab(n_diffs_manual)
-            slider_manual.change(update_dataset_view_manual,
-                                 inputs=slider_manual,
-                                 outputs=view_manual)
-        with gr.Tab("Synthetic"):
-            slider_synthetic, view_synthetic = dataset_view_tab(n_diffs_synthetic)
-            slider_synthetic.change(update_dataset_view_synthetic,
-                                    inputs=slider_synthetic,
-                                    outputs=view_synthetic)
         application.load(update_dataset_view_manual, inputs=slider_manual,
                          outputs=view_manual)
-        application.load(update_dataset_view_synthetic, inputs=slider_synthetic,
-                         outputs=view_synthetic)
     application.launch()

 import gradio as gr
+import pandas as pd
 import generate_annotated_diffs
+from evaluate.utils import parse_readme
+import numpy as np
+from scipy.stats import gaussian_kde
+import plotly.graph_objects as go
+from generation_steps.metrics_analysis import edit_distance_fn
# Full dataset with an 'annotated_diff' column, loaded once at import time.
df = generate_annotated_diffs.data_with_annotated_diffs()


def golden():
    """Return rows whose G is an initial message and whose E is expert-labeled."""
    from_initial = df['G_type'] == "initial"
    expert_edited = df['E_type'] == "expert_labeled"
    return df[from_initial & expert_edited].reset_index(drop=True)


def backward():
    """Return rows whose G is synthetic-backward and whose E is expert-labeled."""
    from_backward = df['G_type'] == "synthetic_backward"
    expert_edited = df['E_type'] == "expert_labeled"
    return df[from_backward & expert_edited].reset_index(drop=True)


def forward():
    """Return rows whose G is an initial message and whose E is synthetic-forward."""
    from_initial = df['G_type'] == "initial"
    forward_edited = df['E_type'] == "synthetic_forward"
    return df[from_initial & forward_edited].reset_index(drop=True)


def forward_from_backward():
    """Return rows whose G is synthetic-backward and whose E is either forward variant."""
    forward_variants = ["synthetic_forward", "synthetic_forward_from_backward"]
    from_backward = df.G_type == "synthetic_backward"
    forward_edited = df.E_type.isin(forward_variants)
    return df[from_backward & forward_edited].reset_index(drop=True)


# Subset sizes; each one is the upper bound of the matching sample slider.
n_diffs_manual = len(golden())
n_diffs_synthetic_backward = len(backward())
n_diffs_synthetic_forward = len(forward())
n_diffs_synthetic_forward_backward = len(forward_from_backward())
 def update_dataset_view(diff_idx, df):
     """Return the values shown for the sample at 1-based position *diff_idx*.

     Args:
         diff_idx: 1-based sample number (the slider value). Integral floats
             such as ``2.0`` are accepted and truncated with ``int()``.
         df: DataFrame holding the samples. It must provide 'annotated_diff',
             'repo' and 'hash', plus either 'commit_msg_start'/'commit_msg_end'
             or 'G_text'/'E_text' for the two messages.

     Returns:
         A 4-tuple ``(annotated_diff, start_message, end_message, commit_url)``.

     Raises:
         IndexError: if *diff_idx* is outside ``1..len(df)``. Position 0 used
             to wrap around silently to the last row.
     """
     position = int(diff_idx) - 1
     if not 0 <= position < len(df):
         raise IndexError(f"sample number {diff_idx} is out of range 1..{len(df)}")
     # Fetch the row once instead of re-indexing the frame for every field.
     row = df.iloc[position]
     start_msg = row['commit_msg_start'] if "commit_msg_start" in df.columns else row['G_text']
     end_msg = row['commit_msg_end'] if "commit_msg_end" in df.columns else row['E_text']
     return (row['annotated_diff'],
             start_msg,
             end_msg,
             f"https://github.com/{row['repo']}/commit/{row['hash']}",)
 def update_dataset_view_manual(diff_idx):
     """Return the view values for sample *diff_idx* of the golden() subset."""
     subset = golden()
     return update_dataset_view(diff_idx, subset)


 def update_dataset_view_synthetic_backward(diff_idx):
     """Return the view values for sample *diff_idx* of the backward() subset."""
     subset = backward()
     return update_dataset_view(diff_idx, subset)


 def update_dataset_view_synthetic_forward(diff_idx):
     """Return the view values for sample *diff_idx* of the forward() subset."""
     subset = forward()
     return update_dataset_view(diff_idx, subset)


 def update_dataset_view_synthetic_forward_backward(diff_idx):
     """Return the view values for sample *diff_idx* of the forward_from_backward() subset."""
     subset = forward_from_backward()
     return update_dataset_view(diff_idx, subset)
def _edit_distances(frame):
    """Return edit_distance_fn(G_text, E_text) for every row of *frame*, in row order."""
    return [edit_distance_fn(pred=g_text, ref=e_text)
            for g_text, e_text in zip(frame["G_text"], frame["E_text"])]


def edit_distance_plot():
    """Build a Plotly figure with one edit-distance density curve per dataset subset.

    For the full dataset and for the synthetic-backward, synthetic-forward
    (both forward variants concatenated) and expert-labeled subsets, the edit
    distance between ``G_text`` and ``E_text`` is computed per row, a Gaussian
    KDE is fitted to those distances, and the density is drawn as a line over
    the range 0..1200.

    Subsets with fewer than two samples are skipped, because ``gaussian_kde``
    cannot be fitted on them and would abort the whole figure.

    Returns:
        plotly.graph_objects.Figure: the figure holding the density traces.
    """
    subsets = {
        "Full": df,
        "Synthetic Backward": backward(),
        "Synthetic Forward": pd.concat([forward(), forward_from_backward()], axis=0,
                                       ignore_index=True),
        "Expert-labeled": golden(),
    }
    colors = {"Expert-labeled": "#C19C0B",
              "Synthetic Backward": "#913632",
              "Synthetic Forward": "#58136a",
              "Full": "#000000"}

    # The evaluation grid is identical for every curve, so build it once.
    kde_x = np.linspace(0, 1200, 1000)

    traces = []
    for key, frame in subsets.items():
        distances = _edit_distances(frame)
        if len(distances) < 2:
            # gaussian_kde needs at least two samples; skip this curve.
            continue
        kde = gaussian_kde(distances)
        traces.append(go.Scatter(
            x=kde_x,
            y=kde(kde_x),
            mode='lines',
            name=key,
            line=dict(color=colors[key], width=5)
        ))

    fig = go.Figure(data=traces)
    fig.update_layout(
        bargap=0.1,
        xaxis=dict(
            title=dict(text="Edit Distance", font=dict(size=30)),
            range=[0, 1200],
            showgrid=True,
            gridcolor='lightgrey'
        ),
        yaxis=dict(
            title=dict(text="Probability Density", font=dict(size=30)),
            range=[0, 0.004],
            showgrid=True,
            gridcolor='lightgrey',
            tickvals=[0.0005, 0.001, 0.0015, 0.002, 0.0025, 0.003, 0.0035, 0.004],
            tickformat=".4f"
        ),
        # Transparent backgrounds so the plot blends with the page theme.
        plot_bgcolor='rgba(0,0,0,0)',
        paper_bgcolor='rgba(0,0,0,0)',
        font=dict(size=24),
        legend=dict(font=dict(size=30)),
        width=1600,
        height=600,
    )
    return fig
 force_light_theme_js_func = """
 function refresh() {
 # Script entry point: build the Gradio UI, wire its events, then launch the app.
 if __name__ == '__main__':
     with gr.Blocks(theme=gr.themes.Soft(), js=force_light_theme_js_func) as application:
         # Show the text returned by parse_readme("README.md") at the top of the page.
+        gr.Markdown(parse_readme("README.md"))
         # Creates the widgets of one example-browsing tab and returns
         # (slider, [diff, initial message, edited message, link]).
         # The list order must match the tuple returned by update_dataset_view.
         def dataset_view_tab(n_items):
             # 1-based sample selector covering all n_items samples.
             slider = gr.Slider(minimum=1, maximum=n_items, step=1, value=1,
                                label=f"Sample number (total: {n_items})")
             diff_view = gr.Highlightedtext(combine_adjacent=True, color_map={'+': "green", '-': "red"})
+            start_view = gr.Textbox(interactive=False, label="Initial message G", container=True)
+            end_view = gr.Textbox(interactive=False, label="Edited message E", container=True)
             link_view = gr.Markdown()
             view = [
                 diff_view,
                 start_view,
                 end_view,
                 link_view
             ]
             return slider, view
         # One nested tab per subset; moving a slider refreshes that tab's view.
+        with gr.Tab("Examples Exploration"):
+            with gr.Tab("Manual"):
+                slider_manual, view_manual = dataset_view_tab(n_diffs_manual)
+                slider_manual.change(update_dataset_view_manual,
+                                     inputs=slider_manual,
+                                     outputs=view_manual)
+            with gr.Tab("Synthetic Backward"):
+                slider_synthetic_backward, view_synthetic_backward = dataset_view_tab(n_diffs_synthetic_backward)
+                slider_synthetic_backward.change(update_dataset_view_synthetic_backward,
+                                        inputs=slider_synthetic_backward,
+                                        outputs=view_synthetic_backward)
+            with gr.Tab("Synthetic Forward (from initial)"):
+                slider_synthetic_forward, view_synthetic_forward = dataset_view_tab(n_diffs_synthetic_forward)
+                slider_synthetic_forward.change(update_dataset_view_synthetic_forward,
+                                        inputs=slider_synthetic_forward,
+                                        outputs=view_synthetic_forward)
+            with gr.Tab("Synthetic Forward (from backward)"):
+                slider_synthetic_forward_backward, view_synthetic_forward_backward = dataset_view_tab(n_diffs_synthetic_forward_backward)
+                slider_synthetic_forward_backward.change(update_dataset_view_synthetic_forward_backward,
+                                        inputs=slider_synthetic_forward_backward,
+                                        outputs=view_synthetic_forward_backward)
         # The plot component starts empty; the load handler registered below fills it.
+        with gr.Tab("Dataset Statistics"):
+            gr.Markdown("## Edit Distance Distribution (w/o PyCharm Logs)")
+            edit_distance_gr_plot = gr.Plot()
         # Load handlers fill every view from the slider's initial value (1)
         # and render the plot, so nothing is blank before the first interaction.
         application.load(update_dataset_view_manual, inputs=slider_manual,
                          outputs=view_manual)
+        application.load(update_dataset_view_synthetic_backward, inputs=slider_synthetic_backward,
+                         outputs=view_synthetic_backward)
+        application.load(update_dataset_view_synthetic_forward, inputs=slider_synthetic_forward,
+                         outputs=view_synthetic_forward)
+        application.load(update_dataset_view_synthetic_forward_backward, inputs=slider_synthetic_forward_backward,
+                         outputs=view_synthetic_forward_backward)
+        application.load(edit_distance_plot, outputs=edit_distance_gr_plot)
     # Launch only after the Blocks context has been closed.
     application.launch()

generate_annotated_diffs.py CHANGED Viewed

@@ -32,19 +32,11 @@ def annotated_diff_for_row(row):
     return get_annotated_diff(start, end)
-def manual_data_with_annotated_diffs():
-    tqdm.pandas()
-    df = hf_data_loader.load_raw_rewriting_as_pandas()
-    annotated = df.progress_apply(annotated_diff_for_row, axis=1)
-    df['annotated_diff'] = annotated
-    return df
-def synthetic_data_with_annotated_diffs():
     tqdm.pandas()
     df = hf_data_loader.load_synthetic_as_pandas()
     annotated = df.progress_apply(annotated_diff_for_row, axis=1)
     df['annotated_diff'] = annotated
     return df

     return get_annotated_diff(start, end)
def data_with_annotated_diffs():
    """Load the synthetic dataset, keep related rows, and add an 'annotated_diff' column.

    Returns:
        A DataFrame holding only the rows where ``is_related`` is truthy, with
        the result of ``annotated_diff_for_row`` stored per row in the new
        'annotated_diff' column.
    """
    # Must run before progress_apply is used below.
    tqdm.pandas()
    loaded = hf_data_loader.load_synthetic_as_pandas()
    # Copy the filtered rows so the column assignment does not target a slice.
    related = loaded[loaded.is_related].copy()
    related['annotated_diff'] = related.progress_apply(annotated_diff_for_row, axis=1)
    return related

generation_steps/for_labeling.py CHANGED Viewed

@@ -4,12 +4,12 @@ from tqdm import tqdm
 import config
 from api_wrappers import hf_data_loader
-from generation_steps import synthetic_start_to_end
 def transform(df):
     print(f"Generating data for labeling:")
-    synthetic_start_to_end.print_config()
     tqdm.pandas()
     manual_df = hf_data_loader.load_raw_rewriting_as_pandas()
@@ -36,7 +36,7 @@ def transform(df):
         commit_id = (row['hash'], row['repo'])
         if row['manual_sample']:
             return manual_df.loc[commit_id]['commit_msg_end']
-        return synthetic_start_to_end.generate_end_msg(start_msg=row["prediction"],
                                                        diff=row["mods"])
     result['enhanced'] = result.progress_apply(get_enhanced_message, axis=1)
@@ -49,7 +49,7 @@ def transform(df):
 def main():
-    synthetic_start_to_end.GENERATION_ATTEMPTS = 3
     df = hf_data_loader.load_full_commit_with_predictions_as_pandas()
     transform(df)

 import config
 from api_wrappers import hf_data_loader
+from generation_steps import synthetic_forward
 def transform(df):
     print(f"Generating data for labeling:")
+    synthetic_forward.print_config()
     tqdm.pandas()
     manual_df = hf_data_loader.load_raw_rewriting_as_pandas()
         commit_id = (row['hash'], row['repo'])
         if row['manual_sample']:
             return manual_df.loc[commit_id]['commit_msg_end']
+        return synthetic_forward.generate_end_msg(start_msg=row["prediction"],
                                                        diff=row["mods"])
     result['enhanced'] = result.progress_apply(get_enhanced_message, axis=1)
 def main():
+    synthetic_forward.GENERATION_ATTEMPTS = 3
     df = hf_data_loader.load_full_commit_with_predictions_as_pandas()
     transform(df)