saridormi committed on
Commit
216d66f
•
1 Parent(s): 6ed6ff5

render readme in the app, fix examples sliders, return plots (one for now)

Browse files
README.md CHANGED
@@ -9,10 +9,17 @@ app_file: change_visualizer.py
9
  # Commit Message Editing Visualisation ✏️🔍📊
10
 
11
  This space provides a visualization app for exploring the commit message edits datasets (🤗[expert-labeled](https://huggingface.co/datasets/JetBrains-Research/commit-msg-edits) and 🤗[synthetic](https://huggingface.co/datasets/JetBrains-Research/synthetic-commit-msg-edits))
12
- from [Towards Realistic Evaluation of Commit Message Generation by Matching Online and Offline Settings](https://arxiv.org/abs/2410.12046) paper as well as some important artifacts from our work.
13
 
14
  ## Artifacts
15
 
16
  * [`metrics_analysis.ipynb`](metrics_analysis.ipynb) contains the code for metrics calculation and analysis;
17
  * [`chart.ipynb`](chart.ipynb) contains the code for Figure 4 with edit distance distribution;
18
- * [`data_stats.ipynb`](data_stats.ipynb) contains the code for obtaining the dataset statistics from Table 1.
 
 
 
 
 
 
 
 
9
  # Commit Message Editing Visualisation ✏️🔍📊
10
 
11
  This space provides a visualization app for exploring the commit message edits datasets (🤗[expert-labeled](https://huggingface.co/datasets/JetBrains-Research/commit-msg-edits) and 🤗[synthetic](https://huggingface.co/datasets/JetBrains-Research/synthetic-commit-msg-edits))
12
+ from [Towards Realistic Evaluation of Commit Message Generation by Matching Online and Offline Settings](https://arxiv.org/abs/2410.12046) paper and also hosts important artifacts from our work.
13
 
14
  ## Artifacts
15
 
16
  * [`metrics_analysis.ipynb`](metrics_analysis.ipynb) contains the code for metrics calculation and analysis;
17
  * [`chart.ipynb`](chart.ipynb) contains the code for Figure 4 with edit distance distribution;
18
+ * [`data_stats.ipynb`](data_stats.ipynb) contains the code for obtaining the dataset statistics from Table 1;
19
+ * [`generation_steps/synthetic_backward.py`](generation_steps/synthetic_backward.py) contains the code for *Synthetic Backward* generation proposed in our paper;
20
+ * [`generation_steps/synthetic_forward.py`](generation_steps/synthetic_forward.py) contains the code for *Synthetic Forward* generation proposed in our paper.
21
+
22
+ ## Visualization
23
+
24
+ * 🔍 Click on `Examples Exploration` tab to browse through nicely-formatted examples from our dataset.
25
+ * 📊 Click on `Dataset Statistics` tab to see the major statistics for our dataset.
change_visualizer.py CHANGED
@@ -1,54 +1,117 @@
1
  import gradio as gr
 
2
 
3
  import generate_annotated_diffs
 
 
 
 
4
 
5
- df_manual = generate_annotated_diffs.manual_data_with_annotated_diffs()
6
- df_manual["end_to_start"] = False
7
- df_manual["start_to_end"] = False
8
- n_diffs_manual = len(df_manual)
9
 
10
- df_synthetic = generate_annotated_diffs.synthetic_data_with_annotated_diffs()
11
- n_diffs_synthetic = len(df_synthetic)
12
 
13
 
14
  def golden():
15
- return df_synthetic.loc[df_synthetic.is_related].loc[(df_synthetic['G_type'] == "initial") & (df_synthetic['E_type'] == "expert_labeled")]
16
 
17
 
18
  def backward():
19
- return df_synthetic.loc[df_synthetic.is_related].loc[(df_synthetic['G_type'] == "backward") & (df_synthetic['E_type'].isin(["synthetic_forward", "synthtetic_forward_from_backward"]))]
20
 
21
 
22
  def forward():
23
- return df_synthetic.loc[df_synthetic.is_related].loc[(df_synthetic['G_type'] == "initial") & (df_synthetic['E_type'] == "synthetic_forward")]
24
 
25
 
26
  def forward_from_backward():
27
- return df_synthetic.loc[df_synthetic.is_related].loc[(df_synthetic['G_type'] == "synthetic_backward") & (df_synthetic['E_type'] == "synthetic_forward_from_backward")]
28
 
29
 
30
- def synthetic():
31
- return df_synthetic[(df_synthetic['end_to_start'] == True) | (df_synthetic['start_to_end'] == True)]
32
-
 
33
 
34
  def update_dataset_view(diff_idx, df):
35
  diff_idx -= 1
36
  return (df.iloc[diff_idx]['annotated_diff'],
37
  df.iloc[diff_idx]['commit_msg_start'] if "commit_msg_start" in df.columns else df.iloc[diff_idx]['G_text'],
38
  df.iloc[diff_idx]['commit_msg_end'] if "commit_msg_end" in df.columns else df.iloc[diff_idx]['E_text'],
39
- df.iloc[diff_idx]['session'] if "session" in df.columns else "",
40
- str(df.iloc[diff_idx]['end_to_start']) if "end_to_start" in df.columns else "",
41
- str(df.iloc[diff_idx]['start_to_end']) if "start_to_end" in df.columns else "",
42
  f"https://github.com/{df.iloc[diff_idx]['repo']}/commit/{df.iloc[diff_idx]['hash']}",)
43
 
44
 
45
  def update_dataset_view_manual(diff_idx):
46
- return update_dataset_view(diff_idx, df_manual)
47
-
48
-
49
- def update_dataset_view_synthetic(diff_idx):
50
- return update_dataset_view(diff_idx, df_synthetic)
51
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
 
53
  force_light_theme_js_func = """
54
  function refresh() {
@@ -63,53 +126,73 @@ function refresh() {
63
 
64
  if __name__ == '__main__':
65
  with gr.Blocks(theme=gr.themes.Soft(), js=force_light_theme_js_func) as application:
 
 
 
66
  def dataset_view_tab(n_items):
67
  slider = gr.Slider(minimum=1, maximum=n_items, step=1, value=1,
68
  label=f"Sample number (total: {n_items})")
69
 
70
  diff_view = gr.Highlightedtext(combine_adjacent=True, color_map={'+': "green", '-': "red"})
71
- start_view = gr.Textbox(interactive=False, label="Start message", container=True)
72
- end_view = gr.Textbox(interactive=False, label="End message", container=True)
73
- session_view = gr.Textbox(interactive=False, label="Session", container=True)
74
- is_end_to_start_view = gr.Textbox(interactive=False,
75
- label="Is generated via backward synthetic generation?",
76
- container=True)
77
- is_start_to_end_view = gr.Textbox(interactive=False,
78
- label="Is generated via forward synthetic generation?",
79
- container=True)
80
  link_view = gr.Markdown()
81
 
82
  view = [
83
  diff_view,
84
  start_view,
85
  end_view,
86
- session_view,
87
- is_end_to_start_view,
88
- is_start_to_end_view,
89
  link_view
90
  ]
91
 
92
  return slider, view
93
 
 
 
 
 
 
 
 
94
 
95
- with gr.Tab("Manual"):
96
- slider_manual, view_manual = dataset_view_tab(n_diffs_manual)
97
 
98
- slider_manual.change(update_dataset_view_manual,
99
- inputs=slider_manual,
100
- outputs=view_manual)
101
 
102
- with gr.Tab("Synthetic"):
103
- slider_synthetic, view_synthetic = dataset_view_tab(n_diffs_synthetic)
104
 
105
- slider_synthetic.change(update_dataset_view_synthetic,
106
- inputs=slider_synthetic,
107
- outputs=view_synthetic)
 
 
 
 
 
 
 
 
 
 
 
 
108
 
109
  application.load(update_dataset_view_manual, inputs=slider_manual,
110
  outputs=view_manual)
111
 
112
- application.load(update_dataset_view_synthetic, inputs=slider_synthetic,
113
- outputs=view_synthetic)
 
 
 
 
 
 
 
 
114
 
115
  application.launch()
 
1
  import gradio as gr
2
+ import pandas as pd
3
 
4
  import generate_annotated_diffs
5
+ from evaluate.utils import parse_readme
6
+ import numpy as np
7
+ from scipy.stats import gaussian_kde
8
+ import plotly.graph_objects as go
9
 
10
+ from generation_steps.metrics_analysis import edit_distance_fn
 
 
 
11
 
12
+ df = generate_annotated_diffs.data_with_annotated_diffs()
 
13
 
14
 
15
  def golden():
16
+ return df.loc[(df['G_type'] == "initial") & (df['E_type'] == "expert_labeled")].reset_index(drop=True)
17
 
18
 
19
  def backward():
20
+ return df.loc[(df['G_type'] == "synthetic_backward") & (df['E_type'] == "expert_labeled")].reset_index(drop=True)
21
 
22
 
23
  def forward():
24
+ return df.loc[(df['G_type'] == "initial") & (df['E_type'] == "synthetic_forward")].reset_index(drop=True)
25
 
26
 
27
  def forward_from_backward():
28
+ return df.loc[(df.G_type == "synthetic_backward") & (df.E_type.isin(["synthetic_forward", "synthetic_forward_from_backward"]))].reset_index(drop=True)
29
 
30
 
31
+ n_diffs_manual = len(golden())
32
+ n_diffs_synthetic_backward = len(backward())
33
+ n_diffs_synthetic_forward = len(forward())
34
+ n_diffs_synthetic_forward_backward = len(forward_from_backward())
35
 
36
  def update_dataset_view(diff_idx, df):
37
  diff_idx -= 1
38
  return (df.iloc[diff_idx]['annotated_diff'],
39
  df.iloc[diff_idx]['commit_msg_start'] if "commit_msg_start" in df.columns else df.iloc[diff_idx]['G_text'],
40
  df.iloc[diff_idx]['commit_msg_end'] if "commit_msg_end" in df.columns else df.iloc[diff_idx]['E_text'],
 
 
 
41
  f"https://github.com/{df.iloc[diff_idx]['repo']}/commit/{df.iloc[diff_idx]['hash']}",)
42
 
43
 
44
  def update_dataset_view_manual(diff_idx):
45
+ return update_dataset_view(diff_idx, golden())
46
+
47
+
48
+ def update_dataset_view_synthetic_backward(diff_idx):
49
+ return update_dataset_view(diff_idx, backward())
50
+
51
+
52
+ def update_dataset_view_synthetic_forward(diff_idx):
53
+ return update_dataset_view(diff_idx, forward())
54
+
55
+ def update_dataset_view_synthetic_forward_backward(diff_idx):
56
+ return update_dataset_view(diff_idx, forward_from_backward())
57
+
58
+
59
+ def edit_distance_plot():
60
+ df_edit_distance = {"Full": [edit_distance_fn(pred=row["G_text"], ref=row["E_text"]) for _, row in
61
+ df.iterrows()],
62
+ "Synthetic Backward": [edit_distance_fn(pred=row["G_text"], ref=row["E_text"]) for
63
+ _, row in backward().iterrows()],
64
+ "Synthetic Forward": [edit_distance_fn(pred=row["G_text"], ref=row["E_text"]) for
65
+ _, row in pd.concat([forward(), forward_from_backward()], axis=0,
66
+ ignore_index=True).iterrows()],
67
+ "Expert-labeled": [edit_distance_fn(pred=row["G_text"], ref=row["E_text"]) for
68
+ _, row in golden().iterrows()]
69
+ }
70
+
71
+ colors = {"Expert-labeled": "#C19C0B",
72
+ "Synthetic Backward": "#913632",
73
+ "Synthetic Forward": "#58136a",
74
+ "Full": "#000000"}
75
+ traces = []
76
+
77
+ for key in df_edit_distance:
78
+ kde_x = np.linspace(0, 1200, 1000)
79
+ kde = gaussian_kde(df_edit_distance[key])
80
+ kde_line = go.Scatter(
81
+ x=kde_x,
82
+ y=kde(kde_x),
83
+ mode='lines',
84
+ name=key,
85
+ line=dict(color=colors[key], width=5)
86
+ )
87
+ traces.append(kde_line)
88
+
89
+ fig = go.Figure(data=traces)
90
+
91
+ fig.update_layout(
92
+ bargap=0.1,
93
+ xaxis=dict(
94
+ title=dict(text="Edit Distance", font=dict(size=30)),
95
+ range=[0, 1200],
96
+ showgrid=True,
97
+ gridcolor='lightgrey'
98
+ ),
99
+ yaxis=dict(
100
+ title=dict(text="Probability Density", font=dict(size=30)),
101
+ range=[0, 0.004],
102
+ showgrid=True,
103
+ gridcolor='lightgrey',
104
+ tickvals=[0.0005, 0.001, 0.0015, 0.002, 0.0025, 0.003, 0.0035, 0.004],
105
+ tickformat=".4f"
106
+ ),
107
+ plot_bgcolor='rgba(0,0,0,0)',
108
+ paper_bgcolor='rgba(0,0,0,0)',
109
+ font=dict(size=24),
110
+ legend=dict(font=dict(size=30)),
111
+ width=1600,
112
+ height=600,
113
+ )
114
+ return fig
115
 
116
  force_light_theme_js_func = """
117
  function refresh() {
 
126
 
127
  if __name__ == '__main__':
128
  with gr.Blocks(theme=gr.themes.Soft(), js=force_light_theme_js_func) as application:
129
+
130
+ gr.Markdown(parse_readme("README.md"))
131
+
132
  def dataset_view_tab(n_items):
133
  slider = gr.Slider(minimum=1, maximum=n_items, step=1, value=1,
134
  label=f"Sample number (total: {n_items})")
135
 
136
  diff_view = gr.Highlightedtext(combine_adjacent=True, color_map={'+': "green", '-': "red"})
137
+ start_view = gr.Textbox(interactive=False, label="Initial message G", container=True)
138
+ end_view = gr.Textbox(interactive=False, label="Edited message E", container=True)
 
 
 
 
 
 
 
139
  link_view = gr.Markdown()
140
 
141
  view = [
142
  diff_view,
143
  start_view,
144
  end_view,
 
 
 
145
  link_view
146
  ]
147
 
148
  return slider, view
149
 
150
+ with gr.Tab("Examples Exploration"):
151
+ with gr.Tab("Manual"):
152
+ slider_manual, view_manual = dataset_view_tab(n_diffs_manual)
153
+
154
+ slider_manual.change(update_dataset_view_manual,
155
+ inputs=slider_manual,
156
+ outputs=view_manual)
157
 
158
+ with gr.Tab("Synthetic Backward"):
159
+ slider_synthetic_backward, view_synthetic_backward = dataset_view_tab(n_diffs_synthetic_backward)
160
 
161
+ slider_synthetic_backward.change(update_dataset_view_synthetic_backward,
162
+ inputs=slider_synthetic_backward,
163
+ outputs=view_synthetic_backward)
164
 
165
+ with gr.Tab("Synthetic Forward (from initial)"):
166
+ slider_synthetic_forward, view_synthetic_forward = dataset_view_tab(n_diffs_synthetic_forward)
167
 
168
+ slider_synthetic_forward.change(update_dataset_view_synthetic_forward,
169
+ inputs=slider_synthetic_forward,
170
+ outputs=view_synthetic_forward)
171
+
172
+ with gr.Tab("Synthetic Forward (from backward)"):
173
+ slider_synthetic_forward_backward, view_synthetic_forward_backward = dataset_view_tab(n_diffs_synthetic_forward_backward)
174
+
175
+ slider_synthetic_forward_backward.change(update_dataset_view_synthetic_forward_backward,
176
+ inputs=slider_synthetic_forward_backward,
177
+ outputs=view_synthetic_forward_backward)
178
+
179
+ with gr.Tab("Dataset Statistics"):
180
+ gr.Markdown("## Edit Distance Distribution (w/o PyCharm Logs)")
181
+
182
+ edit_distance_gr_plot = gr.Plot()
183
 
184
  application.load(update_dataset_view_manual, inputs=slider_manual,
185
  outputs=view_manual)
186
 
187
+ application.load(update_dataset_view_synthetic_backward, inputs=slider_synthetic_backward,
188
+ outputs=view_synthetic_backward)
189
+
190
+ application.load(update_dataset_view_synthetic_forward, inputs=slider_synthetic_forward,
191
+ outputs=view_synthetic_forward)
192
+
193
+ application.load(update_dataset_view_synthetic_forward_backward, inputs=slider_synthetic_forward_backward,
194
+ outputs=view_synthetic_forward_backward)
195
+
196
+ application.load(edit_distance_plot, outputs=edit_distance_gr_plot)
197
 
198
  application.launch()
generate_annotated_diffs.py CHANGED
@@ -32,19 +32,11 @@ def annotated_diff_for_row(row):
32
  return get_annotated_diff(start, end)
33
 
34
 
35
- def manual_data_with_annotated_diffs():
36
- tqdm.pandas()
37
-
38
- df = hf_data_loader.load_raw_rewriting_as_pandas()
39
- annotated = df.progress_apply(annotated_diff_for_row, axis=1)
40
- df['annotated_diff'] = annotated
41
- return df
42
-
43
-
44
- def synthetic_data_with_annotated_diffs():
45
  tqdm.pandas()
46
 
47
  df = hf_data_loader.load_synthetic_as_pandas()
 
48
  annotated = df.progress_apply(annotated_diff_for_row, axis=1)
49
  df['annotated_diff'] = annotated
50
  return df
 
32
  return get_annotated_diff(start, end)
33
 
34
 
35
+ def data_with_annotated_diffs():
 
 
 
 
 
 
 
 
 
36
  tqdm.pandas()
37
 
38
  df = hf_data_loader.load_synthetic_as_pandas()
39
+ df = df.loc[df.is_related].copy()
40
  annotated = df.progress_apply(annotated_diff_for_row, axis=1)
41
  df['annotated_diff'] = annotated
42
  return df
generation_steps/for_labeling.py CHANGED
@@ -4,12 +4,12 @@ from tqdm import tqdm
4
 
5
  import config
6
  from api_wrappers import hf_data_loader
7
- from generation_steps import synthetic_start_to_end
8
 
9
 
10
  def transform(df):
11
  print(f"Generating data for labeling:")
12
- synthetic_start_to_end.print_config()
13
  tqdm.pandas()
14
 
15
  manual_df = hf_data_loader.load_raw_rewriting_as_pandas()
@@ -36,7 +36,7 @@ def transform(df):
36
  commit_id = (row['hash'], row['repo'])
37
  if row['manual_sample']:
38
  return manual_df.loc[commit_id]['commit_msg_end']
39
- return synthetic_start_to_end.generate_end_msg(start_msg=row["prediction"],
40
  diff=row["mods"])
41
 
42
  result['enhanced'] = result.progress_apply(get_enhanced_message, axis=1)
@@ -49,7 +49,7 @@ def transform(df):
49
 
50
 
51
  def main():
52
- synthetic_start_to_end.GENERATION_ATTEMPTS = 3
53
  df = hf_data_loader.load_full_commit_with_predictions_as_pandas()
54
  transform(df)
55
 
 
4
 
5
  import config
6
  from api_wrappers import hf_data_loader
7
+ from generation_steps import synthetic_forward
8
 
9
 
10
  def transform(df):
11
  print(f"Generating data for labeling:")
12
+ synthetic_forward.print_config()
13
  tqdm.pandas()
14
 
15
  manual_df = hf_data_loader.load_raw_rewriting_as_pandas()
 
36
  commit_id = (row['hash'], row['repo'])
37
  if row['manual_sample']:
38
  return manual_df.loc[commit_id]['commit_msg_end']
39
+ return synthetic_forward.generate_end_msg(start_msg=row["prediction"],
40
  diff=row["mods"])
41
 
42
  result['enhanced'] = result.progress_apply(get_enhanced_message, axis=1)
 
49
 
50
 
51
  def main():
52
+ synthetic_forward.GENERATION_ATTEMPTS = 3
53
  df = hf_data_loader.load_full_commit_with_predictions_as_pandas()
54
  transform(df)
55