render readme in the app, fix examples sliders, return plots (one for now)
Browse files- README.md +9 -2
- change_visualizer.py +129 -46
- generate_annotated_diffs.py +2 -10
- generation_steps/for_labeling.py +4 -4
README.md
CHANGED
@@ -9,10 +9,17 @@ app_file: change_visualizer.py
|
|
9 |
# Commit Message Editing Visualisation βοΈππ
|
10 |
|
11 |
This space provides a visualization app for exploring the commit message edits datasets (🤗[expert-labeled](https://huggingface.co/datasets/JetBrains-Research/commit-msg-edits) and 🤗[synthetic](https://huggingface.co/datasets/JetBrains-Research/synthetic-commit-msg-edits))
|
12 |
-
from [Towards Realistic Evaluation of Commit Message Generation by Matching Online and Offline Settings](https://arxiv.org/abs/2410.12046) paper
|
13 |
|
14 |
## Artifacts
|
15 |
|
16 |
* [`metrics_analysis.ipynb`](metrics_analysis.ipynb) contains the code for metrics calculation and analysis;
|
17 |
* [`chart.ipynb`](chart.ipynb) contains the code for Figure 4 with edit distance distribution;
|
18 |
-
* [`data_stats.ipynb`](data_stats.ipynb) contains the code for obtaining the dataset statistics from Table 1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
# Commit Message Editing Visualisation βοΈππ
|
10 |
|
11 |
This space provides a visualization app for exploring the commit message edits datasets (🤗[expert-labeled](https://huggingface.co/datasets/JetBrains-Research/commit-msg-edits) and 🤗[synthetic](https://huggingface.co/datasets/JetBrains-Research/synthetic-commit-msg-edits))
|
12 |
+
from [Towards Realistic Evaluation of Commit Message Generation by Matching Online and Offline Settings](https://arxiv.org/abs/2410.12046) paper and also hosts important artifacts from our work.
|
13 |
|
14 |
## Artifacts
|
15 |
|
16 |
* [`metrics_analysis.ipynb`](metrics_analysis.ipynb) contains the code for metrics calculation and analysis;
|
17 |
* [`chart.ipynb`](chart.ipynb) contains the code for Figure 4 with edit distance distribution;
|
18 |
+
* [`data_stats.ipynb`](data_stats.ipynb) contains the code for obtaining the dataset statistics from Table 1;
|
19 |
+
* [`generation_steps/synthetic_backward.py`](generation_steps/synthetic_backward.py) contains the code for *Synthetic Backward* generation proposed in our paper;
|
20 |
+
* [`generation_steps/synthetic_forward.py`](generation_steps/synthetic_forward.py) contains the code for *Synthetic Forward* generation proposed in our paper.
|
21 |
+
|
22 |
+
## Visualization
|
23 |
+
|
24 |
+
* π Click on `Examples Exploration` tab to browse through nicely-formatted examples from our dataset.
|
25 |
+
* π Click on `Dataset Statistics` tab to see the major statistics for our dataset.
|
change_visualizer.py
CHANGED
@@ -1,54 +1,117 @@
|
|
1 |
import gradio as gr
|
|
|
2 |
|
3 |
import generate_annotated_diffs
|
|
|
|
|
|
|
|
|
4 |
|
5 |
-
|
6 |
-
df_manual["end_to_start"] = False
|
7 |
-
df_manual["start_to_end"] = False
|
8 |
-
n_diffs_manual = len(df_manual)
|
9 |
|
10 |
-
|
11 |
-
n_diffs_synthetic = len(df_synthetic)
|
12 |
|
13 |
|
14 |
def golden():
|
15 |
-
return
|
16 |
|
17 |
|
18 |
def backward():
|
19 |
-
return
|
20 |
|
21 |
|
22 |
def forward():
|
23 |
-
return
|
24 |
|
25 |
|
26 |
def forward_from_backward():
|
27 |
-
return
|
28 |
|
29 |
|
30 |
-
|
31 |
-
|
32 |
-
|
|
|
33 |
|
34 |
def update_dataset_view(diff_idx, df):
|
35 |
diff_idx -= 1
|
36 |
return (df.iloc[diff_idx]['annotated_diff'],
|
37 |
df.iloc[diff_idx]['commit_msg_start'] if "commit_msg_start" in df.columns else df.iloc[diff_idx]['G_text'],
|
38 |
df.iloc[diff_idx]['commit_msg_end'] if "commit_msg_end" in df.columns else df.iloc[diff_idx]['E_text'],
|
39 |
-
df.iloc[diff_idx]['session'] if "session" in df.columns else "",
|
40 |
-
str(df.iloc[diff_idx]['end_to_start']) if "end_to_start" in df.columns else "",
|
41 |
-
str(df.iloc[diff_idx]['start_to_end']) if "start_to_end" in df.columns else "",
|
42 |
f"https://github.com/{df.iloc[diff_idx]['repo']}/commit/{df.iloc[diff_idx]['hash']}",)
|
43 |
|
44 |
|
45 |
def update_dataset_view_manual(diff_idx):
|
46 |
-
return update_dataset_view(diff_idx,
|
47 |
-
|
48 |
-
|
49 |
-
def
|
50 |
-
return update_dataset_view(diff_idx,
|
51 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
52 |
|
53 |
force_light_theme_js_func = """
|
54 |
function refresh() {
|
@@ -63,53 +126,73 @@ function refresh() {
|
|
63 |
|
64 |
if __name__ == '__main__':
|
65 |
with gr.Blocks(theme=gr.themes.Soft(), js=force_light_theme_js_func) as application:
|
|
|
|
|
|
|
66 |
def dataset_view_tab(n_items):
|
67 |
slider = gr.Slider(minimum=1, maximum=n_items, step=1, value=1,
|
68 |
label=f"Sample number (total: {n_items})")
|
69 |
|
70 |
diff_view = gr.Highlightedtext(combine_adjacent=True, color_map={'+': "green", '-': "red"})
|
71 |
-
start_view = gr.Textbox(interactive=False, label="
|
72 |
-
end_view = gr.Textbox(interactive=False, label="
|
73 |
-
session_view = gr.Textbox(interactive=False, label="Session", container=True)
|
74 |
-
is_end_to_start_view = gr.Textbox(interactive=False,
|
75 |
-
label="Is generated via backward synthetic generation?",
|
76 |
-
container=True)
|
77 |
-
is_start_to_end_view = gr.Textbox(interactive=False,
|
78 |
-
label="Is generated via forward synthetic generation?",
|
79 |
-
container=True)
|
80 |
link_view = gr.Markdown()
|
81 |
|
82 |
view = [
|
83 |
diff_view,
|
84 |
start_view,
|
85 |
end_view,
|
86 |
-
session_view,
|
87 |
-
is_end_to_start_view,
|
88 |
-
is_start_to_end_view,
|
89 |
link_view
|
90 |
]
|
91 |
|
92 |
return slider, view
|
93 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
|
95 |
-
|
96 |
-
|
97 |
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
|
102 |
-
|
103 |
-
|
104 |
|
105 |
-
|
106 |
-
|
107 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
108 |
|
109 |
application.load(update_dataset_view_manual, inputs=slider_manual,
|
110 |
outputs=view_manual)
|
111 |
|
112 |
-
application.load(
|
113 |
-
outputs=
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
114 |
|
115 |
application.launch()
|
|
|
1 |
import gradio as gr
|
2 |
+
import pandas as pd
|
3 |
|
4 |
import generate_annotated_diffs
|
5 |
+
from evaluate.utils import parse_readme
|
6 |
+
import numpy as np
|
7 |
+
from scipy.stats import gaussian_kde
|
8 |
+
import plotly.graph_objects as go
|
9 |
|
10 |
+
from generation_steps.metrics_analysis import edit_distance_fn
|
|
|
|
|
|
|
11 |
|
12 |
+
df = generate_annotated_diffs.data_with_annotated_diffs()
|
|
|
13 |
|
14 |
|
15 |
def golden():
    """Return the rows whose initial message was edited by an expert.

    Selects from the module-level ``df`` the rows where ``G_type`` is
    ``"initial"`` and ``E_type`` is ``"expert_labeled"``. The index is reset so
    rows can be addressed by position starting at 0.
    """
    is_initial = df['G_type'].eq("initial")
    is_expert_labeled = df['E_type'].eq("expert_labeled")
    subset = df[is_initial & is_expert_labeled]
    return subset.reset_index(drop=True)
|
17 |
|
18 |
|
19 |
def backward():
    """Return the rows produced by backward synthetic generation.

    Selects from the module-level ``df`` the rows where ``G_type`` is
    ``"synthetic_backward"`` and ``E_type`` is ``"expert_labeled"``. The index
    is reset so rows can be addressed by position starting at 0.
    """
    is_synthetic_start = df['G_type'].eq("synthetic_backward")
    is_expert_labeled = df['E_type'].eq("expert_labeled")
    subset = df[is_synthetic_start & is_expert_labeled]
    return subset.reset_index(drop=True)
|
21 |
|
22 |
|
23 |
def forward():
    """Return the rows produced by forward synthetic generation from initial messages.

    Selects from the module-level ``df`` the rows where ``G_type`` is
    ``"initial"`` and ``E_type`` is ``"synthetic_forward"``. The index is reset
    so rows can be addressed by position starting at 0.
    """
    is_initial = df['G_type'].eq("initial")
    is_synthetic_end = df['E_type'].eq("synthetic_forward")
    subset = df[is_initial & is_synthetic_end]
    return subset.reset_index(drop=True)
|
25 |
|
26 |
|
27 |
def forward_from_backward():
    """Return the rows where forward generation started from a backward-generated message.

    Selects from the module-level ``df`` the rows where ``G_type`` is
    ``"synthetic_backward"`` and ``E_type`` is either ``"synthetic_forward"``
    or ``"synthetic_forward_from_backward"``. The index is reset so rows can
    be addressed by position starting at 0.
    """
    # Bracket access matches the sibling selectors (golden/backward/forward)
    # and cannot collide with DataFrame attribute names.
    starts_from_backward = df['G_type'] == "synthetic_backward"
    ends_with_forward = df['E_type'].isin(["synthetic_forward", "synthetic_forward_from_backward"])
    return df.loc[starts_from_backward & ends_with_forward].reset_index(drop=True)
|
29 |
|
30 |
|
31 |
+
# Number of samples in each subset; each count is later passed to
# dataset_view_tab as the upper bound of the corresponding slider.
(n_diffs_manual,
 n_diffs_synthetic_backward,
 n_diffs_synthetic_forward,
 n_diffs_synthetic_forward_backward) = (
    len(select_subset())
    for select_subset in (golden, backward, forward, forward_from_backward)
)
|
35 |
|
36 |
def update_dataset_view(diff_idx, df):
    """Return the values to display for one sample of ``df``.

    Args:
        diff_idx: 1-based sample number, as produced by the sample slider.
            Integral floats (e.g. ``2.0``) are accepted and truncated to int.
        df: DataFrame holding the samples. It must contain ``annotated_diff``,
            ``repo`` and ``hash`` columns, and either the
            ``commit_msg_start``/``commit_msg_end`` pair or the
            ``G_text``/``E_text`` pair.

    Returns:
        A 4-tuple ``(annotated_diff, start_message, end_message, commit_url)``.
    """
    # The slider is 1-based while positional indexing is 0-based; iloc also
    # rejects float positions, hence the int() coercion.
    position = int(diff_idx) - 1

    # Fetch the row once instead of re-running iloc for every field.
    row = df.iloc[position]

    # Prefer the commit_msg_* columns when the frame has them, otherwise fall
    # back to the G_text / E_text columns.
    start_msg = row['commit_msg_start'] if "commit_msg_start" in df.columns else row['G_text']
    end_msg = row['commit_msg_end'] if "commit_msg_end" in df.columns else row['E_text']

    return (row['annotated_diff'],
            start_msg,
            end_msg,
            f"https://github.com/{row['repo']}/commit/{row['hash']}",)
|
42 |
|
43 |
|
44 |
def update_dataset_view_manual(diff_idx):
    """Return the view values for sample ``diff_idx`` of the ``golden()`` subset."""
    expert_labeled = golden()
    return update_dataset_view(diff_idx, expert_labeled)
|
46 |
+
|
47 |
+
|
48 |
+
def update_dataset_view_synthetic_backward(diff_idx):
    """Return the view values for sample ``diff_idx`` of the ``backward()`` subset."""
    backward_samples = backward()
    return update_dataset_view(diff_idx, backward_samples)
|
50 |
+
|
51 |
+
|
52 |
+
def update_dataset_view_synthetic_forward(diff_idx):
    """Return the view values for sample ``diff_idx`` of the ``forward()`` subset."""
    forward_samples = forward()
    return update_dataset_view(diff_idx, forward_samples)
|
54 |
+
|
55 |
+
def update_dataset_view_synthetic_forward_backward(diff_idx):
    """Return the view values for sample ``diff_idx`` of the ``forward_from_backward()`` subset."""
    forward_from_backward_samples = forward_from_backward()
    return update_dataset_view(diff_idx, forward_from_backward_samples)
|
57 |
+
|
58 |
+
|
59 |
+
def edit_distance_plot():
    """Build the edit-distance density plot for the dataset subsets.

    For the full dataset and for the synthetic-backward, synthetic-forward
    (``forward()`` and ``forward_from_backward()`` combined) and expert-labeled
    subsets, ``edit_distance_fn`` is evaluated between ``G_text`` and
    ``E_text`` of every row. A Gaussian kernel density estimate of each series
    is drawn as one line over the range 0..1200.

    Returns:
        A ``plotly.graph_objects.Figure`` with one line trace per subset.
        Subsets with fewer than two samples are skipped, because
        ``gaussian_kde`` cannot be fitted to them.
    """

    def _edit_distances(frame):
        # One value per row: distance between the G and E message texts.
        return [edit_distance_fn(pred=row["G_text"], ref=row["E_text"])
                for _, row in frame.iterrows()]

    # Insertion order determines the trace (and legend) order.
    df_edit_distance = {
        "Full": _edit_distances(df),
        "Synthetic Backward": _edit_distances(backward()),
        "Synthetic Forward": _edit_distances(
            pd.concat([forward(), forward_from_backward()], axis=0, ignore_index=True)),
        "Expert-labeled": _edit_distances(golden()),
    }

    colors = {"Expert-labeled": "#C19C0B",
              "Synthetic Backward": "#913632",
              "Synthetic Forward": "#58136a",
              "Full": "#000000"}

    # The evaluation grid is the same for every series, so build it once.
    kde_x = np.linspace(0, 1200, 1000)

    traces = []
    for key, distances in df_edit_distance.items():
        if len(distances) < 2:
            # gaussian_kde raises on datasets with fewer than two points;
            # skip the series rather than failing the whole plot.
            continue
        kde = gaussian_kde(distances)
        traces.append(go.Scatter(
            x=kde_x,
            y=kde(kde_x),
            mode='lines',
            name=key,
            line=dict(color=colors[key], width=5)
        ))

    fig = go.Figure(data=traces)

    fig.update_layout(
        bargap=0.1,
        xaxis=dict(
            title=dict(text="Edit Distance", font=dict(size=30)),
            range=[0, 1200],
            showgrid=True,
            gridcolor='lightgrey'
        ),
        yaxis=dict(
            title=dict(text="Probability Density", font=dict(size=30)),
            range=[0, 0.004],
            showgrid=True,
            gridcolor='lightgrey',
            tickvals=[0.0005, 0.001, 0.0015, 0.002, 0.0025, 0.003, 0.0035, 0.004],
            tickformat=".4f"
        ),
        # Transparent backgrounds let the page theme show through.
        plot_bgcolor='rgba(0,0,0,0)',
        paper_bgcolor='rgba(0,0,0,0)',
        font=dict(size=24),
        legend=dict(font=dict(size=30)),
        width=1600,
        height=600,
    )
    return fig
|
115 |
|
116 |
force_light_theme_js_func = """
|
117 |
function refresh() {
|
|
|
126 |
|
127 |
if __name__ == '__main__':
    with gr.Blocks(theme=gr.themes.Soft(), js=force_light_theme_js_func) as application:
        # Render the project README at the top of the app.
        gr.Markdown(parse_readme("README.md"))

        def dataset_view_tab(n_items):
            """Create the widgets of one example tab.

            Args:
                n_items: number of samples in the subset; used as the slider
                    maximum and shown in its label.

            Returns:
                ``(slider, view)`` where ``view`` is the list of output
                components, in the same order as the tuple returned by
                ``update_dataset_view``.
            """
            slider = gr.Slider(minimum=1, maximum=n_items, step=1, value=1,
                               label=f"Sample number (total: {n_items})")

            # HighlightedText is the documented component name; the
            # lower-case "Highlightedtext" spelling is a legacy alias.
            diff_view = gr.HighlightedText(combine_adjacent=True, color_map={'+': "green", '-': "red"})
            start_view = gr.Textbox(interactive=False, label="Initial message G", container=True)
            end_view = gr.Textbox(interactive=False, label="Edited message E", container=True)
            link_view = gr.Markdown()

            view = [
                diff_view,
                start_view,
                end_view,
                link_view
            ]

            return slider, view

        # (tab title, number of samples, slider callback) for every example tab.
        example_tabs = [
            ("Manual", n_diffs_manual, update_dataset_view_manual),
            ("Synthetic Backward", n_diffs_synthetic_backward, update_dataset_view_synthetic_backward),
            ("Synthetic Forward (from initial)", n_diffs_synthetic_forward,
             update_dataset_view_synthetic_forward),
            ("Synthetic Forward (from backward)", n_diffs_synthetic_forward_backward,
             update_dataset_view_synthetic_forward_backward),
        ]

        # Remembered so the initial page load can fill every tab below.
        bindings = []

        with gr.Tab("Examples Exploration"):
            for tab_title, n_items, updater in example_tabs:
                with gr.Tab(tab_title):
                    slider, view = dataset_view_tab(n_items)

                    # Refresh the tab whenever its slider moves.
                    slider.change(updater, inputs=slider, outputs=view)
                    bindings.append((updater, slider, view))

        with gr.Tab("Dataset Statistics"):
            gr.Markdown("## Edit Distance Distribution (w/o PyCharm Logs)")

            edit_distance_gr_plot = gr.Plot()

        # Populate every tab and the plot once when the page is first loaded.
        for updater, slider, view in bindings:
            application.load(updater, inputs=slider, outputs=view)

        application.load(edit_distance_plot, outputs=edit_distance_gr_plot)

    application.launch()
|
generate_annotated_diffs.py
CHANGED
@@ -32,19 +32,11 @@ def annotated_diff_for_row(row):
|
|
32 |
return get_annotated_diff(start, end)
|
33 |
|
34 |
|
35 |
-
def
|
36 |
-
tqdm.pandas()
|
37 |
-
|
38 |
-
df = hf_data_loader.load_raw_rewriting_as_pandas()
|
39 |
-
annotated = df.progress_apply(annotated_diff_for_row, axis=1)
|
40 |
-
df['annotated_diff'] = annotated
|
41 |
-
return df
|
42 |
-
|
43 |
-
|
44 |
-
def synthetic_data_with_annotated_diffs():
|
45 |
tqdm.pandas()
|
46 |
|
47 |
df = hf_data_loader.load_synthetic_as_pandas()
|
|
|
48 |
annotated = df.progress_apply(annotated_diff_for_row, axis=1)
|
49 |
df['annotated_diff'] = annotated
|
50 |
return df
|
|
|
32 |
return get_annotated_diff(start, end)
|
33 |
|
34 |
|
35 |
+
def data_with_annotated_diffs():
    """Load the synthetic dataset, keep the related rows and annotate them.

    Returns:
        A copy of the rows of ``hf_data_loader.load_synthetic_as_pandas()``
        whose ``is_related`` value is truthy, with an extra ``annotated_diff``
        column holding the result of ``annotated_diff_for_row`` for each row.
    """
    # Makes DataFrame.progress_apply available (apply with a progress bar).
    tqdm.pandas()

    loaded = hf_data_loader.load_synthetic_as_pandas()

    # Copy the filtered slice so that adding a column does not write into a
    # view of the loaded frame.
    related = loaded.loc[loaded["is_related"]].copy()
    related['annotated_diff'] = related.progress_apply(annotated_diff_for_row, axis=1)
    return related
|
generation_steps/for_labeling.py
CHANGED
@@ -4,12 +4,12 @@ from tqdm import tqdm
|
|
4 |
|
5 |
import config
|
6 |
from api_wrappers import hf_data_loader
|
7 |
-
from generation_steps import
|
8 |
|
9 |
|
10 |
def transform(df):
|
11 |
print(f"Generating data for labeling:")
|
12 |
-
|
13 |
tqdm.pandas()
|
14 |
|
15 |
manual_df = hf_data_loader.load_raw_rewriting_as_pandas()
|
@@ -36,7 +36,7 @@ def transform(df):
|
|
36 |
commit_id = (row['hash'], row['repo'])
|
37 |
if row['manual_sample']:
|
38 |
return manual_df.loc[commit_id]['commit_msg_end']
|
39 |
-
return
|
40 |
diff=row["mods"])
|
41 |
|
42 |
result['enhanced'] = result.progress_apply(get_enhanced_message, axis=1)
|
@@ -49,7 +49,7 @@ def transform(df):
|
|
49 |
|
50 |
|
51 |
def main():
|
52 |
-
|
53 |
df = hf_data_loader.load_full_commit_with_predictions_as_pandas()
|
54 |
transform(df)
|
55 |
|
|
|
4 |
|
5 |
import config
|
6 |
from api_wrappers import hf_data_loader
|
7 |
+
from generation_steps import synthetic_forward
|
8 |
|
9 |
|
10 |
def transform(df):
|
11 |
print(f"Generating data for labeling:")
|
12 |
+
synthetic_forward.print_config()
|
13 |
tqdm.pandas()
|
14 |
|
15 |
manual_df = hf_data_loader.load_raw_rewriting_as_pandas()
|
|
|
36 |
commit_id = (row['hash'], row['repo'])
|
37 |
if row['manual_sample']:
|
38 |
return manual_df.loc[commit_id]['commit_msg_end']
|
39 |
+
return synthetic_forward.generate_end_msg(start_msg=row["prediction"],
|
40 |
diff=row["mods"])
|
41 |
|
42 |
result['enhanced'] = result.progress_apply(get_enhanced_message, axis=1)
|
|
|
49 |
|
50 |
|
51 |
def main():
|
52 |
+
synthetic_forward.GENERATION_ATTEMPTS = 3
|
53 |
df = hf_data_loader.load_full_commit_with_predictions_as_pandas()
|
54 |
transform(df)
|
55 |
|