add a couple of plots and prettify README
Browse files- README.md +8 -8
- change_visualizer.py +109 -20
README.md
CHANGED
@@ -8,18 +8,18 @@ app_file: change_visualizer.py
|
|
8 |
|
9 |
# Commit Message Editing Visualisation βοΈππ
|
10 |
|
11 |
-
This space provides a visualization app for exploring the commit message edits datasets (🤗[expert-labeled](https://huggingface.co/datasets/JetBrains-Research/commit-msg-edits) and 🤗[synthetic](https://huggingface.co/datasets/JetBrains-Research/synthetic-commit-msg-edits))
|
12 |
-
from [Towards Realistic Evaluation of Commit Message Generation by Matching Online and Offline Settings](https://arxiv.org/abs/2410.12046) paper and also hosts important artifacts from our work.
|
13 |
|
14 |
## Artifacts
|
15 |
|
16 |
-
* [`metrics_analysis.ipynb`](metrics_analysis.ipynb) contains the code for metrics calculation and analysis;
|
17 |
-
* [`chart.ipynb`](chart.ipynb) contains the code for Figure 4 with edit distance distribution;
|
18 |
-
* [`data_stats.ipynb`](data_stats.ipynb) contains the code for obtaining the dataset statistics from Table 1;
|
19 |
-
* [`generation_steps/synthetic_backward.py`](generation_steps/synthetic_backward.py) contains the code for *Synthetic Backward* generation proposed in our paper;
|
20 |
-
* [`generation_steps/synthetic_forward.py`](generation_steps/synthetic_forward.py) contains the code for *Synthetic Forward* generation proposed in our paper.
|
21 |
|
22 |
## Visualization
|
23 |
|
24 |
* π Click on `Examples Exploration` tab to browse through nicely-formatted examples from our dataset.
|
25 |
-
*
|
|
|
8 |
|
9 |
# Commit Message Editing Visualisation βοΈππ
|
10 |
|
11 |
+
This space provides a visualization app for exploring the commit message edits datasets (🤗 [expert-labeled](https://huggingface.co/datasets/JetBrains-Research/commit-msg-edits) and 🤗 [synthetic](https://huggingface.co/datasets/JetBrains-Research/synthetic-commit-msg-edits))
|
12 |
+
from π [Towards Realistic Evaluation of Commit Message Generation by Matching Online and Offline Settings](https://arxiv.org/abs/2410.12046) paper and also hosts important artifacts from our work.
|
13 |
|
14 |
## Artifacts
|
15 |
|
16 |
+
* π[`metrics_analysis.ipynb`](https://huggingface.co/spaces/JetBrains-Research/commit-message-editing-visualization/blob/main/metrics_analysis.ipynb) contains the code for metrics calculation and analysis;
|
17 |
+
* π[`chart.ipynb`](https://huggingface.co/spaces/JetBrains-Research/commit-message-editing-visualization/blob/main/chart.ipynb) contains the code for Figure 4 with edit distance distribution;
|
18 |
+
* ποΈ[`data_stats.ipynb`](https://huggingface.co/spaces/JetBrains-Research/commit-message-editing-visualization/blob/main/data_stats.ipynb) contains the code for obtaining the dataset statistics from Table 1;
|
19 |
+
* ⬅️[`generation_steps/synthetic_backward.py`](https://huggingface.co/spaces/JetBrains-Research/commit-message-editing-visualization/blob/main/generation_steps/synthetic_backward.py) contains the code for *Synthetic Backward* generation proposed in our paper;
|
20 |
+
* ➡️[`generation_steps/synthetic_forward.py`](https://huggingface.co/spaces/JetBrains-Research/commit-message-editing-visualization/blob/main/generation_steps/synthetic_forward.py) contains the code for *Synthetic Forward* generation proposed in our paper.
|
21 |
|
22 |
## Visualization
|
23 |
|
24 |
* π Click on `Examples Exploration` tab to browse through nicely-formatted examples from our dataset.
|
25 |
+
* π Click on `Dataset Statistics` tab to see the major statistics for our dataset.
|
change_visualizer.py
CHANGED
@@ -7,25 +7,32 @@ import numpy as np
|
|
7 |
from scipy.stats import gaussian_kde
|
8 |
import plotly.graph_objects as go
|
9 |
|
|
|
10 |
from generation_steps.metrics_analysis import edit_distance_fn
|
11 |
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
|
14 |
|
15 |
def golden():
|
16 |
-
return
|
17 |
|
18 |
|
19 |
def backward():
|
20 |
-
return
|
21 |
|
22 |
|
23 |
def forward():
|
24 |
-
return
|
25 |
|
26 |
|
27 |
def forward_from_backward():
|
28 |
-
return
|
29 |
|
30 |
|
31 |
n_diffs_manual = len(golden())
|
@@ -56,9 +63,96 @@ def update_dataset_view_synthetic_forward_backward(diff_idx):
|
|
56 |
return update_dataset_view(diff_idx, forward_from_backward())
|
57 |
|
58 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
59 |
def edit_distance_plot():
|
60 |
df_edit_distance = {"Full": [edit_distance_fn(pred=row["G_text"], ref=row["E_text"]) for _, row in
|
61 |
-
|
62 |
"Synthetic Backward": [edit_distance_fn(pred=row["G_text"], ref=row["E_text"]) for
|
63 |
_, row in backward().iterrows()],
|
64 |
"Synthetic Forward": [edit_distance_fn(pred=row["G_text"], ref=row["E_text"]) for
|
@@ -67,11 +161,6 @@ def edit_distance_plot():
|
|
67 |
"Expert-labeled": [edit_distance_fn(pred=row["G_text"], ref=row["E_text"]) for
|
68 |
_, row in golden().iterrows()]
|
69 |
}
|
70 |
-
|
71 |
-
colors = {"Expert-labeled": "#C19C0B",
|
72 |
-
"Synthetic Backward": "#913632",
|
73 |
-
"Synthetic Forward": "#58136a",
|
74 |
-
"Full": "#000000"}
|
75 |
traces = []
|
76 |
|
77 |
for key in df_edit_distance:
|
@@ -91,13 +180,13 @@ def edit_distance_plot():
|
|
91 |
fig.update_layout(
|
92 |
bargap=0.1,
|
93 |
xaxis=dict(
|
94 |
-
title=dict(text="Edit Distance"
|
95 |
range=[0, 1200],
|
96 |
showgrid=True,
|
97 |
gridcolor='lightgrey'
|
98 |
),
|
99 |
yaxis=dict(
|
100 |
-
title=dict(text="Probability Density"
|
101 |
range=[0, 0.004],
|
102 |
showgrid=True,
|
103 |
gridcolor='lightgrey',
|
@@ -106,10 +195,7 @@ def edit_distance_plot():
|
|
106 |
),
|
107 |
plot_bgcolor='rgba(0,0,0,0)',
|
108 |
paper_bgcolor='rgba(0,0,0,0)',
|
109 |
-
|
110 |
-
legend=dict(font=dict(size=30)),
|
111 |
-
width=1600,
|
112 |
-
height=600,
|
113 |
)
|
114 |
return fig
|
115 |
|
@@ -177,9 +263,14 @@ if __name__ == '__main__':
|
|
177 |
outputs=view_synthetic_forward_backward)
|
178 |
|
179 |
with gr.Tab("Dataset Statistics"):
|
|
|
|
|
|
|
|
|
|
|
180 |
gr.Markdown("## Edit Distance Distribution (w/o PyCharm Logs)")
|
181 |
|
182 |
-
edit_distance_gr_plot = gr.Plot()
|
183 |
|
184 |
application.load(update_dataset_view_manual, inputs=slider_manual,
|
185 |
outputs=view_manual)
|
@@ -193,6 +284,4 @@ if __name__ == '__main__':
|
|
193 |
application.load(update_dataset_view_synthetic_forward_backward, inputs=slider_synthetic_forward_backward,
|
194 |
outputs=view_synthetic_forward_backward)
|
195 |
|
196 |
-
application.load(edit_distance_plot, outputs=edit_distance_gr_plot)
|
197 |
-
|
198 |
application.launch()
|
|
|
7 |
from scipy.stats import gaussian_kde
|
8 |
import plotly.graph_objects as go
|
9 |
|
10 |
+
from api_wrappers import hf_data_loader
|
11 |
from generation_steps.metrics_analysis import edit_distance_fn
|
12 |
|
13 |
+
colors = {"Expert-labeled": "#C19C0B",
|
14 |
+
"Synthetic Backward": "#913632",
|
15 |
+
"Synthetic Forward": "#58136a",
|
16 |
+
"Full": "#000000"}
|
17 |
+
|
18 |
+
|
19 |
+
df_related = generate_annotated_diffs.data_with_annotated_diffs()
|
20 |
|
21 |
|
22 |
def golden():
|
23 |
+
return df_related.loc[(df_related['G_type'] == "initial") & (df_related['E_type'] == "expert_labeled")].reset_index(drop=True)
|
24 |
|
25 |
|
26 |
def backward():
|
27 |
+
return df_related.loc[(df_related['G_type'] == "synthetic_backward") & (df_related['E_type'] == "expert_labeled")].reset_index(drop=True)
|
28 |
|
29 |
|
30 |
def forward():
|
31 |
+
return df_related.loc[(df_related['G_type'] == "initial") & (df_related['E_type'] == "synthetic_forward")].reset_index(drop=True)
|
32 |
|
33 |
|
34 |
def forward_from_backward():
|
35 |
+
return df_related.loc[(df_related.G_type == "synthetic_backward") & (df_related.E_type.isin(["synthetic_forward", "synthetic_forward_from_backward"]))].reset_index(drop=True)
|
36 |
|
37 |
|
38 |
n_diffs_manual = len(golden())
|
|
|
63 |
return update_dataset_view(diff_idx, forward_from_backward())
|
64 |
|
65 |
|
66 |
+
def number_of_pairs_plot():
|
67 |
+
related_plot_dict = {"Full": df_related,
|
68 |
+
"Synthetic Backward": backward(),
|
69 |
+
"Synthetic Forward": pd.concat([forward(), forward_from_backward()], axis=0,
|
70 |
+
ignore_index=True),
|
71 |
+
"Expert-labeled": golden()
|
72 |
+
}
|
73 |
+
|
74 |
+
df_unrelated = hf_data_loader.load_synthetic_as_pandas()
|
75 |
+
df_unrelated = df_unrelated.loc[~df_unrelated.is_related].copy()
|
76 |
+
unrelated_plot_dict = {"Full": df_unrelated,
|
77 |
+
"Synthetic Backward": df_unrelated.loc[
|
78 |
+
(df_unrelated['G_type'] == "synthetic_backward") & (~df_unrelated.E_type.isin(["synthetic_forward", "synthetic_forward_from_backward"]))],
|
79 |
+
"Synthetic Forward": df_unrelated.loc[
|
80 |
+
(
|
81 |
+
(df_unrelated['G_type'] == "initial") &
|
82 |
+
(df_unrelated['E_type'] == "synthetic_forward")
|
83 |
+
) | (
|
84 |
+
(df_unrelated['G_type'] == "synthetic_backward") &
|
85 |
+
(df_unrelated['E_type'].isin(["synthetic_forward", "synthetic_forward_from_backward"]))
|
86 |
+
)
|
87 |
+
],
|
88 |
+
"Expert-labeled": df_unrelated.loc[(df_unrelated.G_type == "initial") & (df_unrelated.E_type == "expert_labeled")]}
|
89 |
+
|
90 |
+
traces = []
|
91 |
+
|
92 |
+
for split in related_plot_dict.keys():
|
93 |
+
related_count = len(related_plot_dict[split])
|
94 |
+
unrelated_count = len(unrelated_plot_dict[split])
|
95 |
+
|
96 |
+
traces.append(
|
97 |
+
go.Bar(
|
98 |
+
name=f'{split} - Related pairs',
|
99 |
+
x=[split],
|
100 |
+
y=[related_count],
|
101 |
+
marker=dict(
|
102 |
+
color=colors[split],
|
103 |
+
)
|
104 |
+
)
|
105 |
+
)
|
106 |
+
|
107 |
+
traces.append(
|
108 |
+
go.Bar(
|
109 |
+
name=f'{split} - Conditionally independent pairs',
|
110 |
+
x=[split],
|
111 |
+
y=[unrelated_count],
|
112 |
+
marker=dict(
|
113 |
+
color=colors[split],
|
114 |
+
pattern=dict(
|
115 |
+
shape='/', # Diagonal hatching
|
116 |
+
fillmode='overlay',
|
117 |
+
solidity=0.5
|
118 |
+
)
|
119 |
+
)
|
120 |
+
)
|
121 |
+
)
|
122 |
+
|
123 |
+
fig = go.Figure(data=traces)
|
124 |
+
|
125 |
+
fig.update_layout(
|
126 |
+
barmode='stack',
|
127 |
+
bargap=0.2,
|
128 |
+
xaxis=dict(
|
129 |
+
title="Split",
|
130 |
+
showgrid=True,
|
131 |
+
gridcolor='lightgrey'
|
132 |
+
),
|
133 |
+
yaxis=dict(
|
134 |
+
title="Number of Examples",
|
135 |
+
showgrid=True,
|
136 |
+
gridcolor='lightgrey'
|
137 |
+
),
|
138 |
+
legend=dict(
|
139 |
+
title='Pair Type',
|
140 |
+
orientation='h',
|
141 |
+
yanchor='bottom',
|
142 |
+
y=1.02,
|
143 |
+
xanchor='right',
|
144 |
+
x=1
|
145 |
+
),
|
146 |
+
plot_bgcolor='rgba(0,0,0,0)',
|
147 |
+
paper_bgcolor='rgba(0,0,0,0)',
|
148 |
+
width=1100,
|
149 |
+
)
|
150 |
+
return fig
|
151 |
+
|
152 |
+
|
153 |
def edit_distance_plot():
|
154 |
df_edit_distance = {"Full": [edit_distance_fn(pred=row["G_text"], ref=row["E_text"]) for _, row in
|
155 |
+
df_related.iterrows()],
|
156 |
"Synthetic Backward": [edit_distance_fn(pred=row["G_text"], ref=row["E_text"]) for
|
157 |
_, row in backward().iterrows()],
|
158 |
"Synthetic Forward": [edit_distance_fn(pred=row["G_text"], ref=row["E_text"]) for
|
|
|
161 |
"Expert-labeled": [edit_distance_fn(pred=row["G_text"], ref=row["E_text"]) for
|
162 |
_, row in golden().iterrows()]
|
163 |
}
|
|
|
|
|
|
|
|
|
|
|
164 |
traces = []
|
165 |
|
166 |
for key in df_edit_distance:
|
|
|
180 |
fig.update_layout(
|
181 |
bargap=0.1,
|
182 |
xaxis=dict(
|
183 |
+
title=dict(text="Edit Distance"),
|
184 |
range=[0, 1200],
|
185 |
showgrid=True,
|
186 |
gridcolor='lightgrey'
|
187 |
),
|
188 |
yaxis=dict(
|
189 |
+
title=dict(text="Probability Density"),
|
190 |
range=[0, 0.004],
|
191 |
showgrid=True,
|
192 |
gridcolor='lightgrey',
|
|
|
195 |
),
|
196 |
plot_bgcolor='rgba(0,0,0,0)',
|
197 |
paper_bgcolor='rgba(0,0,0,0)',
|
198 |
+
width=1100,
|
|
|
|
|
|
|
199 |
)
|
200 |
return fig
|
201 |
|
|
|
263 |
outputs=view_synthetic_forward_backward)
|
264 |
|
265 |
with gr.Tab("Dataset Statistics"):
|
266 |
+
|
267 |
+
gr.Markdown("## Number of examples per split")
|
268 |
+
|
269 |
+
number_of_pairs_gr_plot = gr.Plot(number_of_pairs_plot, label=None)
|
270 |
+
|
271 |
gr.Markdown("## Edit Distance Distribution (w/o PyCharm Logs)")
|
272 |
|
273 |
+
edit_distance_gr_plot = gr.Plot(edit_distance_plot(), label=None)
|
274 |
|
275 |
application.load(update_dataset_view_manual, inputs=slider_manual,
|
276 |
outputs=view_manual)
|
|
|
284 |
application.load(update_dataset_view_synthetic_forward_backward, inputs=slider_synthetic_forward_backward,
|
285 |
outputs=view_synthetic_forward_backward)
|
286 |
|
|
|
|
|
287 |
application.launch()
|