saridormi committed on
Commit
d3a24ff
•
1 Parent(s): 216d66f

add a couple of plots and prettify README

Browse files
Files changed (2) hide show
  1. README.md +8 -8
  2. change_visualizer.py +109 -20
README.md CHANGED
@@ -8,18 +8,18 @@ app_file: change_visualizer.py
8
 
9
  # Commit Message Editing Visualisation ✏️🔍📊
10
 
11
- This space provides a visualization app for exploring the commit message edits datasets (🤗[expert-labeled](https://huggingface.co/datasets/JetBrains-Research/commit-msg-edits) and 🤗[synthetic](https://huggingface.co/datasets/JetBrains-Research/synthetic-commit-msg-edits))
12
- from [Towards Realistic Evaluation of Commit Message Generation by Matching Online and Offline Settings](https://arxiv.org/abs/2410.12046) paper and also hosts important artifacts from our work.
13
 
14
  ## Artifacts
15
 
16
- * [`metrics_analysis.ipynb`](metrics_analysis.ipynb) contains the code for metrics calculation and analysis;
17
- * [`chart.ipynb`](chart.ipynb) contains the code for Figure 4 with edit distance distribution;
18
- * [`data_stats.ipynb`](data_stats.ipynb) contains the code for obtaining the dataset statistics from Table 1;
19
- * [`generation_steps/synthetic_backward.py`](generation_steps/synthetic_backward.py) contains the code for *Synthetic Backward* generation proposed in our paper;
20
- * [`generation_steps/synthetic_forward.py`](generation_steps/synthetic_forward.py) contains the code for *Synthetic Forward* generation proposed in our paper.
21
 
22
  ## Visualization
23
 
24
  * 🔍 Click on `Examples Exploration` tab to browse through nicely-formatted examples from our dataset.
25
- * 📊 Click on `Dataset Statistics` tab to see the major statistics for our dataset.
 
8
 
9
  # Commit Message Editing Visualisation ✏️🔍📊
10
 
11
+ This space provides a visualization app for exploring the commit message edits datasets (🤗 [expert-labeled](https://huggingface.co/datasets/JetBrains-Research/commit-msg-edits) and 🤗 [synthetic](https://huggingface.co/datasets/JetBrains-Research/synthetic-commit-msg-edits))
12
+ from 📜 [Towards Realistic Evaluation of Commit Message Generation by Matching Online and Offline Settings](https://arxiv.org/abs/2410.12046) paper and also hosts important artifacts from our work.
13
 
14
  ## Artifacts
15
 
16
+ * 📊[`metrics_analysis.ipynb`](https://huggingface.co/spaces/JetBrains-Research/commit-message-editing-visualization/blob/main/metrics_analysis.ipynb) contains the code for metrics calculation and analysis;
17
+ * 📈[`chart.ipynb`](https://huggingface.co/spaces/JetBrains-Research/commit-message-editing-visualization/blob/main/chart.ipynb) contains the code for Figure 4 with edit distance distribution;
18
+ * 🗃️[`data_stats.ipynb`](https://huggingface.co/spaces/JetBrains-Research/commit-message-editing-visualization/blob/main/data_stats.ipynb) contains the code for obtaining the dataset statistics from Table 1;
19
+ * ⬅️[`generation_steps/synthetic_backward.py`](https://huggingface.co/spaces/JetBrains-Research/commit-message-editing-visualization/blob/main/generation_steps/synthetic_backward.py) contains the code for *Synthetic Backward* generation proposed in our paper;
20
+ * ➡️[`generation_steps/synthetic_forward.py`](https://huggingface.co/spaces/JetBrains-Research/commit-message-editing-visualization/blob/main/generation_steps/synthetic_forward.py) contains the code for *Synthetic Forward* generation proposed in our paper.
21
 
22
  ## Visualization
23
 
24
  * 🔍 Click on `Examples Exploration` tab to browse through nicely-formatted examples from our dataset.
25
+ * 📈 Click on `Dataset Statistics` tab to see the major statistics for our dataset.
change_visualizer.py CHANGED
@@ -7,25 +7,32 @@ import numpy as np
7
  from scipy.stats import gaussian_kde
8
  import plotly.graph_objects as go
9
 
 
10
  from generation_steps.metrics_analysis import edit_distance_fn
11
 
12
- df = generate_annotated_diffs.data_with_annotated_diffs()
 
 
 
 
 
 
13
 
14
 
15
  def golden():
16
- return df.loc[(df['G_type'] == "initial") & (df['E_type'] == "expert_labeled")].reset_index(drop=True)
17
 
18
 
19
  def backward():
20
- return df.loc[(df['G_type'] == "synthetic_backward") & (df['E_type'] == "expert_labeled")].reset_index(drop=True)
21
 
22
 
23
  def forward():
24
- return df.loc[(df['G_type'] == "initial") & (df['E_type'] == "synthetic_forward")].reset_index(drop=True)
25
 
26
 
27
  def forward_from_backward():
28
- return df.loc[(df.G_type == "synthetic_backward") & (df.E_type.isin(["synthetic_forward", "synthetic_forward_from_backward"]))].reset_index(drop=True)
29
 
30
 
31
  n_diffs_manual = len(golden())
@@ -56,9 +63,96 @@ def update_dataset_view_synthetic_forward_backward(diff_idx):
56
  return update_dataset_view(diff_idx, forward_from_backward())
57
 
58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  def edit_distance_plot():
60
  df_edit_distance = {"Full": [edit_distance_fn(pred=row["G_text"], ref=row["E_text"]) for _, row in
61
- df.iterrows()],
62
  "Synthetic Backward": [edit_distance_fn(pred=row["G_text"], ref=row["E_text"]) for
63
  _, row in backward().iterrows()],
64
  "Synthetic Forward": [edit_distance_fn(pred=row["G_text"], ref=row["E_text"]) for
@@ -67,11 +161,6 @@ def edit_distance_plot():
67
  "Expert-labeled": [edit_distance_fn(pred=row["G_text"], ref=row["E_text"]) for
68
  _, row in golden().iterrows()]
69
  }
70
-
71
- colors = {"Expert-labeled": "#C19C0B",
72
- "Synthetic Backward": "#913632",
73
- "Synthetic Forward": "#58136a",
74
- "Full": "#000000"}
75
  traces = []
76
 
77
  for key in df_edit_distance:
@@ -91,13 +180,13 @@ def edit_distance_plot():
91
  fig.update_layout(
92
  bargap=0.1,
93
  xaxis=dict(
94
- title=dict(text="Edit Distance", font=dict(size=30)),
95
  range=[0, 1200],
96
  showgrid=True,
97
  gridcolor='lightgrey'
98
  ),
99
  yaxis=dict(
100
- title=dict(text="Probability Density", font=dict(size=30)),
101
  range=[0, 0.004],
102
  showgrid=True,
103
  gridcolor='lightgrey',
@@ -106,10 +195,7 @@ def edit_distance_plot():
106
  ),
107
  plot_bgcolor='rgba(0,0,0,0)',
108
  paper_bgcolor='rgba(0,0,0,0)',
109
- font=dict(size=24),
110
- legend=dict(font=dict(size=30)),
111
- width=1600,
112
- height=600,
113
  )
114
  return fig
115
 
@@ -177,9 +263,14 @@ if __name__ == '__main__':
177
  outputs=view_synthetic_forward_backward)
178
 
179
  with gr.Tab("Dataset Statistics"):
 
 
 
 
 
180
  gr.Markdown("## Edit Distance Distribution (w/o PyCharm Logs)")
181
 
182
- edit_distance_gr_plot = gr.Plot()
183
 
184
  application.load(update_dataset_view_manual, inputs=slider_manual,
185
  outputs=view_manual)
@@ -193,6 +284,4 @@ if __name__ == '__main__':
193
  application.load(update_dataset_view_synthetic_forward_backward, inputs=slider_synthetic_forward_backward,
194
  outputs=view_synthetic_forward_backward)
195
 
196
- application.load(edit_distance_plot, outputs=edit_distance_gr_plot)
197
-
198
  application.launch()
 
7
  from scipy.stats import gaussian_kde
8
  import plotly.graph_objects as go
9
 
10
+ from api_wrappers import hf_data_loader
11
  from generation_steps.metrics_analysis import edit_distance_fn
12
 
13
+ colors = {"Expert-labeled": "#C19C0B",
14
+ "Synthetic Backward": "#913632",
15
+ "Synthetic Forward": "#58136a",
16
+ "Full": "#000000"}
17
+
18
+
19
+ df_related = generate_annotated_diffs.data_with_annotated_diffs()
20
 
21
 
22
  def golden():
23
+ return df_related.loc[(df_related['G_type'] == "initial") & (df_related['E_type'] == "expert_labeled")].reset_index(drop=True)
24
 
25
 
26
  def backward():
27
+ return df_related.loc[(df_related['G_type'] == "synthetic_backward") & (df_related['E_type'] == "expert_labeled")].reset_index(drop=True)
28
 
29
 
30
  def forward():
31
+ return df_related.loc[(df_related['G_type'] == "initial") & (df_related['E_type'] == "synthetic_forward")].reset_index(drop=True)
32
 
33
 
34
  def forward_from_backward():
35
+ return df_related.loc[(df_related.G_type == "synthetic_backward") & (df_related.E_type.isin(["synthetic_forward", "synthetic_forward_from_backward"]))].reset_index(drop=True)
36
 
37
 
38
  n_diffs_manual = len(golden())
 
63
  return update_dataset_view(diff_idx, forward_from_backward())
64
 
65
 
66
+ def number_of_pairs_plot():
67
+ related_plot_dict = {"Full": df_related,
68
+ "Synthetic Backward": backward(),
69
+ "Synthetic Forward": pd.concat([forward(), forward_from_backward()], axis=0,
70
+ ignore_index=True),
71
+ "Expert-labeled": golden()
72
+ }
73
+
74
+ df_unrelated = hf_data_loader.load_synthetic_as_pandas()
75
+ df_unrelated = df_unrelated.loc[~df_unrelated.is_related].copy()
76
+ unrelated_plot_dict = {"Full": df_unrelated,
77
+ "Synthetic Backward": df_unrelated.loc[
78
+ (df_unrelated['G_type'] == "synthetic_backward") & (~df_unrelated.E_type.isin(["synthetic_forward", "synthetic_forward_from_backward"]))],
79
+ "Synthetic Forward": df_unrelated.loc[
80
+ (
81
+ (df_unrelated['G_type'] == "initial") &
82
+ (df_unrelated['E_type'] == "synthetic_forward")
83
+ ) | (
84
+ (df_unrelated['G_type'] == "synthetic_backward") &
85
+ (df_unrelated['E_type'].isin(["synthetic_forward", "synthetic_forward_from_backward"]))
86
+ )
87
+ ],
88
+ "Expert-labeled": df_unrelated.loc[(df_unrelated.G_type == "initial") & (df_unrelated.E_type == "expert_labeled")]}
89
+
90
+ traces = []
91
+
92
+ for split in related_plot_dict.keys():
93
+ related_count = len(related_plot_dict[split])
94
+ unrelated_count = len(unrelated_plot_dict[split])
95
+
96
+ traces.append(
97
+ go.Bar(
98
+ name=f'{split} - Related pairs',
99
+ x=[split],
100
+ y=[related_count],
101
+ marker=dict(
102
+ color=colors[split],
103
+ )
104
+ )
105
+ )
106
+
107
+ traces.append(
108
+ go.Bar(
109
+ name=f'{split} - Conditionally independent pairs',
110
+ x=[split],
111
+ y=[unrelated_count],
112
+ marker=dict(
113
+ color=colors[split],
114
+ pattern=dict(
115
+ shape='/', # Crosses
116
+ fillmode='overlay',
117
+ solidity=0.5
118
+ )
119
+ )
120
+ )
121
+ )
122
+
123
+ fig = go.Figure(data=traces)
124
+
125
+ fig.update_layout(
126
+ barmode='stack',
127
+ bargap=0.2,
128
+ xaxis=dict(
129
+ title="Split",
130
+ showgrid=True,
131
+ gridcolor='lightgrey'
132
+ ),
133
+ yaxis=dict(
134
+ title="Number of Examples",
135
+ showgrid=True,
136
+ gridcolor='lightgrey'
137
+ ),
138
+ legend=dict(
139
+ title='Pair Type',
140
+ orientation='h',
141
+ yanchor='bottom',
142
+ y=1.02,
143
+ xanchor='right',
144
+ x=1
145
+ ),
146
+ plot_bgcolor='rgba(0,0,0,0)',
147
+ paper_bgcolor='rgba(0,0,0,0)',
148
+ width=1100,
149
+ )
150
+ return fig
151
+
152
+
153
  def edit_distance_plot():
154
  df_edit_distance = {"Full": [edit_distance_fn(pred=row["G_text"], ref=row["E_text"]) for _, row in
155
+ df_related.iterrows()],
156
  "Synthetic Backward": [edit_distance_fn(pred=row["G_text"], ref=row["E_text"]) for
157
  _, row in backward().iterrows()],
158
  "Synthetic Forward": [edit_distance_fn(pred=row["G_text"], ref=row["E_text"]) for
 
161
  "Expert-labeled": [edit_distance_fn(pred=row["G_text"], ref=row["E_text"]) for
162
  _, row in golden().iterrows()]
163
  }
 
 
 
 
 
164
  traces = []
165
 
166
  for key in df_edit_distance:
 
180
  fig.update_layout(
181
  bargap=0.1,
182
  xaxis=dict(
183
+ title=dict(text="Edit Distance"),
184
  range=[0, 1200],
185
  showgrid=True,
186
  gridcolor='lightgrey'
187
  ),
188
  yaxis=dict(
189
+ title=dict(text="Probability Density"),
190
  range=[0, 0.004],
191
  showgrid=True,
192
  gridcolor='lightgrey',
 
195
  ),
196
  plot_bgcolor='rgba(0,0,0,0)',
197
  paper_bgcolor='rgba(0,0,0,0)',
198
+ width=1100,
 
 
 
199
  )
200
  return fig
201
 
 
263
  outputs=view_synthetic_forward_backward)
264
 
265
  with gr.Tab("Dataset Statistics"):
266
+
267
+ gr.Markdown("## Number of examples per split")
268
+
269
+ number_of_pairs_gr_plot = gr.Plot(number_of_pairs_plot, label=None)
270
+
271
  gr.Markdown("## Edit Distance Distribution (w/o PyCharm Logs)")
272
 
273
+ edit_distance_gr_plot = gr.Plot(edit_distance_plot(), label=None)
274
 
275
  application.load(update_dataset_view_manual, inputs=slider_manual,
276
  outputs=view_manual)
 
284
  application.load(update_dataset_view_synthetic_forward_backward, inputs=slider_synthetic_forward_backward,
285
  outputs=view_synthetic_forward_backward)
286
 
 
 
287
  application.launch()