|
import gradio as gr |
|
import pandas as pd |
|
|
|
import generate_annotated_diffs |
|
from evaluate.utils import parse_readme |
|
import numpy as np |
|
from scipy.stats import gaussian_kde |
|
import plotly.graph_objects as go |
|
|
|
from api_wrappers import hf_data_loader |
|
from generation_steps.metrics_analysis import edit_distance_fn |
|
|
|
# Line/bar colour for each dataset split; the plotting functions in this
# module look colours up by split name.
colors = {
    "Expert-labeled": "#C19C0B",
    "Synthetic Backward": "#913632",
    "Synthetic Forward": "#58136a",
    "Full": "#000000",
}
|
|
|
|
|
# Related message pairs with annotated diffs, loaded once at import time.
# The split selectors and plots in this module all read from this frame.
df_related = generate_annotated_diffs.data_with_annotated_diffs()
|
|
|
|
|
def golden():
    """Return the expert-labeled split of ``df_related``.

    Keeps rows whose ``G_type`` is ``"initial"`` and whose ``E_type`` is
    ``"expert_labeled"``, re-indexed from 0.
    """
    starts_from_initial = df_related['G_type'] == "initial"
    edited_by_expert = df_related['E_type'] == "expert_labeled"
    selected = df_related.loc[starts_from_initial & edited_by_expert]
    return selected.reset_index(drop=True)
|
|
|
|
|
def backward():
    """Return the synthetic-backward split of ``df_related``.

    Keeps rows whose ``G_type`` is ``"synthetic_backward"`` and whose
    ``E_type`` is ``"expert_labeled"``, re-indexed from 0.
    """
    starts_from_backward = df_related['G_type'] == "synthetic_backward"
    edited_by_expert = df_related['E_type'] == "expert_labeled"
    selected = df_related.loc[starts_from_backward & edited_by_expert]
    return selected.reset_index(drop=True)
|
|
|
|
|
def forward():
    """Return the synthetic-forward (from initial) split of ``df_related``.

    Keeps rows whose ``G_type`` is ``"initial"`` and whose ``E_type`` is
    ``"synthetic_forward"``, re-indexed from 0.
    """
    starts_from_initial = df_related['G_type'] == "initial"
    edited_forward = df_related['E_type'] == "synthetic_forward"
    selected = df_related.loc[starts_from_initial & edited_forward]
    return selected.reset_index(drop=True)
|
|
|
|
|
def forward_from_backward():
    """Return the synthetic-forward (from backward) split of ``df_related``.

    Keeps rows whose ``G_type`` is ``"synthetic_backward"`` and whose
    ``E_type`` is either ``"synthetic_forward"`` or
    ``"synthetic_forward_from_backward"``, re-indexed from 0.
    """
    forward_e_types = ["synthetic_forward", "synthetic_forward_from_backward"]
    starts_from_backward = df_related['G_type'] == "synthetic_backward"
    edited_forward = df_related['E_type'].isin(forward_e_types)
    selected = df_related.loc[starts_from_backward & edited_forward]
    return selected.reset_index(drop=True)
|
|
|
|
|
# Row counts of the four example splits, computed once at import time in the
# same order as before: golden, backward, forward, forward_from_backward.
(
    n_diffs_manual,
    n_diffs_synthetic_backward,
    n_diffs_synthetic_forward,
    n_diffs_synthetic_forward_backward,
) = (len(select_split()) for select_split in (golden, backward, forward, forward_from_backward))
|
|
|
def update_dataset_view(diff_idx, df):
    """Return the display values for one sample of ``df``.

    Args:
        diff_idx: 1-based sample number. Integral floats such as ``3.0`` are
            accepted and truncated with ``int()``.
        df: Frame holding the columns ``annotated_diff``, ``repo`` and
            ``hash``, plus either ``commit_msg_start``/``commit_msg_end`` or
            ``G_text``/``E_text``.

    Returns:
        A 4-tuple ``(annotated_diff, start_message, end_message, commit_url)``
        where ``commit_url`` is the GitHub commit link built from ``repo``
        and ``hash``.

    Raises:
        IndexError: If ``diff_idx`` is past the last row.
    """
    # ``DataFrame.iloc`` raises TypeError for a float position, so coerce
    # before converting the 1-based number to a 0-based position.
    row = df.iloc[int(diff_idx) - 1]

    # Prefer the commit_msg_* columns when the frame has them.
    start_column = "commit_msg_start" if "commit_msg_start" in df.columns else "G_text"
    end_column = "commit_msg_end" if "commit_msg_end" in df.columns else "E_text"

    return (
        row['annotated_diff'],
        row[start_column],
        row[end_column],
        f"https://github.com/{row['repo']}/commit/{row['hash']}",
    )
|
|
|
|
|
def update_dataset_view_manual(diff_idx):
    """Return the display values for sample ``diff_idx`` of the ``golden()`` split."""
    split_frame = golden()
    return update_dataset_view(diff_idx, split_frame)
|
|
|
|
|
def update_dataset_view_synthetic_backward(diff_idx):
    """Return the display values for sample ``diff_idx`` of the ``backward()`` split."""
    split_frame = backward()
    return update_dataset_view(diff_idx, split_frame)
|
|
|
|
|
def update_dataset_view_synthetic_forward(diff_idx):
    """Return the display values for sample ``diff_idx`` of the ``forward()`` split."""
    split_frame = forward()
    return update_dataset_view(diff_idx, split_frame)
|
|
|
def update_dataset_view_synthetic_forward_backward(diff_idx):
    """Return the display values for sample ``diff_idx`` of the ``forward_from_backward()`` split."""
    split_frame = forward_from_backward()
    return update_dataset_view(diff_idx, split_frame)
|
|
|
|
|
def number_of_pairs_plot():
    """Build the stacked bar chart of pair counts per split.

    Each split ("Full", "Synthetic Backward", "Synthetic Forward",
    "Expert-labeled") gets two bars stacked at one x position: a solid bar
    with the number of rows taken from ``df_related`` and a hatched bar with
    the number of rows of the synthetic dataset whose ``is_related`` flag is
    false.

    Returns:
        The configured ``plotly.graph_objects.Figure``.
    """
    forward_e_types = ["synthetic_forward", "synthetic_forward_from_backward"]

    related_by_split = {
        "Full": df_related,
        "Synthetic Backward": backward(),
        "Synthetic Forward": pd.concat([forward(), forward_from_backward()], axis=0,
                                       ignore_index=True),
        "Expert-labeled": golden(),
    }

    synthetic = hf_data_loader.load_synthetic_as_pandas()
    df_unrelated = synthetic.loc[~synthetic.is_related].copy()

    # Row masks over the unrelated frame, named once and combined below.
    g_is_initial = df_unrelated['G_type'] == "initial"
    g_is_backward = df_unrelated['G_type'] == "synthetic_backward"
    e_is_forward = df_unrelated['E_type'] == "synthetic_forward"
    e_is_any_forward = df_unrelated['E_type'].isin(forward_e_types)
    e_is_expert = df_unrelated['E_type'] == "expert_labeled"

    unrelated_by_split = {
        "Full": df_unrelated,
        "Synthetic Backward": df_unrelated.loc[g_is_backward & (~e_is_any_forward)],
        "Synthetic Forward": df_unrelated.loc[
            (g_is_initial & e_is_forward) | (g_is_backward & e_is_any_forward)
        ],
        "Expert-labeled": df_unrelated.loc[g_is_initial & e_is_expert],
    }

    traces = []
    for split, related_frame in related_by_split.items():
        split_color = colors[split]

        solid_bar = go.Bar(
            name=f'{split} - Related pairs',
            x=[split],
            y=[len(related_frame)],
            marker=dict(color=split_color),
        )
        # Same colour as the related bar, told apart by the hatch pattern.
        hatched_bar = go.Bar(
            name=f'{split} - Conditionally independent pairs',
            x=[split],
            y=[len(unrelated_by_split[split])],
            marker=dict(
                color=split_color,
                pattern=dict(shape='/', fillmode='overlay', solidity=0.5),
            ),
        )
        traces.extend((solid_bar, hatched_bar))

    def grid_axis(axis_title):
        # Both axes share the same light-grey grid styling.
        return dict(title=axis_title, showgrid=True, gridcolor='lightgrey')

    legend_above_plot = dict(
        title='Pair Type',
        orientation='h',
        yanchor='bottom',
        y=1.02,
        xanchor='right',
        x=1,
    )

    fig = go.Figure(data=traces)
    fig.update_layout(
        barmode='stack',
        bargap=0.2,
        xaxis=grid_axis("Split"),
        yaxis=grid_axis("Number of Examples"),
        legend=legend_above_plot,
        plot_bgcolor='rgba(0,0,0,0)',
        paper_bgcolor='rgba(0,0,0,0)',
        width=1100,
    )
    return fig
|
|
|
|
|
def edit_distance_plot():
    """Build the edit-distance density plot, one curve per split.

    For every split the edit distance between ``G_text`` and ``E_text`` is
    computed for each row with ``edit_distance_fn``. A Gaussian kernel
    density estimate of those distances is drawn on a fixed grid of 1000
    points over ``[0, 1200]``.

    Returns:
        The configured ``plotly.graph_objects.Figure``.
    """

    def pair_distances(frame):
        # One distance per row, in row order.
        return [edit_distance_fn(pred=g_text, ref=e_text)
                for g_text, e_text in zip(frame["G_text"], frame["E_text"])]

    frames_by_split = {
        "Full": df_related,
        "Synthetic Backward": backward(),
        "Synthetic Forward": pd.concat([forward(), forward_from_backward()], axis=0,
                                       ignore_index=True),
        "Expert-labeled": golden(),
    }

    # Every curve is evaluated on the same x grid.
    grid = np.linspace(0, 1200, 1000)

    traces = []
    for split, frame in frames_by_split.items():
        density = gaussian_kde(pair_distances(frame))
        traces.append(
            go.Scatter(
                x=grid,
                y=density(grid),
                mode='lines',
                name=split,
                line=dict(color=colors[split], width=5),
            )
        )

    distance_axis = dict(
        title=dict(text="Edit Distance"),
        range=[0, 1200],
        showgrid=True,
        gridcolor='lightgrey',
    )
    density_axis = dict(
        title=dict(text="Probability Density"),
        range=[0, 0.004],
        showgrid=True,
        gridcolor='lightgrey',
        tickvals=[0.0005, 0.001, 0.0015, 0.002, 0.0025, 0.003, 0.0035, 0.004],
        tickformat=".4f",
    )

    fig = go.Figure(data=traces)
    fig.update_layout(
        bargap=0.1,
        xaxis=distance_axis,
        yaxis=density_axis,
        plot_bgcolor='rgba(0,0,0,0)',
        paper_bgcolor='rgba(0,0,0,0)',
        width=1100,
    )
    return fig
|
|
|
# JavaScript handed to ``gr.Blocks(js=...)`` below. When the page URL does not
# carry ``__theme=light`` it sets that query parameter and navigates to the
# updated URL, so the app is shown with the light theme.
force_light_theme_js_func = """
function refresh() {
    const url = new URL(window.location);

    if (url.searchParams.get('__theme') !== 'light') {
        url.searchParams.set('__theme', 'light');
        window.location.href = url.href;
    }
}
"""
|
|
|
if __name__ == '__main__':

    def dataset_view_tab(n_items):
        """Create the widgets of one example tab.

        Must be called inside an open ``gr.Blocks``/``gr.Tab`` context so the
        components attach to it.

        Args:
            n_items: Number of samples in the split; upper bound of the slider.

        Returns:
            ``(slider, view)`` where ``view`` lists the output components in
            the order returned by ``update_dataset_view``: annotated diff,
            initial message, edited message, commit link.
        """
        slider = gr.Slider(minimum=1, maximum=n_items, step=1, value=1,
                           label=f"Sample number (total: {n_items})")

        # ``HighlightedText`` is the canonical component name;
        # ``Highlightedtext`` is only a legacy alias for it.
        diff_view = gr.HighlightedText(combine_adjacent=True, color_map={'+': "green", '-': "red"})
        start_view = gr.Textbox(interactive=False, label="Initial message G", container=True)
        end_view = gr.Textbox(interactive=False, label="Edited message E", container=True)
        link_view = gr.Markdown()

        return slider, [diff_view, start_view, end_view, link_view]

    # (tab title, number of samples, handler mapping a sample number to the view values)
    example_tabs = (
        ("Manual", n_diffs_manual, update_dataset_view_manual),
        ("Synthetic Backward", n_diffs_synthetic_backward, update_dataset_view_synthetic_backward),
        ("Synthetic Forward (from initial)", n_diffs_synthetic_forward,
         update_dataset_view_synthetic_forward),
        ("Synthetic Forward (from backward)", n_diffs_synthetic_forward_backward,
         update_dataset_view_synthetic_forward_backward),
    )

    with gr.Blocks(theme=gr.themes.Soft(), js=force_light_theme_js_func) as application:
        gr.Markdown(parse_readme("README.md"))

        # Collected so each tab can also be filled once when the page loads.
        wired_tabs = []

        with gr.Tab("Examples Exploration"):
            for tab_title, n_items, handler in example_tabs:
                with gr.Tab(tab_title):
                    slider, view = dataset_view_tab(n_items)
                    slider.change(handler, inputs=slider, outputs=view)
                wired_tabs.append((handler, slider, view))

        with gr.Tab("Dataset Statistics"):
            gr.Markdown("## Number of examples per split")

            # Passed as a callable, so Gradio evaluates it when the page loads.
            number_of_pairs_gr_plot = gr.Plot(number_of_pairs_plot, label=None)

            gr.Markdown("## Edit Distance Distribution (w/o PyCharm Logs)")

            # Called here, so this figure is computed once at start-up.
            edit_distance_gr_plot = gr.Plot(edit_distance_plot(), label=None)

        # Show the sample selected by each slider's initial value on page load.
        for handler, slider, view in wired_tabs:
            application.load(handler, inputs=slider, outputs=view)

    application.launch()
|
|