"""Gradio app for browsing annotated message diffs and plotting dataset statistics."""

import gradio as gr
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from evaluate.utils import parse_readme
from scipy.stats import gaussian_kde

import generate_annotated_diffs
from api_wrappers import hf_data_loader
from generation_steps.metrics_analysis import edit_distance_fn

# Plot colour for every split shown in the statistics tab.
colors = {
    "Expert-labeled": "#C19C0B",
    "Synthetic Backward": "#913632",
    "Synthetic Forward": "#58136a",
    "Full": "#000000",
}

# E_type values that count as "forward" generations made from a backward message.
_FORWARD_E_TYPES = ["synthetic_forward", "synthetic_forward_from_backward"]

df_related = generate_annotated_diffs.data_with_annotated_diffs()


def golden() -> pd.DataFrame:
    """Return rows with G_type "initial" and E_type "expert_labeled"."""
    mask = (df_related["G_type"] == "initial") & (df_related["E_type"] == "expert_labeled")
    return df_related.loc[mask].reset_index(drop=True)


def backward() -> pd.DataFrame:
    """Return rows with G_type "synthetic_backward" and E_type "expert_labeled"."""
    mask = (df_related["G_type"] == "synthetic_backward") & (df_related["E_type"] == "expert_labeled")
    return df_related.loc[mask].reset_index(drop=True)


def forward() -> pd.DataFrame:
    """Return rows with G_type "initial" and E_type "synthetic_forward"."""
    mask = (df_related["G_type"] == "initial") & (df_related["E_type"] == "synthetic_forward")
    return df_related.loc[mask].reset_index(drop=True)


def forward_from_backward() -> pd.DataFrame:
    """Return rows with G_type "synthetic_backward" and a forward E_type."""
    mask = (df_related["G_type"] == "synthetic_backward") & df_related["E_type"].isin(_FORWARD_E_TYPES)
    return df_related.loc[mask].reset_index(drop=True)


n_diffs_manual = len(golden())
n_diffs_synthetic_backward = len(backward())
n_diffs_synthetic_forward = len(forward())
n_diffs_synthetic_forward_backward = len(forward_from_backward())


def update_dataset_view(diff_idx, df: pd.DataFrame) -> tuple:
    """Build the values shown for one sample of ``df``.

    Args:
        diff_idx: 1-based sample number (the slider value).
        df: Frame holding the samples of the selected split.

    Returns:
        A 4-tuple: the annotated diff, the initial message, the edited
        message and the URL of the commit on GitHub.

    Raises:
        IndexError: If ``diff_idx`` points outside ``df``.
    """
    # A slider may deliver its value as a float, which ``iloc`` refuses to
    # accept as a positional key, so coerce before converting to 0-based.
    row = df.iloc[int(diff_idx) - 1]

    # Prefer the dedicated commit-message columns when the frame has them.
    start_msg = row["commit_msg_start"] if "commit_msg_start" in df.columns else row["G_text"]
    end_msg = row["commit_msg_end"] if "commit_msg_end" in df.columns else row["E_text"]

    return (
        row["annotated_diff"],
        start_msg,
        end_msg,
        f"https://github.com/{row['repo']}/commit/{row['hash']}",
    )


def update_dataset_view_manual(diff_idx):
    """Return the view values for sample ``diff_idx`` of the ``golden()`` split."""
    return update_dataset_view(diff_idx, golden())


def update_dataset_view_synthetic_backward(diff_idx):
    """Return the view values for sample ``diff_idx`` of the ``backward()`` split."""
    return update_dataset_view(diff_idx, backward())


def update_dataset_view_synthetic_forward(diff_idx):
    """Return the view values for sample ``diff_idx`` of the ``forward()`` split."""
    return update_dataset_view(diff_idx, forward())


def update_dataset_view_synthetic_forward_backward(diff_idx):
    """Return the view values for sample ``diff_idx`` of ``forward_from_backward()``."""
    return update_dataset_view(diff_idx, forward_from_backward())


def _all_forward() -> pd.DataFrame:
    """Concatenate ``forward()`` and ``forward_from_backward()`` into one frame."""
    return pd.concat([forward(), forward_from_backward()], axis=0, ignore_index=True)


def number_of_pairs_plot():
    """Build a stacked bar chart with the number of pairs in every split.

    Each split gets one solid bar for the related pairs (taken from
    ``df_related``) and one hatched bar for the pairs that
    ``hf_data_loader.load_synthetic_as_pandas()`` marks as not related.

    Returns:
        The plotly figure.
    """
    related_plot_dict = {
        "Full": df_related,
        "Synthetic Backward": backward(),
        "Synthetic Forward": _all_forward(),
        "Expert-labeled": golden(),
    }

    df_unrelated = hf_data_loader.load_synthetic_as_pandas()
    df_unrelated = df_unrelated.loc[~df_unrelated.is_related].copy()

    g_initial = df_unrelated["G_type"] == "initial"
    g_backward = df_unrelated["G_type"] == "synthetic_backward"
    e_any_forward = df_unrelated["E_type"].isin(_FORWARD_E_TYPES)

    unrelated_plot_dict = {
        "Full": df_unrelated,
        "Synthetic Backward": df_unrelated.loc[g_backward & ~e_any_forward],
        "Synthetic Forward": df_unrelated.loc[
            (g_initial & (df_unrelated["E_type"] == "synthetic_forward"))
            | (g_backward & e_any_forward)
        ],
        "Expert-labeled": df_unrelated.loc[
            g_initial & (df_unrelated["E_type"] == "expert_labeled")
        ],
    }

    traces = []
    for split in related_plot_dict:
        related_count = len(related_plot_dict[split])
        unrelated_count = len(unrelated_plot_dict[split])

        traces.append(
            go.Bar(
                name=f'{split} - Related pairs',
                x=[split],
                y=[related_count],
                marker=dict(color=colors[split]),
            )
        )
        traces.append(
            go.Bar(
                name=f'{split} - Conditionally independent pairs',
                x=[split],
                y=[unrelated_count],
                marker=dict(
                    color=colors[split],
                    pattern=dict(
                        shape='/',  # diagonal hatching
                        fillmode='overlay',
                        solidity=0.5,
                    ),
                ),
            )
        )

    fig = go.Figure(data=traces)
    fig.update_layout(
        barmode='stack',
        bargap=0.2,
        xaxis=dict(
            title="Split",
            showgrid=True,
            gridcolor='lightgrey',
        ),
        yaxis=dict(
            title="Number of Examples",
            showgrid=True,
            gridcolor='lightgrey',
        ),
        legend=dict(
            title='Pair Type',
            orientation='h',
            yanchor='bottom',
            y=1.02,
            xanchor='right',
            x=1,
        ),
        plot_bgcolor='rgba(0,0,0,0)',
        paper_bgcolor='rgba(0,0,0,0)',
        width=1100,
    )
    return fig


def _edit_distances(df: pd.DataFrame) -> list:
    """Apply ``edit_distance_fn`` to every (G_text, E_text) pair of ``df``."""
    return [
        edit_distance_fn(pred=g_text, ref=e_text)
        for g_text, e_text in zip(df["G_text"], df["E_text"])
    ]


def edit_distance_plot():
    """Build a line chart with one KDE curve of edit distances per split.

    Returns:
        The plotly figure.
    """
    df_edit_distance = {
        "Full": _edit_distances(df_related),
        "Synthetic Backward": _edit_distances(backward()),
        "Synthetic Forward": _edit_distances(_all_forward()),
        "Expert-labeled": _edit_distances(golden()),
    }

    # The evaluation grid is the same for every curve, so build it once.
    kde_x = np.linspace(0, 1200, 1000)

    traces = []
    for key, distances in df_edit_distance.items():
        kde = gaussian_kde(distances)
        traces.append(
            go.Scatter(
                x=kde_x,
                y=kde(kde_x),
                mode='lines',
                name=key,
                line=dict(color=colors[key], width=5),
            )
        )

    fig = go.Figure(data=traces)
    fig.update_layout(
        bargap=0.1,
        xaxis=dict(
            title=dict(text="Edit Distance"),
            range=[0, 1200],
            showgrid=True,
            gridcolor='lightgrey',
        ),
        yaxis=dict(
            title=dict(text="Probability Density"),
            range=[0, 0.004],
            showgrid=True,
            gridcolor='lightgrey',
            tickvals=[0.0005, 0.001, 0.0015, 0.002, 0.0025, 0.003, 0.0035, 0.004],
            tickformat=".4f",
        ),
        plot_bgcolor='rgba(0,0,0,0)',
        paper_bgcolor='rgba(0,0,0,0)',
        width=1100,
    )
    return fig


# JS snippet passed to ``gr.Blocks(js=...)``: it reloads the page with
# ``__theme=light`` in the query string unless that parameter is already set.
force_light_theme_js_func = (
    " function refresh() {"
    " const url = new URL(window.location);"
    " if (url.searchParams.get('__theme') !== 'light') {"
    " url.searchParams.set('__theme', 'light');"
    " window.location.href = url.href;"
    " }"
    " } "
)


def dataset_view_tab(n_items):
    """Create the widgets of one example-browsing tab.

    Must be called inside an active ``gr.Blocks`` context.

    Args:
        n_items: Number of samples in the split; used as the slider maximum.

    Returns:
        ``(slider, view)`` where ``view`` is the list of output components
        in the order produced by ``update_dataset_view``.
    """
    slider = gr.Slider(
        minimum=1,
        maximum=n_items,
        step=1,
        value=1,
        label=f"Sample number (total: {n_items})",
    )

    diff_view = gr.Highlightedtext(combine_adjacent=True, color_map={'+': "green", '-': "red"})
    start_view = gr.Textbox(interactive=False, label="Initial message G", container=True)
    end_view = gr.Textbox(interactive=False, label="Edited message E", container=True)
    link_view = gr.Markdown()

    return slider, [diff_view, start_view, end_view, link_view]


if __name__ == '__main__':
    # (tab title, number of samples, callback that fills the view)
    example_tabs = [
        ("Manual", n_diffs_manual, update_dataset_view_manual),
        ("Synthetic Backward", n_diffs_synthetic_backward, update_dataset_view_synthetic_backward),
        ("Synthetic Forward (from initial)", n_diffs_synthetic_forward, update_dataset_view_synthetic_forward),
        (
            "Synthetic Forward (from backward)",
            n_diffs_synthetic_forward_backward,
            update_dataset_view_synthetic_forward_backward,
        ),
    ]

    with gr.Blocks(theme=gr.themes.Soft(), js=force_light_theme_js_func) as application:
        gr.Markdown(parse_readme("README.md"))

        wired_views = []
        with gr.Tab("Examples Exploration"):
            for tab_title, n_items, updater in example_tabs:
                with gr.Tab(tab_title):
                    slider, view = dataset_view_tab(n_items)
                    slider.change(updater, inputs=slider, outputs=view)
                wired_views.append((updater, slider, view))

        with gr.Tab("Dataset Statistics"):
            gr.Markdown("## Number of examples per split")
            number_of_pairs_gr_plot = gr.Plot(number_of_pairs_plot, label=None)

            gr.Markdown("## Edit Distance Distribution (w/o PyCharm Logs)")
            edit_distance_gr_plot = gr.Plot(edit_distance_plot(), label=None)

        # Fill every tab with its first sample as soon as the page loads.
        for updater, slider, view in wired_views:
            application.load(updater, inputs=slider, outputs=view)

    application.launch()