Spaces:

JetBrains-Research
/

commit-message-editing-visualization

Sleeping

App Files Files Community

commit-message-editing-visualization / change_visualizer.py

saridormi

add a couple of plots and prettify README

d3a24ff 27 days ago

raw

history blame

10.8 kB

	import gradio as gr
	import pandas as pd

	import generate_annotated_diffs
	from evaluate.utils import parse_readme
	import numpy as np
	from scipy.stats import gaussian_kde
	import plotly.graph_objects as go

	from api_wrappers import hf_data_loader
	from generation_steps.metrics_analysis import edit_distance_fn

	# Hex colour assigned to each dataset split; both plot builders look their
	# trace colour up here by split name.
	colors = {
	    "Expert-labeled": "#C19C0B",
	    "Synthetic Backward": "#913632",
	    "Synthetic Forward": "#58136a",
	    "Full": "#000000",
	}


	# Loaded once at import time. The split accessors below (golden, backward,
	# forward, forward_from_backward) all filter this frame by its G_type / E_type
	# columns, and the dataset views read its annotated_diff column.
	df_related = generate_annotated_diffs.data_with_annotated_diffs()


	def golden():
	    """Return the rows of ``df_related`` whose G_type is "initial" and whose
	    E_type is "expert_labeled", re-indexed from 0."""
	    starts_from_initial = df_related['G_type'] == "initial"
	    ends_expert_labeled = df_related['E_type'] == "expert_labeled"
	    subset = df_related.loc[starts_from_initial & ends_expert_labeled]
	    return subset.reset_index(drop=True)


	def backward():
	    """Return the rows of ``df_related`` whose G_type is "synthetic_backward"
	    and whose E_type is "expert_labeled", re-indexed from 0."""
	    starts_from_backward = df_related['G_type'] == "synthetic_backward"
	    ends_expert_labeled = df_related['E_type'] == "expert_labeled"
	    subset = df_related.loc[starts_from_backward & ends_expert_labeled]
	    return subset.reset_index(drop=True)


	def forward():
	    """Return the rows of ``df_related`` whose G_type is "initial" and whose
	    E_type is "synthetic_forward", re-indexed from 0."""
	    starts_from_initial = df_related['G_type'] == "initial"
	    ends_synthetic_forward = df_related['E_type'] == "synthetic_forward"
	    subset = df_related.loc[starts_from_initial & ends_synthetic_forward]
	    return subset.reset_index(drop=True)


	def forward_from_backward():
	    """Return the rows of ``df_related`` whose G_type is "synthetic_backward"
	    and whose E_type is either "synthetic_forward" or
	    "synthetic_forward_from_backward", re-indexed from 0."""
	    starts_from_backward = df_related.G_type == "synthetic_backward"
	    ends_forward = df_related.E_type.isin(["synthetic_forward", "synthetic_forward_from_backward"])
	    subset = df_related.loc[starts_from_backward & ends_forward]
	    return subset.reset_index(drop=True)


	# Number of rows in each split, computed once at import time; these become
	# the slider maxima of the example tabs.
	(
	    n_diffs_manual,
	    n_diffs_synthetic_backward,
	    n_diffs_synthetic_forward,
	    n_diffs_synthetic_forward_backward,
	) = (len(select_split()) for select_split in (golden, backward, forward, forward_from_backward))

	def update_dataset_view(diff_idx, df):
	    """Collect the values shown for one sample of a split.

	    Args:
	        diff_idx: 1-based sample number (the slider value). It is coerced to
	            ``int`` so that a float such as ``2.0`` is accepted. Because the
	            position is ``diff_idx - 1``, a value of 0 selects the last row.
	        df: frame holding the split; must provide ``annotated_diff``, ``repo``
	            and ``hash`` columns plus either ``commit_msg_start`` /
	            ``commit_msg_end`` or ``G_text`` / ``E_text``.

	    Returns:
	        A 4-tuple ``(annotated_diff, start_message, end_message, commit_url)``.

	    Raises:
	        IndexError: if the position is outside ``df``.
	    """
	    # Materialise the row once instead of re-running df.iloc for every field.
	    row = df.iloc[int(diff_idx) - 1]

	    # Prefer the commit_msg_* columns when the frame has them, else G/E text.
	    start_col = 'commit_msg_start' if "commit_msg_start" in df.columns else 'G_text'
	    end_col = 'commit_msg_end' if "commit_msg_end" in df.columns else 'E_text'

	    commit_url = f"https://github.com/{row['repo']}/commit/{row['hash']}"
	    return row['annotated_diff'], row[start_col], row[end_col], commit_url


	def update_dataset_view_manual(diff_idx):
	    """Slider callback: view values for sample ``diff_idx`` of the ``golden()`` split."""
	    split = golden()
	    return update_dataset_view(diff_idx, split)


	def update_dataset_view_synthetic_backward(diff_idx):
	    """Slider callback: view values for sample ``diff_idx`` of the ``backward()`` split."""
	    split = backward()
	    return update_dataset_view(diff_idx, split)


	def update_dataset_view_synthetic_forward(diff_idx):
	    """Slider callback: view values for sample ``diff_idx`` of the ``forward()`` split."""
	    split = forward()
	    return update_dataset_view(diff_idx, split)

	def update_dataset_view_synthetic_forward_backward(diff_idx):
	    """Slider callback: view values for sample ``diff_idx`` of the
	    ``forward_from_backward()`` split."""
	    split = forward_from_backward()
	    return update_dataset_view(diff_idx, split)


	def _unrelated_splits(df_unrelated):
	    """Slice the unrelated pairs into the same four named splits used for the
	    related pairs, keyed by split name."""
	    g_type = df_unrelated['G_type']
	    e_type = df_unrelated['E_type']

	    from_initial = g_type == "initial"
	    from_backward = g_type == "synthetic_backward"
	    to_forward = e_type.isin(["synthetic_forward", "synthetic_forward_from_backward"])

	    return {
	        "Full": df_unrelated,
	        # Backward-generated starts whose end is not a forward-generated message.
	        "Synthetic Backward": df_unrelated.loc[from_backward & ~to_forward],
	        # Forward-generated ends, starting from either an initial or a
	        # backward-generated message.
	        "Synthetic Forward": df_unrelated.loc[
	            (from_initial & (e_type == "synthetic_forward")) | (from_backward & to_forward)
	        ],
	        "Expert-labeled": df_unrelated.loc[from_initial & (e_type == "expert_labeled")],
	    }


	def _pair_count_bar(split, count, related):
	    """Build the single-bar trace for one split; unrelated bars get a hatch overlay."""
	    marker = dict(color=colors[split])
	    if related:
	        name = f'{split} - Related pairs'
	    else:
	        name = f'{split} - Conditionally independent pairs'
	        # '/' is Plotly's diagonal-line pattern; it distinguishes the two
	        # bars of a split, which share one colour.
	        marker["pattern"] = dict(shape='/', fillmode='overlay', solidity=0.5)
	    return go.Bar(name=name, x=[split], y=[count], marker=marker)


	def number_of_pairs_plot():
	    """Build a stacked bar chart of the number of examples in each split.

	    Every split contributes two bars stacked on one x position: the count of
	    related pairs (taken from ``df_related`` and the split accessors) and the
	    count of "conditionally independent" pairs, i.e. the rows of
	    ``hf_data_loader.load_synthetic_as_pandas()`` whose ``is_related`` is false.
	    The synthetic dataset is loaded on every call.

	    Returns:
	        plotly.graph_objects.Figure: the configured figure.
	    """
	    related_plot_dict = {
	        "Full": df_related,
	        "Synthetic Backward": backward(),
	        "Synthetic Forward": pd.concat([forward(), forward_from_backward()], axis=0,
	                                       ignore_index=True),
	        "Expert-labeled": golden(),
	    }

	    df_unrelated = hf_data_loader.load_synthetic_as_pandas()
	    df_unrelated = df_unrelated.loc[~df_unrelated.is_related].copy()
	    unrelated_plot_dict = _unrelated_splits(df_unrelated)

	    traces = []
	    for split, related_rows in related_plot_dict.items():
	        traces.append(_pair_count_bar(split, len(related_rows), related=True))
	        traces.append(_pair_count_bar(split, len(unrelated_plot_dict[split]), related=False))

	    fig = go.Figure(data=traces)

	    fig.update_layout(
	        barmode='stack',
	        bargap=0.2,
	        xaxis=dict(
	            title="Split",
	            showgrid=True,
	            gridcolor='lightgrey'
	        ),
	        yaxis=dict(
	            title="Number of Examples",
	            showgrid=True,
	            gridcolor='lightgrey'
	        ),
	        # Horizontal legend anchored just above the plot area, right-aligned.
	        legend=dict(
	            title='Pair Type',
	            orientation='h',
	            yanchor='bottom',
	            y=1.02,
	            xanchor='right',
	            x=1
	        ),
	        # Fully transparent backgrounds.
	        plot_bgcolor='rgba(0,0,0,0)',
	        paper_bgcolor='rgba(0,0,0,0)',
	        width=1100,
	    )
	    return fig


	def _edit_distances(df):
	    """Return ``edit_distance_fn(pred=G_text, ref=E_text)`` for every row of ``df``."""
	    # Zipping the two columns avoids building a Series per row as iterrows() does.
	    return [edit_distance_fn(pred=g_text, ref=e_text)
	            for g_text, e_text in zip(df["G_text"], df["E_text"])]


	def edit_distance_plot():
	    """Build a line chart of the edit-distance density of each split.

	    For every split the distance between ``G_text`` and ``E_text`` is computed
	    per row with ``edit_distance_fn``, a Gaussian KDE is fitted to those
	    distances, and the density is drawn over the fixed range 0..1200.

	    Returns:
	        plotly.graph_objects.Figure: the configured figure.

	    Note:
	        ``gaussian_kde`` is fitted directly on each split's distances, so any
	        error it raises for a split propagates to the caller.
	    """
	    split_frames = {
	        "Full": df_related,
	        "Synthetic Backward": backward(),
	        "Synthetic Forward": pd.concat([forward(), forward_from_backward()], axis=0,
	                                       ignore_index=True),
	        "Expert-labeled": golden(),
	    }
	    df_edit_distance = {key: _edit_distances(frame) for key, frame in split_frames.items()}

	    # The evaluation grid is the same for every split, so build it once.
	    kde_x = np.linspace(0, 1200, 1000)

	    traces = []
	    for key, distances in df_edit_distance.items():
	        kde = gaussian_kde(distances)
	        traces.append(
	            go.Scatter(
	                x=kde_x,
	                y=kde(kde_x),
	                mode='lines',
	                name=key,
	                line=dict(color=colors[key], width=5)
	            )
	        )

	    fig = go.Figure(data=traces)

	    fig.update_layout(
	        bargap=0.1,
	        xaxis=dict(
	            title=dict(text="Edit Distance"),
	            range=[0, 1200],  # matches the KDE evaluation grid
	            showgrid=True,
	            gridcolor='lightgrey'
	        ),
	        yaxis=dict(
	            title=dict(text="Probability Density"),
	            range=[0, 0.004],
	            showgrid=True,
	            gridcolor='lightgrey',
	            tickvals=[0.0005, 0.001, 0.0015, 0.002, 0.0025, 0.003, 0.0035, 0.004],
	            tickformat=".4f"
	        ),
	        # Fully transparent backgrounds.
	        plot_bgcolor='rgba(0,0,0,0)',
	        paper_bgcolor='rgba(0,0,0,0)',
	        width=1100,
	    )
	    return fig

	# JavaScript source passed to gr.Blocks(js=...) in the entry point below.
	# The function checks the page URL's `__theme` query parameter and, when it
	# is not 'light', sets it to 'light' and navigates to the updated URL.
	force_light_theme_js_func = """
	function refresh() {
	const url = new URL(window.location);

	if (url.searchParams.get('__theme') !== 'light') {
	url.searchParams.set('__theme', 'light');
	window.location.href = url.href;
	}
	}
	"""

	# Script entry point: assemble the Gradio interface and start serving it.
	if __name__ == '__main__':
	with gr.Blocks(theme=gr.themes.Soft(), js=force_light_theme_js_func) as application:

	# Page header: whatever parse_readme returns for README.md, shown as Markdown.
	gr.Markdown(parse_readme("README.md"))

	# Creates the widgets of one example tab: a sample slider running from 1 to
	# n_items and four views. Returns (slider, views); the views are listed in the
	# same order as the tuple produced by update_dataset_view (diff, start
	# message, end message, commit link).
	def dataset_view_tab(n_items):
	slider = gr.Slider(minimum=1, maximum=n_items, step=1, value=1,
	label=f"Sample number (total: {n_items})")

	# Diff view: '+' spans are coloured green and '-' spans red.
	diff_view = gr.Highlightedtext(combine_adjacent=True, color_map={'+': "green", '-': "red"})
	start_view = gr.Textbox(interactive=False, label="Initial message G", container=True)
	end_view = gr.Textbox(interactive=False, label="Edited message E", container=True)
	# Receives the commit URL string built by update_dataset_view.
	link_view = gr.Markdown()

	view = [
	diff_view,
	start_view,
	end_view,
	link_view
	]

	return slider, view

	# Example browser: one tab per split, each wiring its slider's change event
	# to the matching update_dataset_view_* callback.
	with gr.Tab("Examples Exploration"):
	with gr.Tab("Manual"):
	slider_manual, view_manual = dataset_view_tab(n_diffs_manual)

	slider_manual.change(update_dataset_view_manual,
	inputs=slider_manual,
	outputs=view_manual)

	with gr.Tab("Synthetic Backward"):
	slider_synthetic_backward, view_synthetic_backward = dataset_view_tab(n_diffs_synthetic_backward)

	slider_synthetic_backward.change(update_dataset_view_synthetic_backward,
	inputs=slider_synthetic_backward,
	outputs=view_synthetic_backward)

	with gr.Tab("Synthetic Forward (from initial)"):
	slider_synthetic_forward, view_synthetic_forward = dataset_view_tab(n_diffs_synthetic_forward)

	slider_synthetic_forward.change(update_dataset_view_synthetic_forward,
	inputs=slider_synthetic_forward,
	outputs=view_synthetic_forward)

	with gr.Tab("Synthetic Forward (from backward)"):
	slider_synthetic_forward_backward, view_synthetic_forward_backward = dataset_view_tab(n_diffs_synthetic_forward_backward)

	slider_synthetic_forward_backward.change(update_dataset_view_synthetic_forward_backward,
	inputs=slider_synthetic_forward_backward,
	outputs=view_synthetic_forward_backward)

	# Statistics: the two Plotly figures built by the functions defined above.
	with gr.Tab("Dataset Statistics"):

	gr.Markdown("## Number of examples per split")

	# NOTE(review): the function object itself is passed here, while the
	# edit-distance plot below is given the result of calling its builder
	# immediately. Presumably Gradio calls a callable value when the app loads;
	# confirm the difference is intentional.
	number_of_pairs_gr_plot = gr.Plot(number_of_pairs_plot, label=None)

	gr.Markdown("## Edit Distance Distribution (w/o PyCharm Logs)")

	edit_distance_gr_plot = gr.Plot(edit_distance_plot(), label=None)

	# On the app's load event, fill every example tab's views from its slider's
	# current value so the views are populated before any slider is moved.
	application.load(update_dataset_view_manual, inputs=slider_manual,
	outputs=view_manual)

	application.load(update_dataset_view_synthetic_backward, inputs=slider_synthetic_backward,
	outputs=view_synthetic_backward)

	application.load(update_dataset_view_synthetic_forward, inputs=slider_synthetic_forward,
	outputs=view_synthetic_forward)

	application.load(update_dataset_view_synthetic_forward_backward, inputs=slider_synthetic_forward_backward,
	outputs=view_synthetic_forward_backward)

	# Start the Gradio server with default launch options.
	application.launch()