import pickle
from collections import Counter

import Levenshtein
import numpy as np
import pandas as pd
import plotly.figure_factory as ff
from scipy.stats import stats

import config


def get_statistics_for_sample(start_msg, end_msg, row=None):
    """Compute edit-operation statistics between two messages.

    The edit operations are obtained from ``Levenshtein.editops``. A
    ``replace`` operation is counted both as a deletion and as an insertion,
    but only once towards ``changes``.

    Args:
        start_msg: The message before editing.
        end_msg: The message after editing.
        row: Optional mapping-like record (e.g. a DataFrame row). When it
            contains an ``"editdist"`` entry, that value is reused instead of
            recomputing the edit distance.

    Returns:
        A dict with the keys ``deletions``, ``insertions``, ``changes``,
        ``deletions_norm`` (deletions / len(start_msg)), ``insertions_norm``
        (insertions / len(end_msg)), ``changes_norm``
        (changes / len(end_msg)), ``lendiff`` (absolute length difference)
        and ``editdist``.
    """
    # One pass over the edit script instead of one pass per operation kind.
    op_counts = Counter(
        op for op, _, _ in Levenshtein.editops(start_msg, end_msg)
    )
    n_replaces = op_counts["replace"]
    n_changes = op_counts["delete"] + op_counts["insert"] + n_replaces
    # A replacement removes one character and adds another, so it contributes
    # to both the deletion and the insertion totals.
    n_deletes = op_counts["delete"] + n_replaces
    n_inserts = op_counts["insert"] + n_replaces

    # Clamp the denominators so an empty message does not raise
    # ZeroDivisionError; results for non-empty messages are unaffected.
    start_len = max(len(start_msg), 1)
    end_len = max(len(end_msg), 1)

    # Reuse a precomputed distance only when the row actually carries one;
    # otherwise compute it here rather than failing with KeyError.
    if row is not None and "editdist" in row:
        edit_distance = row["editdist"]
    else:
        edit_distance = Levenshtein.distance(start_msg, end_msg)

    return {
        "deletions": n_deletes,
        "insertions": n_inserts,
        "changes": n_changes,
        "deletions_norm": n_deletes / start_len,
        "insertions_norm": n_inserts / end_len,
        "changes_norm": n_changes / end_len,
        "lendiff": abs(len(start_msg) - len(end_msg)),
        "editdist": edit_distance,
    }


def get_statistics_for_row(row):
    """Compute the sample statistics for a single record.

    The start message is read from ``commit_msg_start`` when that key is
    present and from ``G_text`` otherwise; the end message is read from
    ``commit_msg_end`` when present and from ``E_text`` otherwise.

    Args:
        row: Mapping-like record supporting ``in`` and item access.

    Returns:
        The dict produced by ``get_statistics_for_sample`` for this record.
    """
    start = row["commit_msg_start"] if "commit_msg_start" in row else row["G_text"]
    end = row["commit_msg_end"] if "commit_msg_end" in row else row["E_text"]
    return get_statistics_for_sample(start, end, row=row)


def get_statistics_for_df(df: pd.DataFrame):
    """Compute per-row statistics for a DataFrame, grouped by statistic name.

    Args:
        df: DataFrame whose rows are accepted by ``get_statistics_for_row``.
            Must contain at least one row.

    Returns:
        A dict mapping each statistic name to a NumPy array holding that
        statistic for every row, in row order.
    """
    # Named ``per_row_stats`` so the module-level ``stats`` import is not
    # shadowed inside this function.
    per_row_stats = [get_statistics_for_row(row) for _, row in df.iterrows()]
    assert len(per_row_stats) > 0, "cannot compute statistics for an empty DataFrame"
    return {
        stat_name: np.asarray([entry[stat_name] for entry in per_row_stats])
        for stat_name in per_row_stats[0]
    }


def build_plotly_chart(stat_golden, stat_e2s, stat_s2e, stat_e2s_s2e, stat_name):
    """Build a distribution plot comparing one statistic across datasets.

    Five curves are drawn: the four given series plus a ``Synthetic`` series
    that concatenates ``stat_e2s``, ``stat_s2e`` and ``stat_e2s_s2e``. The
    plotted data is also pickled to
    ``config.OUTPUT_CHARTS_DIR / f"{stat_name}_data.pkl"``.

    Args:
        stat_golden: Values of the statistic for the golden dataset.
        stat_e2s: Values of the statistic for the ``e2s`` dataset.
        stat_s2e: Values of the statistic for the ``s2e`` dataset.
        stat_e2s_s2e: Values of the statistic for the ``e2s+s2e`` dataset.
        stat_name: Name of the statistic; used as the chart title and as the
            prefix of the pickle file name.

    Returns:
        The figure created by ``plotly.figure_factory.create_distplot``.
    """
    synthetic = np.concatenate((stat_e2s, stat_s2e, stat_e2s_s2e), axis=0)
    hist_data = [stat_golden, stat_e2s, stat_s2e, stat_e2s_s2e, synthetic]
    group_labels = ['Golden', 'e2s', 's2e', 'e2s+s2e', 'Synthetic']

    fig = ff.create_distplot(
        hist_data,
        group_labels,
        bin_size=.05,
        show_rug=False,
        show_hist=False,
    )
    fig.update_layout(title_text=stat_name)

    # Persist the raw series next to the chart output.
    with open(config.OUTPUT_CHARTS_DIR / f"{stat_name}_data.pkl", "wb") as f:
        pickle.dump(hist_data, f)
    return fig