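"""Metrics stage of the pipeline: attach reference messages, score each
commit-message pair with reference-based and reference-free metrics, and
export per-group metric correlations as CSV artifacts."""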
import functools
import operator

import Levenshtein
import evaluate
import pandas as pd
from tqdm import tqdm

import config
from api_wrappers import hf_data_loader
from custom_metrics import gpt_eval
# Reference-based metrics from the HF `evaluate` library. Each wrapper scores a
# single prediction/reference pair and shares the signature fn(pred, ref, **kwargs)
# so that all metrics can be dispatched uniformly from the tables below.
BLEU = evaluate.load("bleu", cache_dir=config.CACHE_DIR)

def bleu_fn(pred, ref, **kwargs):
    return BLEU.compute(predictions=[pred], references=[ref])["bleu"]

METEOR = evaluate.load("meteor", cache_dir=config.CACHE_DIR)

def meteor_fn(pred, ref, **kwargs):
    return METEOR.compute(predictions=[pred], references=[ref])["meteor"]

ROUGE = evaluate.load("rouge", cache_dir=config.CACHE_DIR)

def rouge1_fn(pred, ref, **kwargs):
    return ROUGE.compute(predictions=[pred], references=[ref])["rouge1"]

def rouge2_fn(pred, ref, **kwargs):
    return ROUGE.compute(predictions=[pred], references=[ref])["rouge2"]

def rougeL_fn(pred, ref, **kwargs):
    return ROUGE.compute(predictions=[pred], references=[ref])["rougeL"]

BERTSCORE = evaluate.load("bertscore", cache_dir=config.CACHE_DIR)

def bertscore_fn(pred, ref, **kwargs):
    # `bertscore` returns per-example lists; take the F1 of the single pair.
    return BERTSCORE.compute(predictions=[pred], references=[ref],
                             model_type="distilbert-base-uncased")["f1"][0]
# chrF and TER are sacrebleu-based and expect a list of references per
# prediction, hence the nested [[ref]].
CHRF = evaluate.load("chrf", cache_dir=config.CACHE_DIR)

def chrf_fn(pred, ref, **kwargs):
    return CHRF.compute(predictions=[pred], references=[[ref]])["score"]

TER = evaluate.load("ter", cache_dir=config.CACHE_DIR)

def ter_fn(pred, ref, **kwargs):
    return TER.compute(predictions=[pred], references=[[ref]])["score"]
def edit_distance_fn(pred, ref, **kwargs):
    # Character-level Levenshtein distance between the two messages.
    return Levenshtein.distance(pred, ref)

def edit_time_fn(pred, ref, **kwargs):
    # Editing time is precomputed per row and passed in through kwargs.
    return kwargs["edittime"]
# GPT-based scores: the ref variants judge the prediction against the reference
# message, the noref variants judge it against the diff alone; n_requests sets
# the number of model queries per pair.
def gptscore_ref_1_fn(pred, ref, **kwargs):
    return gpt_eval.compute_ref(prediction=pred, reference=ref, n_requests=1)

def gptscore_ref_3_fn(pred, ref, **kwargs):
    return gpt_eval.compute_ref(prediction=pred, reference=ref, n_requests=3)

def gptscore_ref_5_fn(pred, ref, **kwargs):
    return gpt_eval.compute_ref(prediction=pred, reference=ref, n_requests=5)

def gptscore_noref_1_fn(pred, ref, **kwargs):
    return gpt_eval.compute_noref(prediction=pred, diff=kwargs["diff"], n_requests=1)

def gptscore_noref_3_fn(pred, ref, **kwargs):
    return gpt_eval.compute_noref(prediction=pred, diff=kwargs["diff"], n_requests=3)

def gptscore_noref_5_fn(pred, ref, **kwargs):
    return gpt_eval.compute_noref(prediction=pred, diff=kwargs["diff"], n_requests=5)
# Metrics computed on independent pairs (message vs. dataset reference).
# The multi-request GPT variants are kept for reference but disabled.
IND_METRICS = {
    "gptscore-ref-1-req": gptscore_ref_1_fn,
    # "gptscore-ref-3-req": gptscore_ref_3_fn,
    # "gptscore-ref-5-req": gptscore_ref_5_fn,
    "gptscore-noref-1-req": gptscore_noref_1_fn,
    # "gptscore-noref-3-req": gptscore_noref_3_fn,
    # "gptscore-noref-5-req": gptscore_noref_5_fn,
    "editdist": edit_distance_fn,
    "bleu": bleu_fn,
    "meteor": meteor_fn,
    "rouge1": rouge1_fn,
    "rouge2": rouge2_fn,
    "rougeL": rougeL_fn,
    "bertscore": bertscore_fn,
    "chrF": chrf_fn,
    "ter": ter_fn,
}

# Metrics computed on related pairs (start vs. end version of a message).
REL_METRICS = {
    "editdist": edit_distance_fn,
    "edittime": edit_time_fn,
}
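
# Hedged usage sketch, not part of the original pipeline: because every metric
# shares the fn(pred, ref, **kwargs) signature, entries from either table can
# be dispatched uniformly. The helper name and strings below are made up.
def _metric_dispatch_example():
    pred, ref = "Fix typo in README", "Fix a typo in the README file"
    # Unused kwargs are simply ignored by metrics that don't need them.
    return {name: IND_METRICS[name](pred, ref, edittime=0.0, diff="")
            for name in ("bleu", "editdist")}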
def attach_references(df):
    """Left-join the reference message onto each row by its (hash, repo) key."""
    reference_df = hf_data_loader.load_full_commit_as_pandas().set_index(["hash", "repo"])[["reference"]]
    df = df.set_index(["hash", "repo"])
    return df.join(other=reference_df, how="left").reset_index()
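
# Hedged sketch of the join semantics above, using made-up toy frames instead
# of the real dataset: rows keep their order and gain a "reference" column,
# with NaN where no reference exists for a (hash, repo) key.
def _attach_references_example():
    preds = pd.DataFrame({"hash": ["a1", "b2"], "repo": ["r", "r"],
                          "commit_msg_start": ["msg 1", "msg 2"]})
    refs = pd.DataFrame({"hash": ["a1"], "repo": ["r"],
                         "reference": ["original message"]}).set_index(["hash", "repo"])
    return preds.set_index(["hash", "repo"]).join(refs, how="left").reset_index()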
def compute_metrics(df):
    tqdm.pandas()

    def apply_metric_fn_to_row(row, fn, col_pred, col_ref):
        # All per-row context goes through kwargs; each metric picks out what it needs.
        return fn(row[col_pred], row[col_ref], edittime=row["edit_time"], diff=str(row["mods"]))

    # Related pairs: the start vs. end version of the same commit message.
    for metric in REL_METRICS:
        print(f"Computing {metric} for the related pairs")
        metric_fn = REL_METRICS[metric]
        df[f"{metric}_related"] = df.progress_apply(
            lambda row: apply_metric_fn_to_row(row=row,
                                               fn=metric_fn,
                                               col_pred="commit_msg_start",
                                               col_ref="commit_msg_end"),
            axis=1
        )

    # Independent pairs: the start message vs. the dataset reference.
    for metric in IND_METRICS:
        print(f"Computing {metric} for the independent pairs")
        metric_fn = IND_METRICS[metric]
        df[f"{metric}_independent"] = df.progress_apply(
            lambda row: apply_metric_fn_to_row(row=row,
                                               fn=metric_fn,
                                               col_pred="commit_msg_start",
                                               col_ref="reference"),
            axis=1
        )

    # Dataset-level correlations; each scalar is broadcast across a full column
    # so it survives the CSV export next to the per-row metric values.
    for rel_metric in REL_METRICS:
        for ind_metric in IND_METRICS:
            df[f"rel_{rel_metric}_ind_{ind_metric}_pearson"] = (
                df[f"{rel_metric}_related"].corr(df[f"{ind_metric}_independent"], method="pearson"))
            df[f"rel_{rel_metric}_ind_{ind_metric}_spearman"] = (
                df[f"{rel_metric}_related"].corr(df[f"{ind_metric}_independent"], method="spearman"))
    return df
def correlations_for_group(group):
    """Return one row of rel/ind metric correlations for a group of rows."""
    correlations = []
    for rel_metric in REL_METRICS:
        for ind_metric in IND_METRICS:
            correlations.append({
                f"rel_{rel_metric}_ind_{ind_metric}_pearson": group[f"{rel_metric}_related"].corr(
                    group[f"{ind_metric}_independent"], method="pearson"),
                f"rel_{rel_metric}_ind_{ind_metric}_spearman": group[f"{rel_metric}_related"].corr(
                    group[f"{ind_metric}_independent"], method="spearman"),
            })
    # operator.ior folds the per-pair dicts into one flat mapping; wrapping it
    # in a Series lets groupby().apply() produce one row per group.
    return pd.Series(functools.reduce(operator.ior, correlations, {}))
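
# Hedged illustration, not used by the pipeline, of the merge idiom above:
# operator.ior on dicts is the in-place union operator (d1 |= d2, Python 3.9+),
# so reducing over a list of dicts collapses them into a single flat dict.
def _dict_merge_example():
    parts = [{"a_pearson": 0.9}, {"a_spearman": 0.8}]
    return functools.reduce(operator.ior, parts, {})  # {'a_pearson': 0.9, 'a_spearman': 0.8}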
def compute_correlations(df: pd.DataFrame):
    """Compute the metric correlations separately for every
    (end_to_start, start_to_end) group."""
    grouped_df = df.groupby(by=["end_to_start", "start_to_end"])
    correlations = grouped_df.apply(correlations_for_group, include_groups=False)
    return correlations
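
# Hedged sketch of the groupby/apply shape above, with toy columns rather than
# the real schema: a function returning a Series per group yields a DataFrame
# with one row per group and one column per Series entry.
def _groupwise_apply_example():
    toy = pd.DataFrame({"flag": [True, True, False, False],
                        "x_related": [1.0, 2.0, 3.0, 4.0],
                        "y_independent": [0.9, 0.7, 0.4, 0.1]})
    return toy.groupby("flag").apply(
        lambda g: pd.Series({"pearson": g["x_related"].corr(g["y_independent"])}),
        include_groups=False)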
def transform(df):
    print("Computing metrics")
    df = attach_references(df)
    df = compute_metrics(df)
    correlations_for_groups = compute_correlations(df)
    correlations_for_groups.to_csv(config.METRICS_CORRELATIONS_ARTIFACT)
    df.to_csv(config.SYNTHETIC_DATASET_ARTIFACT)
    print("Done")
    return df

def main():
    df = pd.read_csv(config.START_TO_END_ARTIFACT, index_col=[0])
    transform(df)

if __name__ == '__main__':
    main()