Commit 073db2c • Parent(s): f5faae7
Petr Tsvetkov committed

Add noref gpt-eval to the pipeline
custom_metrics/gpt_eval.py CHANGED

@@ -20,6 +20,24 @@ lowest quality and 10 is the highest quality. Do not include any other text or e
 """
 
 
+def build_prompt_noref(prediction, diff):
+    return f"""Evaluate the following commit message based on clarity, specificity, context, and conciseness without
+providing any additional feedback or commentary:
+
+START OF THE COMMIT MESSAGE YOU HAVE TO EVALUATE
+{prediction}
+END OF THE COMMIT MESSAGE YOU HAVE TO EVALUATE
+
+These are the code changes included in the commit:
+START OF THE CODE CHANGES
+{diff}
+END OF THE CODE CHANGES
+
+YOUR TASK: Provide a single number as a response, representing the rating on a scale from 1 to 10, where 1 is the
+lowest quality and 10 is the highest quality. Do not include any other text or explanation in your response.
+"""
+
+
 N_RETRIES = 3
 
 
@@ -51,3 +69,13 @@ def compute_ref(prediction, reference, n_requests):
     ]
 
     return sum(results) / len(results)
+
+
+def compute_noref(prediction, diff, n_requests):
+    prompt = build_prompt_noref(prediction, diff)
+    results = [
+        get_number_for_prompt(prompt)
+        for _ in range(n_requests)
+    ]
+
+    return sum(results) / len(results)
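The new compute_noref mirrors the existing compute_ref helper: build the reference-free prompt, query the model n_requests times via get_number_for_prompt (defined elsewhere in this file), and average the returned ratings. Below is a minimal sketch of how it could be exercised; get_number_for_prompt and the prompt builder are stubbed, since the real GPT call and full template live outside this snippet.

import random

def get_number_for_prompt(prompt):
    # Stand-in for the real GPT-backed call in gpt_eval.py, which is
    # expected to return an integer rating in [1, 10]; here we sample.
    return random.randint(1, 10)

def build_prompt_noref(prediction, diff):
    # Trimmed stand-in for the prompt template added in this commit.
    return f"Rate this commit message given the code changes:\n{prediction}\n{diff}"

def compute_noref(prediction, diff, n_requests):
    prompt = build_prompt_noref(prediction, diff)
    results = [get_number_for_prompt(prompt) for _ in range(n_requests)]
    return sum(results) / len(results)

score = compute_noref(
    "Add noref gpt-eval to the pipeline",
    "+def compute_noref(prediction, diff, n_requests): ...",
    n_requests=3,
)
print(score)  # mean of three sampled ratings, e.g. 6.67

Averaging several independent requests smooths the sampling noise of a single GPT judgment; the 1/3/5-request variants registered in metrics_analysis.py below trade cost for stability.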
generation_steps/metrics_analysis.py CHANGED

@@ -80,10 +80,25 @@ def gptscore_ref_5_fn(pred, ref, **kwargs):
     return gpt_eval.compute_ref(prediction=pred, reference=ref, n_requests=5)
 
 
+def gptscore_noref_1_fn(pred, ref, **kwargs):
+    return gpt_eval.compute_noref(prediction=pred, diff=kwargs['diff'], n_requests=1)
+
+
+def gptscore_noref_3_fn(pred, ref, **kwargs):
+    return gpt_eval.compute_noref(prediction=pred, diff=kwargs['diff'], n_requests=3)
+
+
+def gptscore_noref_5_fn(pred, ref, **kwargs):
+    return gpt_eval.compute_noref(prediction=pred, diff=kwargs['diff'], n_requests=5)
+
+
 IND_METRICS = {
     "gptscore-ref-1-req": gptscore_ref_1_fn,
-    "gptscore-ref-3-req": gptscore_ref_3_fn,
+    # "gptscore-ref-3-req": gptscore_ref_3_fn,
     # "gptscore-ref-5-req": gptscore_ref_5_fn,
+    "gptscore-noref-1-req": gptscore_noref_1_fn,
+    # "gptscore-noref-3-req": gptscore_noref_3_fn,
+    # "gptscore-noref-5-req": gptscore_noref_5_fn,
     "editdist": edit_distance_fn,
     "bleu": bleu_fn,
     "meteor": meteor_fn,
@@ -111,7 +126,7 @@ def compute_metrics(df):
     tqdm.pandas()
 
     def apply_metric_fn_to_row(row, fn, col_pred, col_ref):
-        return fn(row[col_pred], row[col_ref], edittime=row['edit_time'])
+        return fn(row[col_pred], row[col_ref], edittime=row['edit_time'], diff=str(row['mods']))
 
     for metric in REL_METRICS:
         print(f"Computing {metric} for the related pairs")
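Every entry in IND_METRICS shares the signature (pred, ref, **kwargs), which is why apply_metric_fn_to_row can forward the extra columns (edit_time, and now the stringified mods diff) uniformly: each metric picks out the kwargs it needs and ignores the rest. A rough sketch of that dispatch follows; the dummy metric and the column values are illustrative, not the real pipeline data.

import pandas as pd

def dummy_noref_metric(pred, ref, **kwargs):
    # Stand-in for gptscore_noref_1_fn; a real run would call
    # gpt_eval.compute_noref(prediction=pred, diff=kwargs['diff'], n_requests=1).
    return 7.0

# Illustrative rows; the real dataframe comes from earlier pipeline steps,
# and the structure of 'mods' here is an assumption.
df = pd.DataFrame({
    "prediction": ["Add noref gpt-eval to the pipeline"],
    "reference": ["Add reference-free GPT evaluation"],
    "edit_time": [42],
    "mods": [[{"change_type": "MODIFY", "diff": "+def compute_noref(...)"}]],
})

def apply_metric_fn_to_row(row, fn, col_pred, col_ref):
    # Extra columns travel as kwargs: edit-time-based metrics read
    # `edittime`, the new noref GPT metrics read `diff`.
    return fn(row[col_pred], row[col_ref],
              edittime=row["edit_time"], diff=str(row["mods"]))

df["gptscore-noref-1-req"] = df.apply(
    apply_metric_fn_to_row, axis=1,
    fn=dummy_noref_metric, col_pred="prediction", col_ref="reference",
)
print(df["gptscore-noref-1-req"].iloc[0])  # 7.0

Passing unused kwargs to every metric keeps the dispatch loop generic: adding the noref metrics only required widening apply_metric_fn_to_row with diff=str(row['mods']), with no changes to the existing metric functions.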