|
import re |
|
|
|
""" |
|
number prediction |
|
metric: accuracy |
|
ιι’ζε |
|
""" |
|
def compute_jetq(data_dict):
    """Compute accuracy for the crime-amount extraction task.

    For every example the reference amount is parsed out of ``example["refr"]``
    (which must look like ``"上文涉及到的犯罪金额:<number>元。"``), and every number
    found in ``example["prediction"]`` is extracted. A prediction is counted
    as correct when the reference amount appears among those numbers.

    Args:
        data_dict: Iterable of dicts, each providing the keys
            ``"origin_prompt"``, ``"prediction"`` and ``"refr"``.

    Returns:
        A dict with two floats:
        ``"score"`` - fraction of examples whose prediction contains the
        reference amount;
        ``"abstention_rate"`` - fraction of examples whose prediction
        contains no number at all.
        Both are ``0.0`` when ``data_dict`` is empty.

    Raises:
        AssertionError: If a reference lacks the expected prefix or suffix,
            or still contains the units "千元" or "万" after the prefix is
            removed.
        ValueError: If the text left after stripping prefix and suffix is
            not a valid float literal.
    """
    # Accept both the full-width and the ASCII colon after the prefix text.
    prefixes = ("上文涉及到的犯罪金额:", "上文涉及到的犯罪金额:")
    suffix = "元。"
    # Compiled once here instead of being looked up on every iteration.
    number_pattern = re.compile(r"\d+\.?\d*")

    score_list, abstentions = [], 0

    for example in data_dict:
        question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"]
        assert answer.startswith(prefixes), f"answer: {answer}, question: {question}"
        assert answer.endswith(suffix), f"answer: {answer}, question: {question}"
        for prefix in prefixes:
            answer = answer.replace(prefix, "")

        # The reference must be expressed in plain yuan, not thousands or
        # ten-thousands, otherwise the numeric comparison below is meaningless.
        assert "千元" not in answer, f"answer: {answer}, question: {question}"
        assert "万" not in answer, f"answer: {answer}, question: {question}"

        answer = float(answer.replace(suffix, ""))

        prediction_digits = [float(digit) for digit in number_pattern.findall(prediction)]

        # A prediction without any number is an abstention; it is still
        # scored (as incorrect) below.
        if not prediction_digits:
            abstentions += 1
        score_list.append(1 if answer in prediction_digits else 0)

    # Guard against division by zero when no examples were supplied.
    if not score_list:
        return {"score": 0.0, "abstention_rate": 0.0}

    # Exactly one score is appended per example, so this is the example count.
    total = len(score_list)
    return {"score": sum(score_list) / total, "abstention_rate": abstentions / total}
|
|