import json
import os
import re

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

from src.envs import EVAL_RESULTS_PATH


def parse_first_word(answer):
    """Return 'yes' or 'no' based on the first word of the answer, or None if neither."""
    # Extract the first word and check whether it starts with 'yes' or 'no'
    first_word = re.split(r'[\s,\.]', answer.lower())[0]
    if first_word.startswith('yes'):
        return 'yes'
    elif first_word.startswith('no'):
        return 'no'
    else:
        return None
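
# Illustrative behaviour of parse_first_word (examples added for clarity, not
# taken from real evaluation data):
#   parse_first_word("Yes, it is a chair.")  -> "yes"
#   parse_first_word("No.")                  -> "no"
#   parse_first_word("Maybe a table")        -> None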


def compute_metrics(true_labels, predicted_labels):
    """Compute accuracy, precision, recall, F1 and yes-ratio over valid binary answers."""
    # Keep only the predictions that parsed to a binary 'yes'/'no' answer
    valid_indices = [i for i, label in enumerate(predicted_labels) if label in ['yes', 'no']]
    filtered_true_labels = [true_labels[i] for i in valid_indices]
    filtered_predicted_labels = [predicted_labels[i] for i in valid_indices]

    # Calculate the metrics, treating 'yes' as the positive class
    accuracy = accuracy_score(filtered_true_labels, filtered_predicted_labels)
    precision, recall, f1_score, _ = precision_recall_fscore_support(
        filtered_true_labels, filtered_predicted_labels, average='binary', pos_label='yes')
    yes_ratio = filtered_predicted_labels.count('yes') / len(filtered_predicted_labels) if filtered_predicted_labels else 0

    return {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1_score,
        "Yes Ratio": yes_ratio
    }
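
# Illustrative call of compute_metrics with hand-made labels (not real
# evaluation output): compute_metrics(["yes", "no", "yes"], ["yes", "no", "no"])
# yields roughly accuracy 0.67, precision 1.0, recall 0.5, F1 0.67 and a
# yes-ratio of 0.33, since only the second positive example is missed.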


def aggregate_metrics(directory_path):
    """Collect ground-truth and predicted labels per question type and score them."""
    metrics_data = {"random": {"true": [], "pred": [], "invalid": []},
                    "popular": {"true": [], "pred": [], "invalid": []},
                    "adversarial": {"true": [], "pred": [], "invalid": []}}

    # Process each JSON answer file in the directory
    for filename in os.listdir(directory_path):
        if filename.endswith(".json"):
            file_path = os.path.join(directory_path, filename)
            with open(file_path, 'r') as f:
                data = json.load(f)
            # The question type ("random", "popular", "adversarial") is encoded
            # as the prefix of the file name, before the first underscore
            question_type = filename.split('_')[0]
            if question_type in metrics_data:
                for entry in data[next(iter(data))]:
                    first_word = parse_first_word(entry['predicted_answer'])
                    if first_word is None:
                        metrics_data[question_type]["invalid"].append(entry['predicted_answer'])
                    metrics_data[question_type]["true"].append(entry['ground_truth_answer'].lower())
                    metrics_data[question_type]["pred"].append(first_word if first_word else entry['predicted_answer'].lower())

    # Score each question type and attach the answers that could not be parsed
    results = {}
    for q_type, data in metrics_data.items():
        result = compute_metrics(data["true"], data["pred"])
        result["Non-Binary Responses Count"] = len(data["invalid"])
        result["Non-Binary Responses"] = data["invalid"]
        results[q_type] = result
    return results
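
# Expected answer-file layout, inferred from the parsing above (shown as an
# illustration; real files may carry additional fields):
#   <question_type>_<anything>.json, e.g. "random_answers.json", containing
#   {"<some top-level key>": [
#       {"ground_truth_answer": "Yes", "predicted_answer": "Yes, it is."},
#       ...
#   ]}
# Files whose prefix is not "random", "popular" or "adversarial" are skipped.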


def transform_format(data, model_name):
    """Convert the raw metrics dictionary into the results format written to disk."""
    # Define the new format's base structure
    transformed_data = {
        "config": {
            "model_name": model_name
        },
        "results": {}
    }

    # Mapping of old metric names to new key suffixes
    key_mapping = {
        "Accuracy": "accuracy",
        "Precision": "precision",
        "Recall": "recall",
        "F1 Score": "f1_score",
        "Yes Ratio": "yes_percentage"
    }

    # Iterate over each question type in the original data
    for model_type, metrics in data.items():
        for old_key, new_suffix in key_mapping.items():
            # Build the flattened key, e.g. "random_accuracy"
            new_key = f"{model_type}_{new_suffix}"
            # Assign the corresponding value (rounded if it is a float) under the new key
            transformed_data["results"][new_key] = {
                new_key: round(metrics[old_key], 4) if isinstance(metrics[old_key], float) else metrics[old_key]
            }
    return transformed_data
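
# Illustrative output shape of transform_format (values are placeholders):
#   {"config": {"model_name": "<model>"},
#    "results": {"random_accuracy": {"random_accuracy": 0.8123},
#                "random_precision": {"random_precision": 0.7901}, ...}}
# Keys not listed in key_mapping (e.g. the non-binary response counts) are
# not carried over into the transformed output.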


def calculate_metrics(json_output_directory, model_name):
    """Aggregate metrics for a model's answers and write them under EVAL_RESULTS_PATH/3d-pope."""
    final_metrics = aggregate_metrics(json_output_directory)
    transformed_metrics = transform_format(final_metrics, model_name)

    # Write the transformed metrics to the evaluation results directory
    results_path = os.path.join(EVAL_RESULTS_PATH, '3d-pope', model_name)
    os.makedirs(results_path, exist_ok=True)
    with open(os.path.join(results_path, 'results.json'), 'w') as f:
        json.dump(transformed_metrics, f, indent=4)

    print(json.dumps(final_metrics, indent=4))
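
# Minimal usage sketch (placeholder arguments, not values from this repo):
# replace the directory with the folder holding the per-question-type JSON
# answer files and the model name with the identifier to record in results.json.
if __name__ == "__main__":
    calculate_metrics(
        json_output_directory="path/to/json_answers",  # placeholder path
        model_name="example-model",                    # placeholder model name
    )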