import evaluate
from datasets import Features, Value
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix

_CITATION = """ | |
@article{scikit-learn, | |
title={Scikit-learn: Machine Learning in {P}ython}, | |
author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V. | |
and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P. | |
and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and | |
Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.}, | |
journal={Journal of Machine Learning Research}, | |
volume={12}, | |
pages={2825--2830}, | |
year={2011} | |
} | |
""" | |
_DESCRIPTION = """ | |
This evaluator computes multiple classification metrics to assess the performance of a model. Metrics calculated include: | |
- Accuracy: The proportion of correct predictions among the total number of cases processed. Computed as (TP + TN) / (TP + TN + FP + FN), where TP, TN, FP, and FN denote true positives, true negatives, false positives, and false negatives respectively. | |
- Precision, Recall, and F1-Score: Evaluated for each class individually as well as macro (average across classes) and micro (aggregate contributions of all classes) averages. | |
- Confusion Matrix: A matrix representing the classification accuracy for each class combination. | |
""" | |
_KWARGS_DESCRIPTION = """ | |
Args: | |
predictions (`list` of `str`): Predicted labels. | |
references (`list` of `str`): Ground truth labels. | |
Returns: | |
Returns: | |
Dict containing: | |
accuracy (float): Proportion of correct predictions. Value ranges between 0 (worst) and 1 (best). | |
precision_macro (float), recall_macro (float), f1_macro (float): Macro averages of precision, recall, and F1-score respectively. | |
precision_micro (float), recall_micro (float), f1_micro (float): Micro averages of precision, recall, and F1-score respectively. | |
confusion_matrix (list of lists): 2D list representing the confusion matrix of the classification results. | |
""" | |


class ClassificationEvaluator(evaluate.Metric):
    def _info(self):
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=Features(
                {"predictions": Value("string"), "references": Value("string")}
            ),
        )

    def _compute(self, predictions, references, **eval_kwargs):
        # Overall accuracy: fraction of predictions that exactly match the references.
        accuracy = accuracy_score(references, predictions, normalize=True, sample_weight=None)
        # Macro averages: per-class precision/recall/F1 averaged with equal weight per class.
        precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(
            references, predictions, average='macro'
        )
        # Micro averages: precision/recall/F1 computed from counts pooled over all classes.
        precision_micro, recall_micro, f1_micro, _ = precision_recall_fscore_support(
            references, predictions, average='micro'
        )
        # Confusion matrix: rows correspond to reference labels, columns to predicted labels.
        conf_matrix = confusion_matrix(references, predictions)
        return {
            "accuracy": float(accuracy),
            "precision_macro": float(precision_macro),
            "recall_macro": float(recall_macro),
            "f1_macro": float(f1_macro),
            "precision_micro": float(precision_micro),
            "recall_micro": float(recall_micro),
            "f1_micro": float(f1_micro),
            "confusion_matrix": conf_matrix.tolist(),
        }
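

# Minimal usage sketch, assuming the evaluator can be instantiated directly
# (as with datasets-style metric classes) rather than loaded through
# evaluate.load(). For the inputs below, the expected results are an accuracy
# and micro averages of 2/3, macro precision and recall of 0.75, a macro F1 of
# 2/3, and a confusion matrix of [[1, 1], [0, 1]].
if __name__ == "__main__":
    evaluator = ClassificationEvaluator()
    results = evaluator.compute(
        predictions=["a", "b", "b"],
        references=["a", "a", "b"],
    )
    print(results)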