XufengDuan's picture
update scripts
d24f6e8
raw
history blame
1.49 kB
def format_results(model_name: str, revision: str, precision: str, overall_js: float, overall_ci: tuple, **experiment_scores) -> dict:
    """Format the evaluation results into a structured dictionary.

    Args:
        model_name (str): The name of the evaluated model.
        revision (str): The revision hash of the model. Accepted so callers
            keep a stable signature, but it is currently not written to the
            output because the ``model_sha`` entry is disabled.
        precision (str): The precision with which the evaluation was run.
        overall_js (float): The overall average JS divergence.
        overall_ci (tuple): The confidence interval for the overall JS
            divergence.
        **experiment_scores: Experiment-specific scores and confidence
            intervals (E1, E1_ci, E2, E2_ci, ...). Each keyword is copied
            as-is into the ``"results"`` section, after the overall entries;
            a keyword that reuses one of the overall key names replaces
            that entry.

    Returns:
        dict: A dictionary with two sections: ``"config"`` (``model_dtype``,
        ``model_name``) and ``"results"`` (``overall_js_divergence``,
        ``overall_confidence_interval``, plus every experiment keyword).
    """
    results = {
        "config": {
            "model_dtype": precision,  # Precision with which the evaluation was run
            "model_name": model_name,  # Name of the model
            # "model_sha": revision  # Hash of the model (disabled; not emitted)
        },
        "results": {
            "overall_js_divergence": overall_js,
            "overall_confidence_interval": overall_ci,
        },
    }

    # Merge the experiment-specific scores and CIs in the order they were passed.
    results["results"].update(experiment_scores)
    return results