Spaces:

XufengDuan
/

HumanLikeness

Sleeping

update scripts

d24f6e8 3 months ago

1.49 kB


	def format_results(model_name: str, revision: str, precision: str, overall_js: float, overall_ci: tuple, **experiment_scores) -> dict:
	    """Format evaluation results into a structured dictionary.

	    Args:
	        model_name: Name of the evaluated model; stored under
	            ``config["model_name"]``.
	        revision: Revision hash of the model. Accepted for interface
	            compatibility but not written to the output: the
	            ``model_sha`` entry is currently disabled.
	        precision: Precision the evaluation was run with; stored under
	            ``config["model_dtype"]``.
	        overall_js: Overall average JS divergence.
	        overall_ci: Confidence interval for the overall JS divergence.
	        **experiment_scores: Experiment-specific scores and confidence
	            intervals (E1, E1_ci, E2, E2_ci, ...). Each keyword is copied
	            verbatim into the ``"results"`` section.

	    Returns:
	        A dictionary with two sections: ``"config"`` (model metadata) and
	        ``"results"`` (the overall metrics followed by every
	        experiment-specific entry, in the order they were passed).
	    """
	    # NOTE: ``revision`` is not emitted; add ``"model_sha": revision`` to
	    # this section if the model hash should be part of the output again.
	    config = {
	        "model_dtype": precision,
	        "model_name": model_name,
	    }

	    # The overall metrics come first. Experiment entries are unpacked last,
	    # so a keyword that reuses one of the overall key names replaces that
	    # value, exactly as a key-by-key assignment after the fact would.
	    metrics = {
	        "overall_js_divergence": overall_js,
	        "overall_confidence_interval": overall_ci,
	        **experiment_scores,
	    }

	    return {"config": config, "results": metrics}