def format_results(model_name: str, revision: str, precision: str, overall_js: float, overall_ci: tuple, **experiment_scores) -> dict:
    """
    Formats the evaluation results into a structured dictionary.

    Args:
        model_name (str): The name of the evaluated model.
        revision (str): The revision hash of the model.
        precision (str): The precision with which the evaluation was run.
        overall_js (float): The overall average JS divergence.
        overall_ci (tuple): The confidence interval for the overall JS divergence.
        experiment_scores: Experiment-specific scores and confidence intervals (E1, E1_ci, E2, E2_ci, ...).

    Returns:
        dict: A dictionary containing the structured evaluation results.
    """
    # Initialize the base structure
    results = {
        "config": {
            "model_dtype": precision,  # Precision with which you ran the evaluation
            "model_name": model_name,  # Name of the model
            #"model_sha": revision      # Hash of the model
        },
        "results": {
            "overall_js_divergence": overall_js,          # Overall JS divergence
            "overall_confidence_interval": overall_ci,    # Confidence interval for the overall JS divergence
        }
    }

    # Merge the experiment-specific entries into the results section. Each
    # keyword argument is either a score (e.g. E1) or its confidence
    # interval (e.g. E1_ci), so each is stored under its own key.
    for exp_name, value in experiment_scores.items():
        results["results"][exp_name] = value

    return results
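

# A minimal usage sketch. The model name, revision hash, and scores below are
# hypothetical placeholders for illustration only; real values would come from
# the evaluation pipeline.
if __name__ == "__main__":
    demo = format_results(
        model_name="example-org/example-model",  # hypothetical model name
        revision="abcdef0",                      # hypothetical revision hash
        precision="float16",
        overall_js=0.12,                         # hypothetical overall JS divergence
        overall_ci=(0.10, 0.14),                 # hypothetical confidence interval
        E1=0.08, E1_ci=(0.06, 0.10),             # hypothetical per-experiment score/CI
        E2=0.15, E2_ci=(0.12, 0.18),
    )
    print(demo)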