def is_summary_valid(summary: str) -> bool:
    """
    Checks if the summary is valid.

    A summary is valid if it is a string containing at least five words.
    Args:
        summary (str): The summary to check.

    Returns:
        bool: True if the summary is valid, False otherwise.
    """
    return isinstance(summary, str) and len(summary.split()) >= 5

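# Illustrative usage sketch (not part of the original module; the example
# strings are made up):
#
#   is_summary_valid("A summary of five words.")  # True  (5 words)
#   is_summary_valid("Too short.")                # False (2 words)
#   is_summary_valid(None)                        # False (not a string)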

def create_pairs(df):
    """
    Creates pairs of source and summary from the dataframe.

    Args:
        df (DataFrame): The dataframe containing source and summary columns.

    Returns:
        list: A list of pairs [source, summary].
    """
    pairs = []
    for _, row in df.iterrows():
        pairs.append([row['source'], row['summary']])

    return pairs

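# Illustrative usage sketch for create_pairs (assumes pandas is installed;
# the column values are made-up placeholders):
#
#   import pandas as pd
#   df = pd.DataFrame({"source": ["full article text"],
#                      "summary": ["short summary text"]})
#   create_pairs(df)  # -> [['full article text', 'short summary text']]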

def format_results(model_name: str, revision: str, precision: str,
                   overall_js: float, overall_ci: tuple,
                   **experiment_scores) -> dict:
    """
    Formats the evaluation results into a structured dictionary.

    Args:
        model_name (str): The name of the evaluated model.
        revision (str): The revision hash of the model.
        precision (str): The precision with which the evaluation was run.
        overall_js (float): The overall average JS divergence.
        overall_ci (tuple): The confidence interval for the overall JS divergence.
        **experiment_scores: Experiment-specific scores and confidence
            intervals, passed as keyword arguments (E1, E1_ci, E2, E2_ci, ...).

    Returns:
        dict: A dictionary containing the structured evaluation results.
    """
    # Initialize the base structure
    results = {
        "config": {
            "model_dtype": precision,  # Precision with which you ran the evaluation
            "model_name": model_name,  # Name of the model
            "model_sha": revision      # Hash of the model
        },
        "results": {
            "overall_js_divergence": overall_js,          # Overall JS divergence
            "overall_confidence_interval": overall_ci,    # Confidence interval for the overall JS divergence
        }
    }

    # Add experiment-specific results; each keyword is either a score
    # (e.g. E1) or its confidence interval (e.g. E1_ci)
    for exp_name, score in experiment_scores.items():
        results["results"][exp_name] = score

    return results
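

# Minimal usage sketch (illustrative only: the model name, revision hash, and
# scores below are made-up placeholders, not real evaluation results):
if __name__ == "__main__":
    demo = format_results(
        model_name="example-org/example-model",  # hypothetical model name
        revision="abc1234",                      # hypothetical revision hash
        precision="float16",
        overall_js=0.123,
        overall_ci=(0.10, 0.15),
        E1=0.111, E1_ci=(0.09, 0.13),            # per-experiment score + CI
        E2=0.222, E2_ci=(0.20, 0.25),
    )
    print(demo)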