def is_summary_valid(summary: str) -> bool:
    """
    Checks if the summary is valid.

    A summary is valid if it is not empty and contains at least five words.

    Args:
        summary (str): The summary to check.

    Returns:
        bool: True if the summary is valid, False otherwise.
    """
    if isinstance(summary, str):
        words = summary.split()
        if len(words) >= 5:
            return True
    # print(summary)
    return False


def create_pairs(df):
    """
    Creates pairs of source and summary from the dataframe.

    Args:
        df (DataFrame): The dataframe containing source and summary columns.

    Returns:
        list: A list of pairs [source, summary].
    """
    pairs = []
    for _, row in df.iterrows():
        pairs.append([row['source'], row['summary']])

    return pairs


# def format_results(model_name: str, revision: str, precision: str,
#                    factual_consistency_rate: float, hallucination_rate: float,
#                    answer_rate: float, avg_summary_len: float) -> dict:
#     """
#     Formats the evaluation results into a structured dictionary.
#
#     Args:
#         model_name (str): The name of the evaluated model.
#         revision (str): The revision hash of the model.
#         precision (str): The precision with which the evaluation was run.
#         factual_consistency_rate (float): The factual consistency rate.
#         hallucination_rate (float): The hallucination rate.
#         answer_rate (float): The answer rate.
#         avg_summary_len (float): The average summary length.
#
#     Returns:
#         dict: A dictionary containing the structured evaluation results.
#     """
#     results = {
#         "config": {
#             "model_dtype": precision,  # Precision with which you ran the evaluation
#             "model_name": model_name,  # Name of the model
#             "model_sha": revision      # Hash of the model
#         },
#         "results": {
#             "hallucination_rate": {
#                 "hallucination_rate": round(hallucination_rate, 3)
#             },
#             "factual_consistency_rate": {
#                 "factual_consistency_rate": round(factual_consistency_rate, 1)
#             },
#             "answer_rate": {
#                 "answer_rate": round(answer_rate * 100, 1)
#             },
#             "average_summary_length": {
#                 "average_summary_length": round(avg_summary_len, 1)
#             },
#         }
#     }
#
#     return results


def format_results(model_name: str, revision: str, precision: str,
                   overall_js: float, overall_ci: tuple,
                   **experiment_scores) -> dict:
    """
    Formats the evaluation results into a structured dictionary.

    Args:
        model_name (str): The name of the evaluated model.
        revision (str): The revision hash of the model.
        precision (str): The precision with which the evaluation was run.
        overall_js (float): The overall average JS divergence.
        overall_ci (tuple): The confidence interval for the overall JS divergence.
        experiment_scores: Experiment-specific scores and confidence intervals
            (E1, E1_ci, E2, E2_ci, ...).

    Returns:
        dict: A dictionary containing the structured evaluation results.
    """
    # Initialize the base structure
    results = {
        "config": {
            "model_dtype": precision,  # Precision with which you ran the evaluation
            "model_name": model_name,  # Name of the model
            "model_sha": revision      # Hash of the model
        },
        "results": {
            "overall_js_divergence": overall_js,        # Overall JS divergence
            "overall_confidence_interval": overall_ci,  # Confidence interval for the overall JS divergence
        }
    }

    # Add experiment-specific results to the dictionary
    for exp_name, score in experiment_scores.items():
        results["results"][exp_name] = score  # Add each experiment score and its CI

    return results
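

# --- Usage sketch (illustrative only) ---
# A minimal example of how these helpers could be wired together, assuming a
# pandas DataFrame with "source" and "summary" columns. The model name,
# revision hash, and all scores below are hypothetical placeholders, not
# values produced by any real evaluation run.
if __name__ == "__main__":
    import pandas as pd

    # Hypothetical evaluation data with the columns create_pairs() expects.
    df = pd.DataFrame({
        "source": ["The quick brown fox jumps over the lazy dog near the river bank."],
        "summary": ["A fox jumps over a dog near a river."],
    })

    pairs = create_pairs(df)
    valid_summaries = [summary for _, summary in pairs if is_summary_valid(summary)]
    print(f"{len(valid_summaries)} of {len(pairs)} summaries are valid")

    # Illustrative scores only; a real run would compute these from model output.
    results = format_results(
        model_name="example-org/example-model",  # hypothetical model id
        revision="abc1234",                      # hypothetical commit hash
        precision="float16",
        overall_js=0.12,
        overall_ci=(0.10, 0.14),
        E1=0.11, E1_ci=(0.09, 0.13),
        E2=0.13, E2_ci=(0.11, 0.15),
    )
    print(results)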