XufengDuan's picture
update scripts
e157bd5
raw
history blame
4 kB
def is_summary_valid(summary: str) -> bool:
    """
    Decide whether a summary is usable.

    A value is accepted only when it is a string that splits (on whitespace)
    into five or more words. Anything that is not a string is rejected.

    Args:
        summary (str): The candidate summary.

    Returns:
        bool: True when the summary has at least five words, False otherwise.
    """
    # Guard clause: non-string values can never be valid.
    if not isinstance(summary, str):
        return False
    word_count = len(summary.split())
    return word_count >= 5
def create_pairs(df):
    """
    Creates pairs of source and summary from the dataframe.

    The 'source' and 'summary' columns are walked in lockstep, so every value
    keeps the dtype of its own column. (Row-wise iteration with ``iterrows``
    coerces each row to one common dtype, e.g. turning an integer source into
    a float when the summary column is float, and is much slower.)

    Args:
        df (DataFrame): The dataframe containing source and summary columns.

    Returns:
        list: A list of pairs [source, summary], one per row, in row order.

    Raises:
        KeyError: If the dataframe has rows but lacks a 'source' or
            'summary' column.
    """
    # A frame with no rows yields no pairs, even if the columns are absent;
    # this keeps the result the row-wise loop produced for empty input.
    if len(df.index) == 0:
        return []
    return [[source, summary] for source, summary in zip(df['source'], df['summary'])]
# def format_results(model_name: str, revision: str, precision: str,
# factual_consistency_rate: float, hallucination_rate: float,
# answer_rate: float, avg_summary_len: float) -> dict:
# """
# Formats the evaluation results into a structured dictionary.
#
# Args:
# model_name (str): The name of the evaluated model.
# revision (str): The revision hash of the model.
# precision (str): The precision with which the evaluation was run.
# factual_consistency_rate (float): The factual consistency rate.
# hallucination_rate (float): The hallucination rate.
# answer_rate (float): The answer rate.
# avg_summary_len (float): The average summary length.
#
# Returns:
# dict: A dictionary containing the structured evaluation results.
# """
# results = {
# "config": {
# "model_dtype": precision, # Precision with which you ran the evaluation
# "model_name": model_name, # Name of the model
# "model_sha": revision # Hash of the model
# },
# "results": {
# "hallucination_rate": {
# "hallucination_rate": round(hallucination_rate,3)
# },
# "factual_consistency_rate": {
# "factual_consistency_rate": round(factual_consistency_rate,1)
# },
# "answer_rate": {
# "answer_rate": round(answer_rate*100,1)
# },
# "average_summary_length": {
# "average_summary_length": round(avg_summary_len,1)
# },
# }
# }
#
# return results
def format_results(model_name: str, revision: str, precision: str, overall_js: float, overall_ci: tuple, **experiment_scores) -> dict:
    """
    Package evaluation output into a nested dictionary.

    The returned mapping has two sections: "config", describing the evaluated
    model, and "results", holding the overall JS divergence, its confidence
    interval, and every experiment-specific keyword argument under its own name.

    Args:
        model_name (str): The name of the evaluated model.
        revision (str): The revision hash of the model.
        precision (str): The precision with which the evaluation was run.
        overall_js (float): The overall average JS divergence.
        overall_ci (tuple): The confidence interval for the overall JS divergence.
        experiment_scores: Experiment-specific scores and confidence intervals
            (E1, E1_ci, E2, E2_ci, ...), stored as-is in the "results" section.

    Returns:
        dict: A dictionary with "config" and "results" sections.
    """
    config_section = {
        "model_dtype": precision,
        "model_name": model_name,
        "model_sha": revision,
    }
    results_section = {
        "overall_js_divergence": overall_js,
        "overall_confidence_interval": overall_ci,
    }
    # Experiment entries are merged after the overall metrics, so they follow
    # them in key order and a same-named keyword replaces the overall value.
    results_section.update(experiment_scores)
    return {"config": config_section, "results": results_section}