import csv
import logging
import os

import pandas as pd

import src.envs as envs
import src.backend.util as util
from src.backend.model_operations import ResponseGenerator, EvaluationModel

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')


class Evaluator:
    """A class to evaluate summaries generated by a language model.

    Attributes:
        model (str): The name or path of the model.
        revision (str): The model revision.
        precision (str): The precision setting of the model.
        batch_size (int): Batch size for processing.
        device (str): The device to run the model on.
        no_cache (bool): Flag to disable caching.
        limit (int): Limit on the number of items to process.
        write_out (bool): Whether to write results to a file.
        output_base_path (str): Base path for output files.
        response_generator (ResponseGenerator): Instance for generating summaries.
        eval_model (EvaluationModel): Instance for evaluating summaries.
    """

    def __init__(self, model, revision, precision, batch_size,
                 device, no_cache, limit, write_out=True,
                 output_base_path='logs'):
        """Initializes the Evaluator with the given model and settings.

        Args:
            model (str): The name or path of the model.
            revision (str): The model revision.
            precision (str): The precision setting of the model.
            batch_size (int): Batch size for processing.
            device (str): The device to run the model on.
            no_cache (bool): Flag to disable caching.
            limit (int): Limit on the number of items to process.
            write_out (bool): Whether to write results to a file.
            output_base_path (str): Base path for output files.

        Raises:
            Exception: Re-raises any error from constructing the generator
                or evaluation model, after logging it.
        """
        self.model = model
        self.revision = revision
        self.precision = precision
        self.batch_size = batch_size
        self.device = device
        self.no_cache = no_cache
        self.limit = limit
        self.write_out = write_out
        self.output_base_path = output_base_path
        try:
            self.response_generator = ResponseGenerator(model, revision)
            self.eval_model = EvaluationModel()
        except Exception as e:
            # Log with lazy %-args so formatting only happens when emitted.
            logging.error("Error initializing Evaluator: %s", e)
            raise

    def evaluate(self):
        """Performs the evaluation process by generating summaries and
        computing metrics.

        Generates model responses for the dataset, uploads the raw CSV to the
        results repo, runs the human-likeness evaluation, and formats the
        per-experiment and overall JS-divergence scores.

        Returns:
            dict: A dictionary containing evaluation results.

        Raises:
            FileNotFoundError: If the dataset file cannot be found.
            Exception: Re-raises any other error after logging it.
        """
        try:
            # openpyxl is only needed here, so import it lazily.
            from openpyxl import load_workbook

            # Load the prompt workbook; the raw dataset path is handed to the
            # generator directly.
            df_prompt = load_workbook(filename=envs.PROMPT_PATH)

            self.generated_responses_df = self.response_generator.generate_response(
                envs.DATASET_PATH, df_prompt,
                save_path=f"./generation_results/{self.model}.csv")

            # Publish the raw generations so the leaderboard dataset stays in
            # sync with the scores computed below.
            envs.API.upload_file(
                path_or_fileobj=f"./generation_results/{self.model}.csv",
                path_in_repo=f"{self.model}.csv",
                repo_id=envs.RESULTS_REPO,
                repo_type="dataset",
            )

            # Start evaluating the model's results.
            self.humanlike = self.eval_model.evaluate_humanlike(
                self.generated_responses_df, envs.HUMAN_DATA,
                f"./generation_results/{self.model}.csv")
            all_results = self.humanlike

            # Prepare individual experiment scores and CIs.
            experiment_results = {}
            for exp, data in all_results['per_experiment'].items():
                experiment_results[f'{exp}'] = data['average_js_divergence']
                experiment_results[f'{exp}_ci'] = data['confidence_interval']

            # Fold everything into the leaderboard result format.
            results = util.format_results(
                model_name=self.model,
                revision=self.revision,
                precision=self.precision,
                overall_js=all_results['overall']['average_js_divergence'],
                overall_ci=all_results['overall']['confidence_interval'],
                **experiment_results  # Unpack the experiment results
            )
            return results
        except FileNotFoundError:
            logging.error("File not found: %s", envs.DATASET_PATH)
            raise
        except Exception as e:
            logging.error("Error during evaluation: %s", e)
            raise

    def write_results(self):
        """Appends the freshly generated responses (and their scores) to the
        local leaderboard CSV files.

        Must be called after evaluate(), which populates
        self.generated_responses_df.

        Raises:
            FileNotFoundError: If the 'Humanlike Leaderboard Results' folder
                has not been downloaded into the leaderboard directory yet.
        """
        print('Updating result files')
        leaderboard_path = os.getcwd()  # the path of leaderboard folder
        print(leaderboard_path)
        working_path = os.path.join(leaderboard_path, 'Humanlike Leaderboard Results')
        if not os.path.exists(working_path):
            logging.error("Need to first download the results from google drive to the learderboard folder")
            # BUG FIX: the original used a bare `raise` with no active
            # exception, which itself raises
            # RuntimeError('No active exception to re-raise').
            raise FileNotFoundError(working_path)

        source_response_df = self.generated_responses_df[["user_prompt", "response"]]

        # Update leaderboard_responses.csv. Work on a copy so insert() never
        # mutates a slice of self.generated_responses_df
        # (avoids SettingWithCopyWarning).
        leaderboard_responses_df = source_response_df.copy()
        leaderboard_responses_df.insert(
            2, "model", [self.model] * leaderboard_responses_df.shape[0])
        leaderboard_responses_df.to_csv(
            os.path.join(working_path, 'leaderboard_responses.csv'),
            mode='a', index=False, header=False)
        print('leaderboard_responses.csv has been updated')

        # Update leaderboard_responses_with_scores.csv.
        # NOTE(review): self.eval_results is never assigned anywhere in this
        # class, so the next line raises AttributeError at runtime — confirm
        # where the per-response scores should come from (likely a value
        # produced inside evaluate()).
        leaderboard_responses_with_scores_df = pd.DataFrame.from_dict(self.eval_results)
        leaderboard_responses_with_scores_df.insert(
            3, "model",
            [self.model] * leaderboard_responses_with_scores_df.shape[0])
        leaderboard_responses_with_scores_df.to_csv(
            os.path.join(working_path, 'leaderboard_responses_with_scores.csv'),
            mode='a', index=False, header=False)
        print('leaderboard_responses_with_scores.csv has been updated')