XufengDuan committed d24f6e8 (parent: c0bd00c): update scripts

Files changed:
- src/backend/evaluate_model.py +21 -38
- src/backend/model_operations.py +187 -740
- src/backend/util.py +1 -79
- src/display/about.py +19 -36
- src/display/utils.py +1 -1
- src/envs.py +0 -2
- src/leaderboard/read_evals.py +3 -4
- src/populate.py +0 -1
src/backend/evaluate_model.py
CHANGED
@@ -5,7 +5,7 @@ import csv
 
 import src.envs as envs
 
-from src.backend.model_operations import
+from src.backend.model_operations import ResponseGenerator, EvaluationModel
 import src.backend.util as util
 
 logging.basicConfig(level=logging.INFO,
@@ -26,7 +26,7 @@ class Evaluator:
         limit (int): Limit on the number of items to process.
         write_out (bool): Whether to write results to a file.
         output_base_path (str): Base path for output files.
-
+        response_generator (ResponseGenerator): Instance for generating summaries.
         eval_model (EvaluationModel): Instance for evaluating summaries.
     """
     def __init__(self, model, revision, precision, batch_size,
@@ -56,8 +56,8 @@ class Evaluator:
         self.write_out = write_out
         self.output_base_path = output_base_path
         try:
-            self.
-            self.eval_model = EvaluationModel(
+            self.response_generator = ResponseGenerator(model, revision)
+            self.eval_model = EvaluationModel()
         except Exception as e:
             logging.error(f"Error initializing Evaluator: {e}")
             raise
@@ -81,10 +81,10 @@ class Evaluator:
             # print(envs.DATASET_PATH)
             # print(df.shape)
             # print(df.iloc[-1])
-            self.
+            self.generated_responses_df = self.response_generator.generate_response(envs.DATASET_PATH, df_prompt, save_path=f"./generation_results/{self.model}.csv")
             # exit()
-            #
-            # answer_rate = self.
+            # avg_response_len = self.response_generator.avg_length
+            # answer_rate = self.response_generator.answer_rate
             envs.API.upload_file(
                 path_or_fileobj=f"./generation_results/{self.model}.csv",
                 path_in_repo=f"{self.model}.csv",
@@ -93,7 +93,7 @@ class Evaluator:
             )
 
             '''Start evaluating the model's results'''
-            self.humanlike = self.eval_model.evaluate_humanlike(self.
+            self.humanlike = self.eval_model.evaluate_humanlike(self.generated_responses_df, envs.HUMAN_DATA, f"./generation_results/{self.model}.csv")
 
             all_results = self.humanlike
             # Prepare individual experiment scores and CIs
@@ -111,23 +111,6 @@ class Evaluator:
                 overall_ci=all_results['overall']['confidence_interval'],
                 **experiment_results  # Unpack the experiment results
             )
-
-            '''Original metrics'''
-
-            # self.hallucination_scores, self.eval_results = self.eval_model.evaluate_hallucination(
-            #     self.generated_summaries_df)
-            # factual_consistency_rate = self.eval_model.compute_factual_consistency_rate()
-            # hallucination_rate = self.eval_model.hallucination_rate
-            # factual_consistency_rate = 0
-            # answer_rate = 0
-            # avg_summary_len = 0
-            #
-            # results = util.format_results(model_name=self.model, revision=self.revision,
-            #                               precision=self.precision,
-            #                               factual_consistency_rate=factual_consistency_rate,
-            #                               hallucination_rate=self.humanlike,
-            #                               answer_rate=answer_rate,
-            #                               avg_summary_len=avg_summary_len)
             return results
         except FileNotFoundError:
             logging.error(f"File not found: {envs.DATASET_PATH}")
@@ -145,28 +128,28 @@ class Evaluator:
             logging.error(f"Need to first download the results from google drive to the learderboard folder")
             raise
 
-
+        source_response_df = self.generated_responses_df[["user_prompt", "response"]]
 
-        # #update
+        # #update leaderboard_responses.csv
         # #first remove previous results for the current model
-        # existing_df = pd.read_csv(os.path.join(working_path, '
+        # existing_df = pd.read_csv(os.path.join(working_path, 'leaderboard_responses.csv'), encoding='utf-8', sep="\t")
         # mask = existing_df['model'] == self.model
        # existing_df = existing_df[~mask]
        # # get new result
-
-
-
-        print('
+        leaderboard_responses_df = source_response_df
+        leaderboard_responses_df.insert(2, "model", [self.model]*leaderboard_responses_df.shape[0])
+        leaderboard_responses_df.to_csv(os.path.join(working_path, 'leaderboard_responses.csv'), mode='a', index=False, header=False)
+        print('leaderboard_responses.csv has been updated')
 
-        # update
+        # update leaderboard_responses_with_scores.csv
        # BUG: get error when opening the file
-        # existing_df = pd.read_csv(os.path.join(working_path, '
+        # existing_df = pd.read_csv(os.path.join(working_path, 'leaderboard_responses_with_scores.csv'),
        #                           encoding='utf-8', sep=",", on_bad_lines='warn', quotechar='"', quoting=2)
        # print(existing_df.shape)
        # mask = existing_df['model'] == self.model
        # existing_df = existing_df[~mask]
        # get new result
-
-
-
-        print('
+        leaderboard_responses_with_scores_df = pd.DataFrame.from_dict(self.eval_results)
+        leaderboard_responses_with_scores_df.insert(3, "model", [self.model]*leaderboard_responses_with_scores_df.shape[0])
+        leaderboard_responses_with_scores_df.to_csv(os.path.join(working_path, 'leaderboard_responses_with_scores.csv'), mode='a', index=False, header=False)
+        print('leaderboard_responses_with_scores.csv has been updated')
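Taken together, the evaluate_model.py changes swap the old summary/hallucination pipeline for a response-generation plus human-likeness pipeline. A minimal sketch of the new wiring, assuming the same envs constants and prompt DataFrame (df_prompt) used in the diff; the run_eval wrapper itself is illustrative and not part of the commit:

import pandas as pd

import src.envs as envs
from src.backend.model_operations import ResponseGenerator, EvaluationModel


def run_eval(model_id: str, revision: str, df_prompt: pd.DataFrame) -> dict:
    # Illustrative driver only; the real logic lives in Evaluator.evaluate().
    save_path = f"./generation_results/{model_id}.csv"

    # 1. Generate (or reload) the model's responses for the prompt set.
    generator = ResponseGenerator(model_id, revision)
    responses_df = generator.generate_response(envs.DATASET_PATH, df_prompt, save_path=save_path)

    # 2. Score human-likeness against the human reference data.
    #    Note: the diff constructs EvaluationModel with no arguments, although the
    #    class further down still declares __init__(self, model_path).
    eval_model = EvaluationModel()
    return eval_model.evaluate_humanlike(responses_df, envs.HUMAN_DATA, save_path)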
src/backend/model_operations.py
CHANGED
@@ -6,7 +6,7 @@ from pathlib import Path
 import requests
 import json
 
-import numpy as np
+# import numpy as np
 import pandas as pd
 import spacy
 from sentence_transformers import CrossEncoder
@@ -43,7 +43,7 @@ subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"], check=Tr
 # Load spacy model for word tokenization
 # nlp = spacy.load("en_core_web_sm")
 try:
-    nlp1 = spacy.load("
+    nlp1 = spacy.load("en_core_web_trf")
 except OSError:
     print("Could not load the model; continuing with the rest of the processing.")
 
@@ -55,22 +55,6 @@ logging.basicConfig(level=logging.INFO,
 
 
 
-# os.environ["HUGGINGFACE_API_KEY"] = envs.TOKEN
-
-def load_evaluation_model(model_path):
-    """Load the evaluation model from the given path
-
-    Args:
-        model_path (str): Path to the evaluation model
-
-    Returns:
-        CrossEncoder: The evaluation model
-    """
-    # model = CrossEncoder(model_path)
-    model = ""
-    return model
-
-
 class ModelLoadingException(Exception):
     """Exception raised for errors in loading a model.
 
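The last two hunks above switch the tokenizer to spaCy's transformer pipeline and drop the unused load_evaluation_model helper; the load itself is still only guarded by a bare try/except OSError that prints a notice. A more defensive variant is sketched below; the fallback to en_core_web_sm and the spacy.cli.download call are illustrative additions, not part of this commit:

import spacy


def load_spacy_pipeline(preferred: str = "en_core_web_trf",
                        fallback: str = "en_core_web_sm"):
    # Try the preferred pipeline first, then the smaller fallback; attempt a
    # download once for each before giving up.
    for name in (preferred, fallback):
        try:
            return spacy.load(name)
        except OSError:
            try:
                spacy.cli.download(name)
                return spacy.load(name)
            except Exception:
                continue
    raise OSError("No English spaCy pipeline could be loaded.")


nlp1 = load_spacy_pipeline()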
@@ -85,21 +69,21 @@ class ModelLoadingException(Exception):
|
|
85 |
super().__init__(f"{messages} id={model_id} revision={revision}")
|
86 |
|
87 |
|
88 |
-
class
|
89 |
-
"""A class to generate
|
90 |
|
91 |
Attributes:
|
92 |
model (str): huggingface/{model_id}
|
93 |
api_base (str): https://api-inference.huggingface.co/models/{model_id}
|
94 |
-
|
95 |
revision (str): Model revision.
|
96 |
-
avg_length (float): Average length of
|
97 |
-
answer_rate (float): Rate of non-empty
|
98 |
"""
|
99 |
|
100 |
def __init__(self, model_id, revision):
|
101 |
"""
|
102 |
-
Initializes the
|
103 |
|
104 |
Args:
|
105 |
model_id (str): Identifier for the model.
|
@@ -108,29 +92,28 @@ class SummaryGenerator:
|
|
108 |
self.model_id = model_id
|
109 |
self.model = f"huggingface/{model_id}"
|
110 |
self.api_base = f"https://api-inference.huggingface.co/models/{model_id}"
|
111 |
-
self.
|
112 |
self.revision = revision
|
113 |
self.avg_length = None
|
114 |
self.answer_rate = None
|
115 |
self.exceptions = None
|
116 |
self.local_model = None
|
117 |
|
118 |
-
def
|
119 |
-
"""Generate
|
120 |
-
Modify here to pull the model's generated results
|
121 |
Args:
|
122 |
dataset (DataFrame): DataFrame containing source docs.
|
123 |
|
124 |
Returns:
|
125 |
-
|
126 |
"""
|
127 |
exceptions = []
|
128 |
if (save_path is not None) and os.path.exists(save_path):
|
129 |
'''If the file already exists, read the previously generated test responses'''
|
130 |
-
self.
|
131 |
-
# print(self.
|
132 |
|
133 |
-
print(f'Loaded generated
|
134 |
else:
|
135 |
'''If the test file does not exist, call the specified model to generate it'''
|
136 |
# prompt = {}
|
@@ -193,9 +176,9 @@ class SummaryGenerator:
|
|
193 |
while True:
|
194 |
try:
|
195 |
'''Call the model'''
|
196 |
-
print(ID,'-',j,'-',ii)
|
197 |
|
198 |
-
_response = self.
|
199 |
# print(f"Finish index {index}")
|
200 |
break
|
201 |
except Exception as e:
|
@@ -221,7 +204,7 @@ class SummaryGenerator:
|
|
221 |
print(f"Error at index {i}: {e}")
|
222 |
time.sleep(wait_time)
|
223 |
try:
|
224 |
-
_response = self.
|
225 |
break
|
226 |
except Exception as ee:
|
227 |
exceptions.append(ee)
|
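Both hunks above route the request through self.send_request(system_prompt, _user_prompt), the method name introduced by this commit, inside the existing retry loop. The retry pattern itself is unchanged; a compact sketch of that pattern follows, with the helper name, attempt count, and back-off value being illustrative rather than the commit's exact values:

import time


def call_with_retries(request_fn, max_attempts: int = 5, wait_time: float = 3.0):
    # Retry a flaky API call a bounded number of times, then re-raise.
    last_error = None
    for _ in range(max_attempts):
        try:
            return request_fn()
        except Exception as e:  # the diff accumulates these in `exceptions`
            last_error = e
            time.sleep(wait_time)
    raise last_error


# Roughly how it would be used inside the generation loop:
# _response = call_with_retries(lambda: self.send_request(system_prompt, _user_prompt))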
@@ -236,45 +219,46 @@ class SummaryGenerator:
|
|
236 |
break
|
237 |
if i == 5:
|
238 |
#print(_response)
|
|
|
|
|
|
|
|
|
|
|
|
|
239 |
|
240 |
def extract_responses(text, trigger_words=None):
|
241 |
if trigger_words is None:
|
242 |
trigger_words = ["sure", "okay", "yes"]
|
243 |
|
244 |
try:
|
|
|
245 |
sentences = text.split('\n')
|
246 |
-
|
247 |
sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
|
248 |
-
|
249 |
sentences = [sentence.split(':', 1)[-1].strip() if ':' in sentence else sentence for
|
250 |
sentence in sentences]
|
251 |
-
|
|
|
|
|
|
|
|
|
252 |
_response1 = sentences[1].strip() if len(sentences) > 1 else None
|
253 |
_response2 = sentences[2].strip() if len(sentences) > 2 else None
|
254 |
else:
|
255 |
_response1 = sentences[0].strip() if len(sentences) > 0 else None
|
256 |
_response2 = sentences[1].strip() if len(sentences) > 1 else None
|
257 |
|
|
|
258 |
except Exception as e:
|
259 |
print(f"Error occurred: {e}")
|
260 |
_response1, _response2 = None, None
|
261 |
|
|
|
262 |
|
263 |
return _response1, _response2
|
264 |
|
265 |
_response1, _response2 = extract_responses(_response)
|
266 |
-
# if _response == None:
|
267 |
-
# _response1, _response2 = "", ""
|
268 |
-
# else:
|
269 |
-
# try:
|
270 |
-
# import re
|
271 |
-
# _response1,_response2 = re.split(r'\n\s*\n', _response.strip())
|
272 |
-
# except:
|
273 |
-
# _response1 = _response.split('\n\n')
|
274 |
-
# if len(_response) == 2:
|
275 |
-
# _response1, _response2 = _response[0], _response[1]
|
276 |
-
# else:
|
277 |
-
# _response1, _response2 = _response[0], ""
|
278 |
|
279 |
Experiment_ID.append(ID)
|
280 |
Questions_ID.append(q_column[j])
|
@@ -309,30 +293,26 @@ class SummaryGenerator:
|
|
309 |
Stimuli_1.append(Stimuli_1_column[j])
|
310 |
Item_ID.append(Item_column[j])
|
311 |
Condition.append(Condition_column[j])
|
312 |
-
#print(_response)
|
313 |
-
|
314 |
-
|
315 |
-
# exit()
|
316 |
|
317 |
# Sleep to prevent hitting rate limits too frequently
|
318 |
time.sleep(1)
|
319 |
|
320 |
-
self.
|
321 |
-
|
322 |
|
323 |
if save_path is not None:
|
324 |
-
print(f'Save
|
325 |
fpath = Path(save_path)
|
326 |
fpath.parent.mkdir(parents=True, exist_ok=True)
|
327 |
-
self.
|
328 |
|
329 |
self.exceptions = exceptions
|
330 |
# self._compute_avg_length()
|
331 |
# self._compute_answer_rate()
|
332 |
|
333 |
-
return self.
|
334 |
|
335 |
-
def
|
336 |
# Using Together AI API
|
337 |
using_together_api = False
|
338 |
together_ai_api_models = ['mixtral', 'dbrx', 'wizardlm']
|
@@ -354,17 +334,9 @@ class SummaryGenerator:
|
|
354 |
"model": self.model_id,
|
355 |
# "max_tokens": 4096,
|
356 |
'max_new_tokens': 100,
|
357 |
-
# "
|
358 |
# 'repetition_penalty': 1.1 if 'mixtral' in self.model_id.lower() else 1
|
359 |
}
|
360 |
-
# if 'mixtral' in self.model_id.lower():
|
361 |
-
# # payload['prompt'] = user_prompt
|
362 |
-
# # payload['prompt'] = "Write a summary of the following passage:\nPassage:\n" + user_prompt.split('Passage:\n')[-1] + '\n\nSummary:'
|
363 |
-
# payload['prompt'] = 'You must stick to the passage provided. Provide a concise summary of the following passage, covering the core pieces of information described:\nPassage:\n' + user_prompt.split('Passage:\n')[-1] + '\n\nSummary:'
|
364 |
-
# print(payload)
|
365 |
-
# else:
|
366 |
-
# payload['messages'] = [{"role": "system", "content": system_prompt},
|
367 |
-
# {"role": "user", "content": user_prompt}]
|
368 |
payload['messages'] = [{"role": "system", "content": system_prompt},
|
369 |
{"role": "user", "content": user_prompt}]
|
370 |
headers = {
|
@@ -462,82 +434,13 @@ class SummaryGenerator:
|
|
462 |
continue
|
463 |
|
464 |
raise Exception("All tokens failed.")
|
465 |
-
|
466 |
-
# print(self.api_base)
|
467 |
-
# mistralai/Mistral-7B-Instruct-v0.1
|
468 |
-
# https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.1
|
469 |
-
# Using HF API or download checkpoints
|
470 |
-
# try: # try use HuggingFace API
|
471 |
-
# from huggingface_hub import InferenceClient
|
472 |
-
# print("token_for_request:",envs.TOKEN)
|
473 |
-
# print(self.model_id)
|
474 |
-
# client = InferenceClient(self.model_id,api_key=envs.TOKEN,headers={"X-use-cache": "false"})
|
475 |
-
# messages = [{"role": "system", "content": system_prompt},{"role": "user", "content": user_prompt}]
|
476 |
-
# # outputs = client.chat_completion(messages, max_tokens=100)
|
477 |
-
# result = None
|
478 |
-
# while result is None:
|
479 |
-
# outputs = client.chat_completion(messages, max_tokens=100)
|
480 |
-
# result = outputs['choices'][0]['message']['content']
|
481 |
-
#
|
482 |
-
# if result is None:
|
483 |
-
# time.sleep(1) # Optional: Add a small delay before retrying
|
484 |
-
#
|
485 |
-
# return result
|
486 |
-
#
|
487 |
-
# except Exception as e:
|
488 |
-
# print(f"Error with TOKEN: {envs.TOKEN}, trying with TOKEN1")
|
489 |
-
# try:
|
490 |
-
# client = InferenceClient(self.model_id, api_key=envs.TOKEN1, headers={"X-use-cache": "false"})
|
491 |
-
# messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}]
|
492 |
-
# result = None
|
493 |
-
# while result is None:
|
494 |
-
# outputs = client.chat_completion(messages, max_tokens=100)
|
495 |
-
# result = outputs['choices'][0]['message']['content']
|
496 |
-
#
|
497 |
-
# if result is None:
|
498 |
-
# time.sleep(1) # Optional: Add a small delay before retrying
|
499 |
-
#
|
500 |
-
# return result
|
501 |
-
# except Exception as ee:
|
502 |
-
# print(f"Error with TOKEN1: {envs.TOKEN1}")
|
503 |
-
# raise ee
|
504 |
-
|
505 |
-
|
506 |
-
# except: # fail to call api. run it locally.
|
507 |
-
# self.tokenizer = AutoTokenizer.from_pretrained(self.model_id, trust_remote_code=True)
|
508 |
-
# print("Tokenizer loaded")
|
509 |
-
# self.local_model = AutoModelForCausalLM.from_pretrained(self.model_id, trust_remote_code=True, device_map="auto", torch_dtype="auto", cache_dir='/home/paperspace/cache')
|
510 |
-
# print("Local model loaded")
|
511 |
-
# response = litellm.completion(
|
512 |
-
# model="huggingface/"+'command-r-plus' if 'command' in self.model_id else self.model_id,
|
513 |
-
# messages=[{"role": "system", "content": system_prompt},
|
514 |
-
# {"role": "user", "content": user_prompt}],
|
515 |
-
# temperature=0.0,
|
516 |
-
# max_tokens=1024,
|
517 |
-
# api_base= "https://api-inference.huggingface.co/models/" + self.model_id,
|
518 |
-
# )
|
519 |
-
# self.model_id = 'command-r-plus' if 'command' in self.model_id else self.model_id
|
520 |
-
# response = litellm.completion(
|
521 |
-
# model="huggingface/" + self.model_id,
|
522 |
-
# # mistralai/Mistral-7B-Instruct-v0.1",
|
523 |
-
# messages=[{"role": "system", "content": system_prompt},
|
524 |
-
# {"role": "user", "content": user_prompt}],
|
525 |
-
# #temperature=0.0,
|
526 |
-
# max_tokens=1024,
|
527 |
-
# api_base="https://api-inference.huggingface.co/models/" + self.model_id)
|
528 |
-
# print("模型返回结果",response)
|
529 |
-
# print("模型返回结果结束")
|
530 |
-
# # exit()
|
531 |
-
# result = response['choices'][0]['message']['content']
|
532 |
-
# print(result)
|
533 |
-
# exit()
|
534 |
-
# Using Google AI API for Gemini models
|
535 |
elif 'gemini' in self.model_id.lower():
|
536 |
genai.configure(api_key=os.getenv('GOOGLE_AI_API_KEY'))
|
537 |
generation_config = {
|
538 |
-
"temperature": 0,
|
539 |
-
"top_p": 0.95, # cannot change
|
540 |
-
"top_k": 0,
|
541 |
"max_output_tokens": 100,
|
542 |
# "response_mime_type": "application/json",
|
543 |
}
|
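For Gemini models the hunk above keeps genai.configure(...) and trims the generation_config down to max_output_tokens (temperature, top_p and top_k are commented out in the new version). Below is a sketch of how such a config is typically passed to the google-generativeai client; the GenerativeModel/generate_content calls and the model name are assumptions, since the diff only shows the configuration step:

import os

import google.generativeai as genai

genai.configure(api_key=os.getenv("GOOGLE_AI_API_KEY"))

generation_config = {
    "max_output_tokens": 100,  # the only key the commit keeps active
}

# Hypothetical model name; the repository selects the model elsewhere.
model = genai.GenerativeModel("gemini-1.5-flash", generation_config=generation_config)
reply = model.generate_content("Say hello in one short sentence.")
print(reply.text)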
@@ -589,58 +492,28 @@ class SummaryGenerator:
|
|
589 |
# Using local model
|
590 |
|
591 |
|
592 |
-
def _compute_avg_length(self):
|
593 |
-
"""
|
594 |
-
Compute the average length of non-empty summaries using SpaCy.
|
595 |
-
"""
|
596 |
-
total_word_count = 0
|
597 |
-
total_count = 0
|
598 |
-
|
599 |
-
for summary in self.summaries_df['summary']:
|
600 |
-
if util.is_summary_valid(summary):
|
601 |
-
doc = nlp1(summary)
|
602 |
-
words = [token.text for token in doc if token.is_alpha]
|
603 |
-
total_word_count += len(words)
|
604 |
-
total_count += 1
|
605 |
-
|
606 |
-
self.avg_length = 0 if total_count == 0 else total_word_count / total_count
|
607 |
-
|
608 |
-
def _compute_answer_rate(self):
|
609 |
-
"""
|
610 |
-
Compute the rate of non-empty summaries.
|
611 |
-
"""
|
612 |
-
valid_count = sum(1 for summary in self.summaries_df['summary']
|
613 |
-
if util.is_summary_valid(summary))
|
614 |
-
|
615 |
-
total_count = len(self.summaries_df)
|
616 |
-
|
617 |
-
self.answer_rate = 0 if total_count == 0 else valid_count / total_count
|
618 |
|
619 |
|
620 |
class EvaluationModel:
|
621 |
-
"""A class to evaluate generated
|
622 |
|
623 |
Attributes:
|
624 |
model (CrossEncoder): The evaluation model.
|
625 |
-
scores (list): List of
|
626 |
-
|
627 |
-
|
|
|
628 |
"""
|
629 |
|
630 |
def __init__(self, model_path):
|
631 |
"""
|
632 |
-
Initializes the EvaluationModel
|
633 |
-
|
634 |
-
Args:
|
635 |
-
model_path (str): Path to the CrossEncoder model.
|
636 |
"""
|
637 |
-
self.model = load_evaluation_model(model_path)
|
638 |
self.scores = []
|
639 |
-
self.factual_consistency_rate = None
|
640 |
-
self.hallucination_rate = None
|
641 |
self.humanlike_score = None
|
642 |
|
643 |
-
def
|
644 |
'''code results from LLM's response'''
|
645 |
output = []
|
646 |
'''database for Exp4'''
|
@@ -661,28 +534,27 @@ class EvaluationModel:
|
|
661 |
Stimuli1[item5['ID'][j]] = item5['Stimuli-1'][j]
|
662 |
Stimuli2[item5['ID'][j]] = item5['Stimuli-2'][j]
|
663 |
|
664 |
-
|
665 |
male_keyword = ["he", "his", "himself"]
|
666 |
female_keyword = ["she", "her", "herself"]
|
667 |
-
print(len(
|
668 |
-
for i in range(len(
|
|
|
669 |
# vote_1_1, vote_1_2, vote_1_3 = 0, 0, 0
|
670 |
# print()
|
671 |
-
if pd.isna(
|
672 |
output.append("Other")
|
673 |
continue
|
674 |
-
rs =
|
675 |
-
|
676 |
-
|
677 |
-
|
678 |
-
|
679 |
-
|
680 |
-
rs =
|
|
|
681 |
'''Exp1'''
|
682 |
-
|
683 |
-
|
684 |
-
print("E1", rs)
|
685 |
-
rs = rs.replace('"','')
|
686 |
if rs == "round":
|
687 |
# vote_1_1 += 1
|
688 |
output.append("Round")
|
@@ -691,13 +563,12 @@ class EvaluationModel:
|
|
691 |
else:
|
692 |
output.append("Other")
|
693 |
|
694 |
-
|
695 |
'''Exp2'''
|
696 |
-
|
697 |
-
elif
|
698 |
-
# rs =
|
699 |
rs = rs.split(' ')
|
700 |
-
print("E2", rs)
|
701 |
male, female = 0, 0
|
702 |
for word in rs:
|
703 |
if word in female_keyword and male == 0:
|
@@ -708,323 +579,63 @@ class EvaluationModel:
|
|
708 |
male = 1
|
709 |
output.append("Male")
|
710 |
break
|
711 |
-
if male == 0 and female == 0
|
712 |
output.append("Other")
|
713 |
|
714 |
'''Exp3'''
|
715 |
-
|
716 |
-
|
717 |
-
# rs
|
718 |
-
|
719 |
-
|
720 |
-
output.append("Other")
|
721 |
-
else:
|
722 |
-
if summaries_df["Factor 2"][i].strip() == "LS":
|
723 |
-
if "2" in rs:
|
724 |
-
output.append("Long")
|
725 |
-
elif "3" in rs:
|
726 |
-
output.append("Short")
|
727 |
-
else:
|
728 |
-
output.append("Other")
|
729 |
-
if summaries_df["Factor 2"][i].strip() == "SL":
|
730 |
-
if "2" in rs:
|
731 |
-
output.append("Short")
|
732 |
-
elif "3" in rs:
|
733 |
-
output.append("Long")
|
734 |
-
else:
|
735 |
-
output.append("Other")
|
736 |
-
'''Exp4'''
|
737 |
-
|
738 |
-
elif summaries_df["Experiment"][i] == "E4":
|
739 |
-
# rs = summaries_df["Response"][i].strip()
|
740 |
-
target = summaries_df["Factor 2"][i].strip().lower()
|
741 |
-
pair = target + "_" + rs
|
742 |
-
print("E4:", pair)
|
743 |
-
if pair in wordpair2code.keys():
|
744 |
-
output.append(wordpair2code[pair])
|
745 |
-
else:
|
746 |
-
output.append("Other")
|
747 |
-
|
748 |
-
'''Exp5'''
|
749 |
-
elif summaries_df["Experiment"][i] == "E5" or summaries_df["Experiment"][i] == "E51":
|
750 |
-
# sentence = summaries_df["Response"][i].strip()
|
751 |
-
item_id = summaries_df["Item"][i]
|
752 |
-
question_id = summaries_df["Question_ID"][i]
|
753 |
-
|
754 |
-
sti1, sti2 = "", ""
|
755 |
-
|
756 |
-
if summaries_df["Experiment"][i] == "E51":
|
757 |
-
sti1 = Stimuli1[question_id[0:-1]].lower().replace("...", "")
|
758 |
-
sti2 = Stimuli2[question_id[0:-1]].lower().replace("...", "")
|
759 |
-
verb = item2verb1[item_id].lower()
|
760 |
|
761 |
-
|
762 |
-
print("E5", verb, sentence)
|
763 |
-
if summaries_df["Experiment"][i] == "E5":
|
764 |
-
sti1 = Stimuli1[question_id].lower().replace("...", "")
|
765 |
-
# print(sti1)
|
766 |
-
sti2 = Stimuli2[question_id].lower().replace("...", "")
|
767 |
-
|
768 |
-
verb = item2verb2[item_id].lower()
|
769 |
-
sentence = sti2.replace("...","") + " " + rs.replace(sti2, "")
|
770 |
-
print("E5", verb, sentence)
|
771 |
-
|
772 |
-
|
773 |
-
doc = nlp1(sentence.replace(" "," "))
|
774 |
-
# print(doc)
|
775 |
-
# print()
|
776 |
-
verb_token = None
|
777 |
-
for token in doc:
|
778 |
-
# print(token.lemma_)
|
779 |
-
if token.lemma_ == verb:
|
780 |
-
verb_token = token
|
781 |
-
break
|
782 |
-
# exit()
|
783 |
-
if verb_token is None:
|
784 |
output.append("Other")
|
785 |
-
print("E5 The target verb is missing from the sentence.")
|
786 |
else:
|
787 |
-
|
788 |
-
|
789 |
-
|
790 |
-
for child in verb_token.children:
|
791 |
-
print(child)
|
792 |
-
if (child.dep_ == 'dative' and child.pos_ == "ADP") or (child.text == "to" and child.dep_ == 'prep' and child.pos_ == "ADP"):
|
793 |
-
pobj = child.text
|
794 |
-
if child.dep_ == 'dative':
|
795 |
-
dative = child.text
|
796 |
-
print("E5", pobj, dative)
|
797 |
-
# exit()
|
798 |
-
|
799 |
-
if pobj:
|
800 |
-
output.append("PO")
|
801 |
-
elif dative:
|
802 |
-
output.append("DO")
|
803 |
-
else:
|
804 |
-
print("Other", sentence, pobj, dative)
|
805 |
-
# exit()
|
806 |
output.append("Other")
|
807 |
-
|
808 |
-
|
809 |
-
|
810 |
-
|
811 |
-
|
812 |
-
|
813 |
-
|
814 |
-
|
815 |
-
|
816 |
-
|
817 |
-
|
818 |
-
if token.dep_ == "nsubj":
|
819 |
-
subject = token.text
|
820 |
-
elif token.dep_ == "dobj":
|
821 |
-
obj = token.text
|
822 |
-
print("E6", subject, obj)
|
823 |
-
if subject in rs and obj in rs:
|
824 |
-
print(rs, subject, obj, "Other")
|
825 |
-
output.append("Other")
|
826 |
-
elif subject in rs:
|
827 |
-
print(rs, subject, obj, "VP")
|
828 |
-
output.append("VP")
|
829 |
-
elif obj in rs:
|
830 |
-
print(rs, subject, obj, "NP")
|
831 |
-
output.append("NP")
|
832 |
-
else:
|
833 |
-
print(rs, subject, obj, "Other")
|
834 |
-
output.append("Other")
|
835 |
-
|
836 |
-
|
837 |
-
|
838 |
-
|
839 |
-
'''Exp7'''
|
840 |
-
elif summaries_df["Experiment"][i] == "E7":
|
841 |
-
# rs = summaries_df["Response"][i].strip().lower()
|
842 |
-
print("E7",rs)
|
843 |
-
if rs == "no":
|
844 |
-
output.append("0")
|
845 |
-
elif rs == "yes":
|
846 |
-
output.append("1")
|
847 |
-
else:
|
848 |
-
output.append("Other")
|
849 |
-
|
850 |
-
'''Exp8'''
|
851 |
-
elif summaries_df["Experiment"][i] == "E8":
|
852 |
-
# rs = summaries_df["Response"][i].strip()
|
853 |
-
|
854 |
-
if "something is wrong with the question" in rs:
|
855 |
-
output.append("1")
|
856 |
-
else:
|
857 |
-
output.append("0")
|
858 |
-
|
859 |
-
'''Exp9'''
|
860 |
-
elif summaries_df["Experiment"][i] == "E9":
|
861 |
-
male, female = 0, 0
|
862 |
-
|
863 |
-
# rs = summaries_df["Response"][i].strip()
|
864 |
-
if "because" in rs:
|
865 |
-
rs = rs.replace("because because","because").split("because")[1]
|
866 |
-
else:
|
867 |
-
rs = rs
|
868 |
-
condition = summaries_df["Factor 2"][i].strip()
|
869 |
-
rs = rs.split(" ")
|
870 |
-
for w in rs:
|
871 |
-
if w in male_keyword and female != 1:
|
872 |
-
male = 1
|
873 |
-
break
|
874 |
-
if w in female_keyword and male != 1:
|
875 |
-
female = 1
|
876 |
-
break
|
877 |
-
print("E9", "condition", condition, "male", male, "female", female)
|
878 |
-
if male == 0 and female == 0:
|
879 |
-
output.append('Other')
|
880 |
-
else:
|
881 |
-
if male == 1 and female==0:
|
882 |
-
if condition == "MF":
|
883 |
-
output.append("Subject")
|
884 |
-
elif condition == "FM":
|
885 |
-
output.append("Object")
|
886 |
-
else:
|
887 |
-
output.append("Other")
|
888 |
-
elif female == 1 and male ==0:
|
889 |
-
if condition == "MF":
|
890 |
-
output.append("Object")
|
891 |
-
elif condition == "FM":
|
892 |
-
output.append("Subject")
|
893 |
else:
|
894 |
output.append("Other")
|
895 |
|
896 |
-
'''Exp10'''
|
897 |
-
elif summaries_df["Experiment"][i] == "E10":
|
898 |
-
# rs = summaries_df["Response"][i].strip()
|
899 |
-
if rs == "yes":
|
900 |
-
output.append("1")
|
901 |
-
else:
|
902 |
-
output.append("0")
|
903 |
-
else:
|
904 |
-
print("can;t find the Exp:", summaries_df["Experiment"][i])
|
905 |
-
output.append("NA")
|
906 |
-
# print(output)
|
907 |
-
# exit()
|
908 |
-
'''human'''
|
909 |
-
self.data = pd.DataFrame(list(zip(summaries_df["Experiment"], summaries_df["Question_ID"], summaries_df["Item"], summaries_df["Response"], summaries_df["Factor 2"], summaries_df["Stimuli 1"], summaries_df["Coding"], output)),
|
910 |
-
columns=["Experiment", "Question_ID", "Item", "Response", "Factor 2", "Simulate 1","Original_Coding","Coding"])
|
911 |
-
# '''LLM'''
|
912 |
-
# self.data = pd.DataFrame(list(zip(summaries_df["Experiment"], summaries_df["Question_ID"], summaries_df["Item"], summaries_df["Response"], summaries_df["Factor 2"], summaries_df["Stimuli 1"], output)),
|
913 |
-
# columns=["Experiment", "Question_ID", "Item", "Response", "Factor 2", "Simulate 1","Coding"])
|
914 |
-
print(self.data.head())
|
915 |
-
|
916 |
-
return self.data
|
917 |
-
def code_results_llm(self, summaries_df):
|
918 |
-
'''code results from LLM's response'''
|
919 |
-
output = []
|
920 |
-
'''database for Exp4'''
|
921 |
-
item4 = pd.read_csv(envs.ITEM_4_DATA)
|
922 |
-
wordpair2code = {}
|
923 |
-
for j in range(len(item4['Coding'])):
|
924 |
-
wordpair2code[item4['Pair'][j]] = item4['Coding'][j]
|
925 |
-
'''verb for Exp5'''
|
926 |
-
item5 = pd.read_csv(envs.ITEM_5_DATA)
|
927 |
-
# item corresponding to verb, same item id corresponding to verb pair
|
928 |
-
item2verb2 = {}
|
929 |
-
item2verb1 = {}
|
930 |
-
|
931 |
-
Stimuli1, Stimuli2 = {}, {}
|
932 |
-
for j in range(len(item5['Item'])):
|
933 |
-
item2verb1[item5['Item'][j]] = item5['Verb1'][j]
|
934 |
-
item2verb2[item5['Item'][j]] = item5['Verb2'][j]
|
935 |
-
Stimuli1[item5['ID'][j]] = item5['Stimuli-1'][j]
|
936 |
-
Stimuli2[item5['ID'][j]] = item5['Stimuli-2'][j]
|
937 |
-
|
938 |
-
|
939 |
-
male_keyword = ["he", "his", "himself"]
|
940 |
-
female_keyword = ["she", "her", "herself"]
|
941 |
-
print(len(summaries_df["Experiment"]))
|
942 |
-
for i in range(len(summaries_df["Experiment"])):
|
943 |
-
# vote_1_1, vote_1_2, vote_1_3 = 0, 0, 0
|
944 |
-
# print()
|
945 |
-
# data cleaning
|
946 |
-
if pd.isna(summaries_df["Response"][i]):
|
947 |
-
output.append("Other")
|
948 |
-
continue
|
949 |
-
rs = summaries_df["Response"][i].strip().lower()
|
950 |
-
sentences = rs.split('\n')
|
951 |
-
sentences = [sentence.split(':', 1)[-1].strip() if ':' in sentence else sentence
|
952 |
-
for sentence in sentences]
|
953 |
-
rs = [sentence.strip() for sentence in sentences if sentence.strip()]
|
954 |
-
rs = '\n'.join(rs)
|
955 |
-
rs = rs.replace('[', '').replace(']','').replace('.','')
|
956 |
-
'''Exp1'''
|
957 |
-
# the period and comma will affect the result
|
958 |
-
if summaries_df["Experiment"][i] == "E1":
|
959 |
-
print("E1", rs)
|
960 |
-
rs = rs.replace('"', '') # Remove any unnecessary quotation marks
|
961 |
-
rs_cleaned = rs.replace(',', '') # Remove periods and commas
|
962 |
-
|
963 |
-
# Use 'contains' instead of 'equals' for keyword matching to avoid issues caused by punctuation
|
964 |
-
if "round" in rs_cleaned:
|
965 |
-
output.append("Round")
|
966 |
-
elif "spiky" in rs_cleaned:
|
967 |
-
output.append("Spiky")
|
968 |
-
else:
|
969 |
-
output.append("Other")
|
970 |
-
|
971 |
-
|
972 |
-
'''Exp2'''
|
973 |
-
|
974 |
-
elif summaries_df["Experiment"][i] == "E2":
|
975 |
-
rs = rs.split(' ')
|
976 |
-
print("E2", rs)
|
977 |
-
male, female = 0, 0
|
978 |
-
for word in rs:
|
979 |
-
if word in female_keyword and male == 0:
|
980 |
-
female = 1
|
981 |
-
output.append("Female")
|
982 |
-
break
|
983 |
-
if word in male_keyword and female == 0:
|
984 |
-
male = 1
|
985 |
-
output.append("Male")
|
986 |
-
break
|
987 |
-
if male == 0 and female == 0 :
|
988 |
-
output.append("Other")
|
989 |
-
|
990 |
-
'''Exp3'''
|
991 |
-
elif summaries_df["Experiment"][i] == "E3":
|
992 |
-
# rs = summaries_df["Response"][i].strip()
|
993 |
-
print("E3", rs)
|
994 |
-
rs = rs.replace('"', '').lower().replace(".","")
|
995 |
-
pair = summaries_df["Factor 2"][i]
|
996 |
-
word1, word2 = pair.split('_')
|
997 |
-
|
998 |
-
if rs == word1:
|
999 |
-
if len(word1) > len(word2):
|
1000 |
-
output.append("Long")
|
1001 |
-
else:
|
1002 |
-
output.append("Short")
|
1003 |
-
elif rs == word2:
|
1004 |
-
if len(word1) > len(word2):
|
1005 |
-
output.append("Short")
|
1006 |
-
else:
|
1007 |
-
output.append("Long")
|
1008 |
-
else:
|
1009 |
-
output.append("Other")
|
1010 |
-
|
1011 |
'''Exp4'''
|
1012 |
|
1013 |
-
elif
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1014 |
try:
|
1015 |
meaning_word = rs.split(";")[4].replace(" ", '')
|
1016 |
except IndexError:
|
1017 |
-
|
1018 |
-
|
|
|
|
|
|
|
1019 |
except Exception as e:
|
1020 |
print(f"Unexpected error: {e}")
|
1021 |
output.append("Other")
|
1022 |
continue
|
1023 |
-
|
1024 |
-
|
1025 |
-
target = summaries_df["Factor 2"][i].strip().lower()
|
1026 |
pair = target + "_" + meaning_word
|
1027 |
-
print("E4:", pair)
|
1028 |
|
1029 |
if pair in wordpair2code.keys():
|
1030 |
output.append(wordpair2code[pair])
|
@@ -1032,31 +643,30 @@ class EvaluationModel:
|
|
1032 |
output.append("Other")
|
1033 |
|
1034 |
'''Exp5'''
|
1035 |
-
elif
|
1036 |
-
# sentence =
|
1037 |
-
item_id =
|
1038 |
-
question_id =
|
1039 |
|
1040 |
sti1, sti2 = "", ""
|
1041 |
|
1042 |
-
if
|
1043 |
sti1 = Stimuli1[question_id[0:-1]].lower().replace("...", "")
|
1044 |
-
sti2 = Stimuli2[question_id[0:-1]].lower().replace("...", "")
|
1045 |
verb = item2verb1[item_id].lower()
|
1046 |
|
1047 |
sentence = sti1 + " " + rs.replace(sti1, "")
|
1048 |
-
print("E5", verb, sentence)
|
1049 |
-
if
|
1050 |
-
sti1 = Stimuli1[question_id].lower().replace("...", "")
|
1051 |
-
|
1052 |
sti2 = Stimuli2[question_id].lower().replace("...", "")
|
1053 |
|
1054 |
verb = item2verb2[item_id].lower()
|
1055 |
-
sentence = sti2
|
1056 |
-
print("E5", verb, sentence)
|
1057 |
|
1058 |
-
|
1059 |
-
doc = nlp1(sentence.replace(" "," "))
|
1060 |
# print(doc)
|
1061 |
# print()
|
1062 |
verb_token = None
|
@@ -1066,102 +676,94 @@ class EvaluationModel:
|
|
1066 |
verb_token = token
|
1067 |
break
|
1068 |
# exit()
|
1069 |
-
|
1070 |
-
|
1071 |
-
|
1072 |
-
|
1073 |
-
pobj, dative = None, None
|
1074 |
-
# print(verb_token.children)
|
1075 |
-
# exit()
|
1076 |
for child in verb_token.children:
|
1077 |
-
print(child)
|
1078 |
-
if (child.dep_ == 'dative' and child.pos_ == "ADP") or (
|
|
|
1079 |
pobj = child.text
|
1080 |
if child.dep_ == 'dative':
|
1081 |
dative = child.text
|
1082 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1083 |
# exit()
|
|
|
|
|
1084 |
|
1085 |
-
if pobj:
|
1086 |
-
output.append("PO")
|
1087 |
-
elif dative:
|
1088 |
-
output.append("DO")
|
1089 |
-
else:
|
1090 |
-
print("Other", sentence, pobj, dative)
|
1091 |
-
# exit()
|
1092 |
-
output.append("Other")
|
1093 |
|
1094 |
'''Exp6'''
|
1095 |
|
1096 |
-
elif
|
1097 |
-
sentence =
|
1098 |
-
print("E6", sentence)
|
1099 |
doc = nlp1(sentence)
|
1100 |
subject = "None"
|
1101 |
obj = "None"
|
|
|
|
|
1102 |
for token in doc:
|
1103 |
if token.dep_ == "nsubj":
|
1104 |
subject = token.text
|
1105 |
elif token.dep_ == "dobj":
|
1106 |
obj = token.text
|
1107 |
-
print("E6", subject, obj)
|
1108 |
if subject in rs and obj in rs:
|
1109 |
-
print(rs, subject, obj, "Other")
|
1110 |
output.append("Other")
|
1111 |
elif subject in rs:
|
1112 |
-
print(rs, subject, obj, "VP")
|
1113 |
output.append("VP")
|
1114 |
elif obj in rs:
|
1115 |
-
print(rs, subject, obj, "NP")
|
1116 |
output.append("NP")
|
1117 |
else:
|
1118 |
-
print(rs, subject, obj, "Other")
|
1119 |
output.append("Other")
|
1120 |
|
1121 |
-
|
1122 |
-
|
1123 |
-
|
1124 |
'''Exp7'''
|
1125 |
-
elif
|
1126 |
-
|
1127 |
rs = rs.replace(".", "").replace(",", "").lower()
|
1128 |
-
print("E7", rs)
|
1129 |
-
|
1130 |
-
|
1131 |
-
|
1132 |
-
|
1133 |
-
|
1134 |
-
|
1135 |
-
|
1136 |
-
output.append("0")
|
1137 |
-
found = True
|
1138 |
-
break
|
1139 |
-
elif word == "yes":
|
1140 |
-
output.append("1")
|
1141 |
-
found = True
|
1142 |
-
break
|
1143 |
-
if not found:
|
1144 |
output.append("Other")
|
1145 |
|
1146 |
'''Exp8'''
|
1147 |
-
elif
|
1148 |
-
# rs =
|
1149 |
-
print("E8",rs)
|
1150 |
if "something is wrong with the question" in rs:
|
1151 |
output.append("1")
|
1152 |
else:
|
1153 |
output.append("0")
|
1154 |
|
1155 |
'''Exp9'''
|
1156 |
-
elif
|
1157 |
male, female = 0, 0
|
1158 |
|
1159 |
-
# rs =
|
1160 |
if "because" in rs:
|
1161 |
-
rs = rs.replace("because because","because").split("because")[1]
|
1162 |
else:
|
1163 |
rs = rs
|
1164 |
-
condition =
|
1165 |
rs = rs.split(" ")
|
1166 |
for w in rs:
|
1167 |
if w in male_keyword and female != 1:
|
@@ -1170,18 +772,18 @@ class EvaluationModel:
|
|
1170 |
if w in female_keyword and male != 1:
|
1171 |
female = 1
|
1172 |
break
|
1173 |
-
print("E9", "condition", condition, "male", male, "female", female)
|
1174 |
-
if
|
1175 |
output.append('Other')
|
1176 |
else:
|
1177 |
-
if male == 1 and female==0:
|
1178 |
if condition == "MF":
|
1179 |
output.append("Subject")
|
1180 |
elif condition == "FM":
|
1181 |
output.append("Object")
|
1182 |
else:
|
1183 |
output.append("Other")
|
1184 |
-
elif female == 1 and male ==0:
|
1185 |
if condition == "MF":
|
1186 |
output.append("Object")
|
1187 |
elif condition == "FM":
|
@@ -1190,28 +792,28 @@ class EvaluationModel:
|
|
1190 |
output.append("Other")
|
1191 |
|
1192 |
'''Exp10'''
|
1193 |
-
elif
|
1194 |
-
#
|
1195 |
-
rs = rs.replace(".", "")
|
1196 |
-
|
1197 |
-
|
1198 |
-
# Check if the response contains "yes"
|
1199 |
-
if "yes" in rs:
|
1200 |
output.append("1")
|
1201 |
else:
|
1202 |
output.append("0")
|
1203 |
else:
|
1204 |
-
print("can
|
1205 |
output.append("NA")
|
1206 |
# print(output)
|
1207 |
# exit()
|
1208 |
'''human'''
|
1209 |
-
# self.data = pd.DataFrame(list(zip(
|
1210 |
# columns=["Experiment", "Question_ID", "Item", "Response", "Factor 2", "Simulate 1","Original_Coding","Coding"])
|
1211 |
'''LLM'''
|
1212 |
-
|
1213 |
-
|
1214 |
-
|
|
|
|
|
|
|
1215 |
|
1216 |
return self.data
|
1217 |
|
@@ -1332,55 +934,8 @@ class EvaluationModel:
|
|
1332 |
|
1333 |
return all_results
|
1334 |
|
1335 |
-
|
1336 |
-
|
1337 |
-
# # Extract the relevant columns for JS divergence calculation
|
1338 |
-
# human_responses = human_df[['Question_ID', 'Coding']]
|
1339 |
-
# llm_responses = llm_df[['Question_ID', 'Coding']]
|
1340 |
-
#
|
1341 |
-
# # Get unique Question_IDs present in both datasets
|
1342 |
-
# common_question_ids = set(human_responses['Question_ID']).intersection(set(llm_responses['Question_ID']))
|
1343 |
-
#
|
1344 |
-
# # Initialize a list to store JS divergence for each Question_ID
|
1345 |
-
# js_divergence_list = []
|
1346 |
-
# js_divergence ={}
|
1347 |
-
#
|
1348 |
-
# # Calculate JS divergence for each common Question_ID
|
1349 |
-
# for q_id in common_question_ids:
|
1350 |
-
# # Get response distributions for the current Question_ID in both datasets
|
1351 |
-
# human_dist = human_responses[human_responses['Question_ID'] == q_id]['Coding'].value_counts(normalize=True)
|
1352 |
-
# llm_dist = llm_responses[llm_responses['Question_ID'] == q_id]['Coding'].value_counts(normalize=True)
|
1353 |
-
#
|
1354 |
-
# # Reindex the distributions to have the same index, filling missing values with 0
|
1355 |
-
# all_responses = set(human_dist.index).union(set(llm_dist.index))
|
1356 |
-
# human_dist = human_dist.reindex(all_responses, fill_value=0)
|
1357 |
-
# llm_dist = llm_dist.reindex(all_responses, fill_value=0)
|
1358 |
-
#
|
1359 |
-
# # Calculate JS divergence and add to the list
|
1360 |
-
# js_div = jensenshannon(human_dist, llm_dist, base=2)
|
1361 |
-
# experiment_id = q_id.split('_')[1]
|
1362 |
-
# if experiment_id not in js_divergence:
|
1363 |
-
# js_divergence[experiment_id] = []
|
1364 |
-
# js_divergence[experiment_id].append(js_div)
|
1365 |
-
#
|
1366 |
-
# js_divergence_list.append(js_div)
|
1367 |
-
# #js_divergence[q_id] = js_div
|
1368 |
-
#
|
1369 |
-
#
|
1370 |
-
#
|
1371 |
-
# # Calculate the average JS divergence
|
1372 |
-
# # JS per experiment
|
1373 |
-
# avg_js_divergence_per_experiment = {exp: 1- np.nanmean(divs) for exp, divs in js_divergence.items()}
|
1374 |
-
# print(avg_js_divergence_per_experiment)
|
1375 |
-
#
|
1376 |
-
# # JS overall
|
1377 |
-
# avg_js_divergence = 1 - np.nanmean(js_divergence_list)
|
1378 |
-
# print("avg_js_divergence:", avg_js_divergence)
|
1379 |
-
#
|
1380 |
-
# return avg_js_divergence
|
1381 |
-
|
1382 |
-
|
1383 |
-
def evaluate_humanlike(self, summaries_df: object, human_data_path: object, result_save_path: object) -> object:
|
1384 |
'''
|
1385 |
evaluate humanlike score
|
1386 |
1. code the result
|
@@ -1401,7 +956,7 @@ class EvaluationModel:
|
|
1401 |
|
1402 |
'''coding llm data'''
|
1403 |
save_path = result_save_path.replace('.csv','_coding.csv')
|
1404 |
-
self.llm_df = self.code_results_llm(
|
1405 |
|
1406 |
|
1407 |
|
@@ -1412,7 +967,7 @@ class EvaluationModel:
|
|
1412 |
self.llm_df.to_csv(fpath)
|
1413 |
|
1414 |
envs.API.upload_file(
|
1415 |
-
path_or_fileobj=
|
1416 |
path_in_repo=f"{save_path.replace('generation_results/','')}",#
|
1417 |
repo_id=envs.RESULTS_REPO,
|
1418 |
repo_type="dataset",
|
@@ -1426,111 +981,3 @@ class EvaluationModel:
|
|
1426 |
|
1427 |
|
1428 |
|
1429 |
-
|
1430 |
-
|
1431 |
-
|
1432 |
-
|
1433 |
-
|
1434 |
-
|
1435 |
-
|
1436 |
-
|
1437 |
-
|
1438 |
-
|
1439 |
-
|
1440 |
-
|
1441 |
-
|
1442 |
-
|
1443 |
-
|
1444 |
-
def evaluate_hallucination(self, summaries_df):
|
1445 |
-
"""
|
1446 |
-
Evaluate the hallucination rate in summaries. Updates the 'scores' attribute
|
1447 |
-
of the instance with the computed scores.
|
1448 |
-
|
1449 |
-
Args:
|
1450 |
-
summaries_df (DataFrame): DataFrame containing source docs and summaries.
|
1451 |
-
|
1452 |
-
Returns:
|
1453 |
-
list: List of hallucination scores. Also updates the 'scores' attribute of the instance.
|
1454 |
-
"""
|
1455 |
-
hem_scores = []
|
1456 |
-
sources = []
|
1457 |
-
summaries = []
|
1458 |
-
source_summary_pairs = util.create_pairs(summaries_df)
|
1459 |
-
'''Evaluate the model's results'''
|
1460 |
-
for doc, summary in tqdm(source_summary_pairs, desc="Evaluating Humanlikeness"):
|
1461 |
-
if util.is_summary_valid(summary):
|
1462 |
-
try:
|
1463 |
-
summary = summary.replace('<bos>','').replace('<eos>','')
|
1464 |
-
score = self.model.predict([doc, summary])# [0]
|
1465 |
-
if not isinstance(score, float):
|
1466 |
-
try:
|
1467 |
-
score = score.item()
|
1468 |
-
except:
|
1469 |
-
logging.warning(f"Score type mismatch: Expected float, got {type(score)}.")
|
1470 |
-
continue
|
1471 |
-
hem_scores.append(score)
|
1472 |
-
sources.append(doc)
|
1473 |
-
summaries.append(summary)
|
1474 |
-
except Exception as e:
|
1475 |
-
logging.error(f"Error while running HEM: {e}")
|
1476 |
-
raise
|
1477 |
-
|
1478 |
-
self.scores = hem_scores
|
1479 |
-
eval_results = {'source': sources, 'summary': summaries, 'HEM scores': hem_scores}
|
1480 |
-
return hem_scores, eval_results
|
1481 |
-
# for doc, summary in tqdm(source_summary_pairs, desc="Evaluating hallucinations"):
|
1482 |
-
# if util.is_summary_valid(summary):
|
1483 |
-
# try:
|
1484 |
-
# # summary_pieces = summary.split('\n')
|
1485 |
-
# # summary = summary_pieces[0] if len(summary_pieces[0].strip()) > 0 else summary_pieces[1]
|
1486 |
-
# summary = summary.replace('<bos>','').replace('<eos>','')
|
1487 |
-
# # print([doc, summary])
|
1488 |
-
# # print(self.model.predict([doc, summary]))
|
1489 |
-
# score = self.model.predict([doc, summary])# [0]
|
1490 |
-
# if not isinstance(score, float):
|
1491 |
-
# try:
|
1492 |
-
# score = score.item()
|
1493 |
-
# except:
|
1494 |
-
# logging.warning(f"Score type mismatch: Expected float, got {type(score)}.")
|
1495 |
-
# continue
|
1496 |
-
# hem_scores.append(score)
|
1497 |
-
# sources.append(doc)
|
1498 |
-
# summaries.append(summary)
|
1499 |
-
# except Exception as e:
|
1500 |
-
# logging.error(f"Error while running HEM: {e}")
|
1501 |
-
# raise
|
1502 |
-
|
1503 |
-
# self.scores = hem_scores
|
1504 |
-
# eval_results = {'source': sources, 'summary': summaries, 'HEM scores': hem_scores}
|
1505 |
-
# return hem_scores, eval_results
|
1506 |
-
|
1507 |
-
|
1508 |
-
def compute_factual_consistency_rate(self, threshold=0.5):
|
1509 |
-
"""
|
1510 |
-
Compute the factual consistency rate of the evaluated summaries based on
|
1511 |
-
the previously calculated scores. This method relies on the 'scores'
|
1512 |
-
attribute being populated, typically via the 'evaluate_hallucination' method.
|
1513 |
-
|
1514 |
-
Returns:
|
1515 |
-
float: Factual Consistency Rate. Also updates the 'factual_consistency_rate'
|
1516 |
-
and 'hallucination_rate' attributes of the instance.
|
1517 |
-
|
1518 |
-
Raises:
|
1519 |
-
ValueError: If scores have not been calculated prior to calling this method.
|
1520 |
-
"""
|
1521 |
-
if not self.scores:
|
1522 |
-
error_msg = "Scores not calculated. Call evaluate_hallucination() first."
|
1523 |
-
logging.error(error_msg)
|
1524 |
-
raise ValueError(error_msg)
|
1525 |
-
|
1526 |
-
# Use threshold of 0.5 to compute factual_consistency_rate
|
1527 |
-
num_above_threshold = sum(score >= threshold for score in self.scores)
|
1528 |
-
num_total = len(self.scores)
|
1529 |
-
|
1530 |
-
if not num_total:
|
1531 |
-
raise ValueError("No scores available to compute factual consistency rate.")
|
1532 |
-
|
1533 |
-
self.factual_consistency_rate = (num_above_threshold / num_total) * 100
|
1534 |
-
self.hallucination_rate = 100 - self.factual_consistency_rate
|
1535 |
-
|
1536 |
-
return self.factual_consistency_rate
|
|
|
6 |
import requests
|
7 |
import json
|
8 |
|
9 |
+
# import numpy as np
|
10 |
import pandas as pd
|
11 |
import spacy
|
12 |
from sentence_transformers import CrossEncoder
|
|
|
43 |
# Load spacy model for word tokenization
|
44 |
# nlp = spacy.load("en_core_web_sm")
|
45 |
try:
|
46 |
+
nlp1 = spacy.load("en_core_web_trf")
|
47 |
except OSError:
|
48 |
print("无法加载模型,继续执行其他处理。")
|
49 |
|
|
|
55 |
|
56 |
|
57 |
|
58 |
class ModelLoadingException(Exception):
|
59 |
"""Exception raised for errors in loading a model.
|
60 |
|
|
|
69 |
super().__init__(f"{messages} id={model_id} revision={revision}")
|
70 |
|
71 |
|
72 |
+
class ResponseGenerator:
|
73 |
+
"""A class to generate responses using a causal language model.
|
74 |
|
75 |
Attributes:
|
76 |
model (str): huggingface/{model_id}
|
77 |
api_base (str): https://api-inference.huggingface.co/models/{model_id}
|
78 |
+
responses_df (DataFrame): DataFrame to store generated responses.
|
79 |
revision (str): Model revision.
|
80 |
+
avg_length (float): Average length of responses.
|
81 |
+
answer_rate (float): Rate of non-empty responses.
|
82 |
"""
|
83 |
|
84 |
def __init__(self, model_id, revision):
|
85 |
"""
|
86 |
+
Initializes the ResponseGenerator with a model.
|
87 |
|
88 |
Args:
|
89 |
model_id (str): Identifier for the model.
|
|
|
92 |
self.model_id = model_id
|
93 |
self.model = f"huggingface/{model_id}"
|
94 |
self.api_base = f"https://api-inference.huggingface.co/models/{model_id}"
|
95 |
+
self.responses_df = pd.DataFrame()
|
96 |
self.revision = revision
|
97 |
self.avg_length = None
|
98 |
self.answer_rate = None
|
99 |
self.exceptions = None
|
100 |
self.local_model = None
|
101 |
|
102 |
+
def generate_response(self, dataset, df_prompt, save_path=None):
|
103 |
+
"""Generate responses for a given DataFrame of source docs.
|
|
|
104 |
Args:
|
105 |
dataset (DataFrame): DataFrame containing source docs.
|
106 |
|
107 |
Returns:
|
108 |
+
responses_df (DataFrame): Generated responses by the model.
|
109 |
"""
|
110 |
exceptions = []
|
111 |
if (save_path is not None) and os.path.exists(save_path):
|
112 |
'''If the file already exists, read the previously generated test responses'''
|
113 |
+
self.responses_df = pd.read_csv(save_path)
|
114 |
+
# print(self.responses_df['Experiment'])
|
115 |
|
116 |
+
print(f'Loaded generated responses from {save_path}')
|
117 |
else:
|
118 |
'''If the test file does not exist, call the specified model to generate it'''
|
119 |
# prompt = {}
|
|
|
176 |
while True:
|
177 |
try:
|
178 |
'''Call the model'''
|
179 |
+
print(self.model_id.lower(),'-',ID,'-',j,'-',ii)
|
180 |
|
181 |
+
_response = self.send_request(system_prompt, _user_prompt)
|
182 |
# print(f"Finish index {index}")
|
183 |
break
|
184 |
except Exception as e:
|
|
|
204 |
print(f"Error at index {i}: {e}")
|
205 |
time.sleep(wait_time)
|
206 |
try:
|
207 |
+
_response = self.send_request(system_prompt, _user_prompt)
|
208 |
break
|
209 |
except Exception as ee:
|
210 |
exceptions.append(ee)
|
|
|
219 |
break
|
220 |
if i == 5:
|
221 |
#print(_response)
|
222 |
+
# For E5, the responses might be in the following formats:
|
223 |
+
# "Sure\n\nThe first sentence of the response\n\nThe second sentence of the response"
|
224 |
+
# "The first sentence of the response\n\nThe second sentence of the response"
|
225 |
+
# "XXX: The first sentence of the response\n\nXXX: The second sentence of the response"
|
226 |
+
# "Sure\n\nXXX: The first sentence of the response\n\nXXX: The second sentence of the response"
|
227 |
+
# "Sure\n\nThe first sentence of the response\n\nThe second sentence of the response\n\n"
|
228 |
|
229 |
def extract_responses(text, trigger_words=None):
|
230 |
if trigger_words is None:
|
231 |
trigger_words = ["sure", "okay", "yes"]
|
232 |
|
233 |
try:
|
234 |
+
# Split the text into sentences
|
235 |
sentences = text.split('\n')
|
236 |
+
# Remove empty sentences
|
237 |
sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
|
238 |
+
# Remove the first sentence if it has a : in it,
|
239 |
sentences = [sentence.split(':', 1)[-1].strip() if ':' in sentence else sentence for
|
240 |
sentence in sentences]
|
241 |
+
# Remove empty sentences
|
242 |
+
sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
|
243 |
+
# Remove the first sentence if it is a trigger word
|
244 |
+
if any(sentences[0].lower().startswith(word) for word in trigger_words) and len(
|
245 |
+
sentences) > 2:
|
246 |
_response1 = sentences[1].strip() if len(sentences) > 1 else None
|
247 |
_response2 = sentences[2].strip() if len(sentences) > 2 else None
|
248 |
else:
|
249 |
_response1 = sentences[0].strip() if len(sentences) > 0 else None
|
250 |
_response2 = sentences[1].strip() if len(sentences) > 1 else None
|
251 |
|
252 |
+
|
253 |
except Exception as e:
|
254 |
print(f"Error occurred: {e}")
|
255 |
_response1, _response2 = None, None
|
256 |
|
257 |
+
print(_response1), print(_response2)
|
258 |
|
259 |
return _response1, _response2
|
260 |
|
261 |
_response1, _response2 = extract_responses(_response)
|
262 |
|
263 |
Experiment_ID.append(ID)
|
264 |
Questions_ID.append(q_column[j])
|
|
|
293 |
Stimuli_1.append(Stimuli_1_column[j])
|
294 |
Item_ID.append(Item_column[j])
|
295 |
Condition.append(Condition_column[j])
|
296 |
|
297 |
# Sleep to prevent hitting rate limits too frequently
|
298 |
time.sleep(1)
|
299 |
|
300 |
+
self.responses_df = pd.DataFrame(list(zip(Experiment_ID, Questions_ID, Item_ID, Condition, User_prompt, Response, Factor_2, Stimuli_1)),
|
301 |
+
columns=["Experiment", "Question_ID", "Item", "Condition", "User_prompt", "Response","Factor 2","Stimuli 1"])
|
302 |
|
303 |
if save_path is not None:
|
304 |
+
print(f'Save responses to {save_path}')
|
305 |
fpath = Path(save_path)
|
306 |
fpath.parent.mkdir(parents=True, exist_ok=True)
|
307 |
+
self.responses_df.to_csv(fpath)
|
308 |
|
309 |
self.exceptions = exceptions
|
310 |
# self._compute_avg_length()
|
311 |
# self._compute_answer_rate()
|
312 |
|
313 |
+
return self.responses_df
|
314 |
|
315 |
+
def send_request(self, system_prompt: str, user_prompt: str):
|
316 |
# Using Together AI API
|
317 |
using_together_api = False
|
318 |
together_ai_api_models = ['mixtral', 'dbrx', 'wizardlm']
|
|
|
334 |
"model": self.model_id,
|
335 |
# "max_tokens": 4096,
|
336 |
'max_new_tokens': 100,
|
337 |
+
# "a": 0.0,
|
338 |
# 'repetition_penalty': 1.1 if 'mixtral' in self.model_id.lower() else 1
|
339 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
340 |
payload['messages'] = [{"role": "system", "content": system_prompt},
|
341 |
{"role": "user", "content": user_prompt}]
|
342 |
headers = {
|
|
|
434 |
continue
|
435 |
|
436 |
raise Exception("All tokens failed.")
|
437 |
+
|
438 |
elif 'gemini' in self.model_id.lower():
|
439 |
genai.configure(api_key=os.getenv('GOOGLE_AI_API_KEY'))
|
440 |
generation_config = {
|
441 |
+
# "temperature": 0,
|
442 |
+
# "top_p": 0.95, # cannot change
|
443 |
+
# "top_k": 0,
|
444 |
"max_output_tokens": 100,
|
445 |
# "response_mime_type": "application/json",
|
446 |
}
|
|
|
492 |
# Using local model
|
493 |
|
494 |
|
495 |
|
496 |
|
497 |
class EvaluationModel:
|
498 |
+
"""A class to evaluate generated responses.
|
499 |
|
500 |
Attributes:
|
501 |
model (CrossEncoder): The evaluation model.
|
502 |
+
scores (list): List of scores for the responses.
|
503 |
+
humanlike_score (float): Human-likeness score
|
504 |
+
|
505 |
+
|
506 |
"""
|
507 |
|
508 |
def __init__(self, model_path):
|
509 |
"""
|
510 |
+
Initializes the EvaluationModel.
|
|
|
|
|
|
|
511 |
"""
|
512 |
+
# self.model = load_evaluation_model(model_path)
|
513 |
self.scores = []
|
|
|
|
|
514 |
self.humanlike_score = None
|
515 |
|
516 |
+
def code_results_llm(self, responses_df):
|
517 |
'''code results from LLM's response'''
|
518 |
output = []
|
519 |
'''database for Exp4'''
|
|
|
534 |
Stimuli1[item5['ID'][j]] = item5['Stimuli-1'][j]
|
535 |
Stimuli2[item5['ID'][j]] = item5['Stimuli-2'][j]
|
536 |
|
|
|
537 |
male_keyword = ["he", "his", "himself"]
|
538 |
female_keyword = ["she", "her", "herself"]
|
539 |
+
#print(len(responses_df["Experiment"]))
|
540 |
+
for i in range(len(responses_df["Experiment"])):
|
541 |
+
print(i, "/", len(responses_df["Experiment"]))
|
542 |
# vote_1_1, vote_1_2, vote_1_3 = 0, 0, 0
|
543 |
# print()
|
544 |
+
if pd.isna(responses_df["Response"][i]):
|
545 |
output.append("Other")
|
546 |
continue
|
547 |
+
rs = responses_df["Response"][i].strip().lower()
|
548 |
+
rs = rs.replace('"', '').replace(" ", " ").replace('.', '')
|
549 |
+
lines = rs.split("\n")
|
550 |
+
filtered_lines = [line for line in lines if line and not (line.endswith(":") or line.endswith(":"))]
|
551 |
+
filtered_lines = [r.split(':', 1)[-1].strip() if ':' in r else r for
|
552 |
+
r in filtered_lines]
|
553 |
+
rs = "\n".join(filtered_lines)
|
554 |
+
rs = rs.strip()
|
555 |
'''Exp1'''
|
556 |
+
if responses_df["Experiment"][i] == "E1":
|
557 |
+
#print("E1", rs)
|
|
|
|
|
558 |
if rs == "round":
|
559 |
# vote_1_1 += 1
|
560 |
output.append("Round")
|
|
|
563 |
else:
|
564 |
output.append("Other")
|
565 |
|
|
|
566 |
'''Exp2'''
|
567 |
+
|
568 |
+
elif responses_df["Experiment"][i] == "E2":
|
569 |
+
# rs = responses_df["Response"][i].strip()
|
570 |
rs = rs.split(' ')
|
571 |
+
#print("E2", rs)
|
572 |
male, female = 0, 0
|
573 |
for word in rs:
|
574 |
if word in female_keyword and male == 0:
|
|
|
579 |
male = 1
|
580 |
output.append("Male")
|
581 |
break
|
582 |
+
if male == 0 and female == 0:
|
583 |
output.append("Other")
|
584 |
|
585 |
'''Exp3'''
|
586 |
+
elif responses_df["Experiment"][i] == "E3":
|
587 |
+
# rs = responses_df["Response"][i].strip()
|
588 |
+
#print("E3", rs)
|
589 |
+
pair = responses_df["Factor 2"][i]
|
590 |
+
word1, word2 = pair.replace(".", "").split('_')
|
591 |
|
592 |
+
if responses_df["Item"][i] == 12:
|
593 |
output.append("Other")
|
|
|
594 |
else:
|
595 |
+
words = rs.split() # split the response into words
|
596 |
+
output = []
|
597 |
+
if any(word == word1 for word in words) and any(word == word2 for word in words):
|
598 |
output.append("Other")
|
599 |
+
else:
|
600 |
+
if any(word.lower() == word1.lower() for word in words):
|
601 |
+
if len(word1) > len(word2):
|
602 |
+
output.append("Long")
|
603 |
+
else:
|
604 |
+
output.append("Short")
|
605 |
+
elif any(word.lower() == word2.lower() for word in words):
|
606 |
+
if len(word1) > len(word2):
|
607 |
+
output.append("Short")
|
608 |
+
else:
|
609 |
+
output.append("Long")
|
610 |
else:
|
611 |
output.append("Other")
|
612 |
|
613 |
'''Exp4'''
|
614 |
|
615 |
+
elif responses_df["Experiment"][i] == "E4":
|
616 |
+
filtered_lines = [r.split(':', 1)[-1].strip() if ':' in r else r for r in rs.split("\n")]
|
617 |
+
filtered_lines = [r.split('-', 1)[-1].strip() if '-' in r else r for r in filtered_lines]
|
618 |
+
rs = "\n".join(filtered_lines)
|
619 |
+
|
620 |
+
filtered_lines = [r.split(':', 1)[-1].strip() if ':' in r else r for r in rs.split(";")]
|
621 |
+
filtered_lines = [r.split('-', 1)[-1].strip() if '-' in r else r for r in filtered_lines]
|
622 |
+
rs = ";".join(filtered_lines).strip()
|
623 |
try:
|
624 |
meaning_word = rs.split(";")[4].replace(" ", '')
|
625 |
except IndexError:
|
626 |
+
try:
|
627 |
+
meaning_word = rs.split("\n")[4].replace(" ", '')
|
628 |
+
except IndexError:
|
629 |
+
output.append("Other")
|
630 |
+
continue
|
631 |
except Exception as e:
|
632 |
print(f"Unexpected error: {e}")
|
633 |
output.append("Other")
|
634 |
continue
|
635 |
+
|
636 |
+
target = responses_df["Factor 2"][i].strip().lower()
|
|
|
637 |
pair = target + "_" + meaning_word
|
638 |
+
#print("E4:", pair)
|
639 |
|
640 |
if pair in wordpair2code.keys():
|
641 |
output.append(wordpair2code[pair])
|
|
|
643 |
output.append("Other")
|
644 |
|
645 |
'''Exp5'''
|
646 |
+
elif responses_df["Experiment"][i] == "E5" or responses_df["Experiment"][i] == "E51":
|
647 |
+
# sentence = responses_df["Response"][i].strip()
|
648 |
+
item_id = responses_df["Item"][i]
|
649 |
+
question_id = responses_df["Question_ID"][i]
|
650 |
|
651 |
sti1, sti2 = "", ""
|
652 |
|
653 |
+
if responses_df["Experiment"][i] == "E51":
|
654 |
sti1 = Stimuli1[question_id[0:-1]].lower().replace("...", "")
|
655 |
+
#sti2 = Stimuli2[question_id[0:-1]].lower().replace("...", "")
|
656 |
verb = item2verb1[item_id].lower()
|
657 |
|
658 |
sentence = sti1 + " " + rs.replace(sti1, "")
|
659 |
+
#print("E5", verb, sentence)
|
660 |
+
if responses_df["Experiment"][i] == "E5":
|
661 |
+
#sti1 = Stimuli1[question_id].lower().replace("...", "")
|
662 |
+
# print(sti1)
|
663 |
sti2 = Stimuli2[question_id].lower().replace("...", "")
|
664 |
|
665 |
verb = item2verb2[item_id].lower()
|
666 |
+
sentence = sti2 + " " + rs.replace(sti2, "")
|
667 |
+
#print("E5", verb, sentence)
|
668 |
|
669 |
+
doc = nlp1(sentence.replace(" ", " "))
|
|
|
670 |
# print(doc)
|
671 |
# print()
|
672 |
verb_token = None
|
|
|
676 |
verb_token = token
|
677 |
break
|
678 |
# exit()
|
679 |
+
pobj, dative = None, None
|
680 |
+
# print(verb_token.children)
|
681 |
+
# exit()
|
682 |
+
if verb_token is not None:
|
|
683 |
for child in verb_token.children:
|
684 |
+
# print(child)
|
685 |
+
if (child.dep_ == 'dative' and child.pos_ == "ADP") or (
|
686 |
+
child.text == "to" and child.dep_ == 'prep' and child.pos_ == "ADP"):
|
687 |
pobj = child.text
|
688 |
if child.dep_ == 'dative':
|
689 |
dative = child.text
|
690 |
+
|
691 |
+
# print("E5", pobj, dative)
|
692 |
+
# exit()
|
693 |
+
|
694 |
+
if pobj:
|
695 |
+
output.append("PO")
|
696 |
+
elif dative:
|
697 |
+
output.append("DO")
|
698 |
+
else:
|
699 |
+
# print("Other", sentence, pobj, dative)
|
700 |
# exit()
|
701 |
+
output.append("Other")
|
702 |
+
|
703 |
|
|
|
704 |
|
705 |
'''Exp6'''
|
706 |
|
707 |
+
elif responses_df["Experiment"][i] == "E6":
|
708 |
+
sentence = responses_df["Stimuli 1"][i].strip().lower()
|
709 |
+
#print("E6", sentence)
|
710 |
doc = nlp1(sentence)
|
711 |
subject = "None"
|
712 |
obj = "None"
|
713 |
+
|
714 |
+
|
715 |
for token in doc:
|
716 |
if token.dep_ == "nsubj":
|
717 |
subject = token.text
|
718 |
elif token.dep_ == "dobj":
|
719 |
obj = token.text
|
720 |
+
#print("E6", subject, obj)
|
721 |
if subject in rs and obj in rs:
|
722 |
+
#print(rs, subject, obj, "Other")
|
723 |
output.append("Other")
|
724 |
elif subject in rs:
|
725 |
+
#print(rs, subject, obj, "VP")
|
726 |
output.append("VP")
|
727 |
elif obj in rs:
|
728 |
+
#print(rs, subject, obj, "NP")
|
729 |
output.append("NP")
|
730 |
else:
|
731 |
+
#print(rs, subject, obj, "Other")
|
732 |
output.append("Other")
|
733 |
|
|
734 |
'''Exp7'''
|
735 |
+
elif responses_df["Experiment"][i] == "E7":
|
736 |
+
# rs = responses_df["Response"][i].strip().lower()
|
737 |
rs = rs.replace(".", "").replace(",", "").lower()
|
738 |
+
#print("E7", rs)
|
739 |
+
if "yes" in rs and "no" in rs:
|
740 |
+
output.append("Other")
|
741 |
+
elif "no" in rs:
|
742 |
+
output.append("0")
|
743 |
+
elif "yes" in rs:
|
744 |
+
output.append("1")
|
745 |
+
else:
|
|
|
746 |
output.append("Other")
|
747 |
|
748 |
'''Exp8'''
|
749 |
+
elif responses_df["Experiment"][i] == "E8":
|
750 |
+
# rs = responses_df["Response"][i].strip()
|
751 |
+
#print("E8", rs)
|
752 |
if "something is wrong with the question" in rs:
|
753 |
output.append("1")
|
754 |
else:
|
755 |
output.append("0")
|
756 |
|
757 |
'''Exp9'''
|
758 |
+
elif responses_df["Experiment"][i] == "E9":
|
759 |
male, female = 0, 0
|
760 |
|
761 |
+
# rs = responses_df["Response"][i].strip()
|
762 |
if "because" in rs:
|
763 |
+
rs = rs.replace("because because", "because").split("because")[1]
|
764 |
else:
|
765 |
rs = rs
|
766 |
+
condition = responses_df["Factor 2"][i].strip()
|
767 |
rs = rs.split(" ")
|
768 |
for w in rs:
|
769 |
if w in male_keyword and female != 1:
|
|
|
772 |
if w in female_keyword and male != 1:
|
773 |
female = 1
|
774 |
break
|
775 |
+
#print("E9", "condition", condition, "male", male, "female", female)
|
776 |
+
if male == 0 and female == 0:
|
777 |
output.append('Other')
|
778 |
else:
|
779 |
+
if male == 1 and female == 0:
|
780 |
if condition == "MF":
|
781 |
output.append("Subject")
|
782 |
elif condition == "FM":
|
783 |
output.append("Object")
|
784 |
else:
|
785 |
output.append("Other")
|
786 |
+
elif female == 1 and male == 0:
|
787 |
if condition == "MF":
|
788 |
output.append("Object")
|
789 |
elif condition == "FM":
|
|
|
792 |
output.append("Other")
|
793 |
|
794 |
'''Exp10'''
|
795 |
+
elif responses_df["Experiment"][i] == "E10":
|
796 |
+
# rs = responses_df["Response"][i].strip()
|
797 |
+
rs = rs.replace(".", "")
|
798 |
+
if rs == "yes":
|
|
799 |
output.append("1")
|
800 |
else:
|
801 |
output.append("0")
|
802 |
else:
|
803 |
+
#print("can;t find the Exp:", responses_df["Experiment"][i])
|
804 |
output.append("NA")
|
805 |
# print(output)
|
806 |
# exit()
|
807 |
'''human'''
|
808 |
+
# self.data = pd.DataFrame(list(zip(responses_df["Experiment"], responses_df["Question_ID"], responses_df["Item"], responses_df["Response"], responses_df["Factor 2"], responses_df["Stimuli 1"], responses_df["Coding"], output)),
|
809 |
# columns=["Experiment", "Question_ID", "Item", "Response", "Factor 2", "Simulate 1","Original_Coding","Coding"])
|
810 |
'''LLM'''
|
811 |
+
# print(len(output))
|
812 |
+
self.data = pd.DataFrame(list(
|
813 |
+
zip(responses_df["Experiment"], responses_df["Question_ID"], responses_df["Item"], responses_df["Response"],
|
814 |
+
responses_df["Factor 2"], responses_df["Stimuli 1"], output)),
|
815 |
+
columns=["Experiment", "Question_ID", "Item", "Response", "Factor 2", "Stimuli 1",
|
816 |
+
"Coding"])
|
817 |
|
818 |
return self.data
|
819 |
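The E5/E51 branch above decides between a prepositional-object ("PO") and a double-object ("DO") continuation purely from spaCy's dependency labels on the children of the target verb. A standalone sketch of that check follows; it assumes the `en_core_web_sm` pipeline is installed, and the verb-matching condition is an illustrative stand-in for the lines collapsed in this diff, not the repository's exact code.

```python
# Illustrative only: mirrors the PO/DO decision used for E5 above.
# Assumes `python -m spacy download en_core_web_sm` has been run.
import spacy

nlp = spacy.load("en_core_web_sm")

def classify_dative(sentence: str, verb_lemma: str) -> str:
    doc = nlp(sentence)
    # The exact verb-matching rule is collapsed in the diff; matching on the lemma is a guess.
    verb_token = next((t for t in doc if t.lemma_ == verb_lemma), None)
    if verb_token is None:
        return "Other"
    pobj, dative = None, None
    for child in verb_token.children:
        if (child.dep_ == "dative" and child.pos_ == "ADP") or (
                child.text == "to" and child.dep_ == "prep" and child.pos_ == "ADP"):
            pobj = child.text
        if child.dep_ == "dative":
            dative = child.text
    if pobj:
        return "PO"   # prepositional object: "... gave the ball to the boy"
    if dative:
        return "DO"   # double object: "... gave the boy the ball"
    return "Other"

print(classify_dative("The teacher gave the ball to the boy.", "give"))  # typically "PO"
print(classify_dative("The teacher gave the boy the ball.", "give"))     # typically "DO"
```

Because `pobj` is tested first, a sentence whose to-phrase attaches to the verb is coded "PO" even though spaCy may also label that preposition `dative`.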
|
|
|
934 |
|
935 |
return all_results
|
936 |
|
937 |
+
|
938 |
+
def evaluate_humanlike(self, responses_df: pd.DataFrame, human_data_path: object, result_save_path: str) -> object:
|
|
|
939 |
'''
|
940 |
evaluate humanlike score
|
941 |
1. code the result
|
|
|
956 |
|
957 |
'''coding llm data'''
|
958 |
save_path = result_save_path.replace('.csv','_coding.csv')
|
959 |
+
self.llm_df = self.code_results_llm(responses_df)
|
960 |
|
961 |
|
962 |
|
|
|
967 |
self.llm_df.to_csv(fpath)
|
968 |
|
969 |
envs.API.upload_file(
|
970 |
+
path_or_fileobj=save_path,#./generation_results/meta-llama/Llama-2-13b-chat-hf_coding.csv
|
971 |
path_in_repo=f"{save_path.replace('generation_results/','')}",#
|
972 |
repo_id=envs.RESULTS_REPO,
|
973 |
repo_type="dataset",
|
|
|
981 |
|
982 |
|
983 |
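The hunk that turns these coded responses into a score is collapsed in this diff, but the docstring of `evaluate_humanlike` and the description in `src/display/about.py` both state that the per-stimulus score is 1 minus the Jensen-Shannon divergence between the human and model response distributions. A minimal sketch of that idea, with illustrative names and counts that are not taken from the repository:

```python
import numpy as np

def js_divergence(p, q, base=2.0, eps=1e-12):
    """Jensen-Shannon divergence between two discrete distributions (in [0, 1] for base 2)."""
    p = np.asarray(p, dtype=float) + eps
    q = np.asarray(q, dtype=float) + eps
    p, q = p / p.sum(), q / q.sum()
    m = 0.5 * (p + q)
    kl = lambda a, b: float(np.sum(a * np.log(a / b))) / np.log(base)
    return 0.5 * kl(p, m) + 0.5 * kl(q, m)

def humanlike_similarity(human_counts, model_counts):
    """1 - JSD, so 1.0 means the model's coded responses are distributed like the humans'."""
    return 1.0 - js_divergence(human_counts, model_counts)

# e.g. a stimulus where 70% of humans but 60% of model samples picked the same option
print(round(humanlike_similarity([70, 30], [60, 40]), 3))  # ~0.992
```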
|
|
|
src/backend/util.py
CHANGED
@@ -1,81 +1,3 @@
|
|
1 |
-
def is_summary_valid(summary: str) -> bool:
|
2 |
-
"""
|
3 |
-
Checks if the summary is valid.
|
4 |
-
|
5 |
-
A summary is valid if it is not empty and contains at least five words.
|
6 |
-
|
7 |
-
Args:
|
8 |
-
summary (str): The summary to check.
|
9 |
-
|
10 |
-
Returns:
|
11 |
-
bool: True if the summary is valid, False otherwise.
|
12 |
-
"""
|
13 |
-
if isinstance(summary, str):
|
14 |
-
words = summary.split()
|
15 |
-
if len(words) >= 5:
|
16 |
-
return True
|
17 |
-
# print(summary)
|
18 |
-
return False
|
19 |
-
|
20 |
-
|
21 |
-
def create_pairs(df):
|
22 |
-
"""
|
23 |
-
Creates pairs of source and summary from the dataframe.
|
24 |
-
|
25 |
-
Args:
|
26 |
-
df (DataFrame): The dataframe containing source and summary columns.
|
27 |
-
|
28 |
-
Returns:
|
29 |
-
list: A list of pairs [source, summary].
|
30 |
-
"""
|
31 |
-
pairs = []
|
32 |
-
for _, row in df.iterrows():
|
33 |
-
pairs.append([row['source'], row['summary']])
|
34 |
-
|
35 |
-
return pairs
|
36 |
-
|
37 |
-
|
38 |
-
# def format_results(model_name: str, revision: str, precision: str,
|
39 |
-
# factual_consistency_rate: float, hallucination_rate: float,
|
40 |
-
# answer_rate: float, avg_summary_len: float) -> dict:
|
41 |
-
# """
|
42 |
-
# Formats the evaluation results into a structured dictionary.
|
43 |
-
#
|
44 |
-
# Args:
|
45 |
-
# model_name (str): The name of the evaluated model.
|
46 |
-
# revision (str): The revision hash of the model.
|
47 |
-
# precision (str): The precision with which the evaluation was run.
|
48 |
-
# factual_consistency_rate (float): The factual consistency rate.
|
49 |
-
# hallucination_rate (float): The hallucination rate.
|
50 |
-
# answer_rate (float): The answer rate.
|
51 |
-
# avg_summary_len (float): The average summary length.
|
52 |
-
#
|
53 |
-
# Returns:
|
54 |
-
# dict: A dictionary containing the structured evaluation results.
|
55 |
-
# """
|
56 |
-
# results = {
|
57 |
-
# "config": {
|
58 |
-
# "model_dtype": precision, # Precision with which you ran the evaluation
|
59 |
-
# "model_name": model_name, # Name of the model
|
60 |
-
# "model_sha": revision # Hash of the model
|
61 |
-
# },
|
62 |
-
# "results": {
|
63 |
-
# "hallucination_rate": {
|
64 |
-
# "hallucination_rate": round(hallucination_rate,3)
|
65 |
-
# },
|
66 |
-
# "factual_consistency_rate": {
|
67 |
-
# "factual_consistency_rate": round(factual_consistency_rate,1)
|
68 |
-
# },
|
69 |
-
# "answer_rate": {
|
70 |
-
# "answer_rate": round(answer_rate*100,1)
|
71 |
-
# },
|
72 |
-
# "average_summary_length": {
|
73 |
-
# "average_summary_length": round(avg_summary_len,1)
|
74 |
-
# },
|
75 |
-
# }
|
76 |
-
# }
|
77 |
-
#
|
78 |
-
# return results
|
79 |
|
80 |
def format_results(model_name: str, revision: str, precision: str, overall_js: float, overall_ci: tuple, **experiment_scores) -> dict:
|
81 |
"""
|
@@ -97,7 +19,7 @@ def format_results(model_name: str, revision: str, precision: str, overall_js: f
|
|
97 |
"config": {
|
98 |
"model_dtype": precision, # Precision with which you ran the evaluation
|
99 |
"model_name": model_name, # Name of the model
|
100 |
-
"model_sha": revision # Hash of the model
|
101 |
},
|
102 |
"results": {
|
103 |
"overall_js_divergence": overall_js, # Overall JS divergence
|
|
|
1 |
|
2 |
def format_results(model_name: str, revision: str, precision: str, overall_js: float, overall_ci: tuple, **experiment_scores) -> dict:
|
3 |
"""
|
|
|
19 |
"config": {
|
20 |
"model_dtype": precision, # Precision with which you ran the evaluation
|
21 |
"model_name": model_name, # Name of the model
|
22 |
+
#"model_sha": revision # Hash of the model
|
23 |
},
|
24 |
"results": {
|
25 |
"overall_js_divergence": overall_js, # Overall JS divergence
|
src/display/about.py
CHANGED
@@ -33,10 +33,6 @@ class Tasks(Enum):
|
|
33 |
E10 = Task("E10", "E10", "E10 Humanlike %")
|
34 |
E10_ci = Task("E10_ci", "E10_ci", "E10 CI")
|
35 |
|
36 |
-
# factual_consistency_rate = Task("factual_consistency_rate", "factual_consistency_rate", "Factual Consistency Rate (%)")
|
37 |
-
# answer_rate = Task("answer_rate", "answer_rate", "Answer Rate (%)")
|
38 |
-
# average_summary_length = Task("average_summary_length",
|
39 |
-
# "average_summary_length", "Average Summary Length")
|
40 |
|
41 |
|
42 |
# Your leaderboard name
|
@@ -59,18 +55,6 @@ To quantify the similarity, we collected responses from 2000 human participants,
|
|
59 |
To measure the similarity between human and model responses, we utilize the Jensen-Shannon (JS) divergence. This method allows us to compare the two binomial distributions (one from human responses and one from model responses) for each stimulus.
|
60 |
The similarity is quantified by calculating 1 minus the JS divergence, where a value closer to 1 indicates higher similarity.
|
61 |
|
62 |
-
## Evaluation Dataset
|
63 |
-
|
64 |
-
Our evaluation dataset consists of 1006 documents from multiple public datasets, primarily [CNN/Daily Mail Corpus](https://huggingface.co/datasets/cnn_dailymail/viewer/1.0.0/test).
|
65 |
-
We generate summaries for each of these documents using submitted LLMs and compute hallucination scores for each pair of document and generated summary. (Check the prompt we used [here](https://huggingface.co/spaces/vectara/Hallucination-evaluation-leaderboard))
|
66 |
-
|
67 |
-
## Metrics Explained
|
68 |
-
- Individual Task Similarity: For each psycholinguistic task, we calculate the humanlike score for each stimulus, providing a measure of how closely the model’s responses resemble those of humans.
|
69 |
-
- Average Similarity: The average of the humanlike scores across all stimuli and tasks, giving an overall indication of the model’s performance in mimicking human language use.
|
70 |
-
|
71 |
-
## Note on non-Hugging Face models
|
72 |
-
On the HHEM leaderboard, there are currently models, such as GPT variants, that are not available on the Hugging Face model hub. We ran the evaluations for these models ourselves and uploaded the results to the leaderboard.
|
73 |
-
If you would like to submit a model that is not available on the Hugging Face model hub, please contact us at [email protected].
|
74 |
|
75 |
## Model Submissions and Reproducibility
|
76 |
You can submit your model for evaluation whether or not it is hosted on the Hugging Face model hub (though hosting it on the Hugging Face Hub is recommended).
|
@@ -101,27 +85,26 @@ After the evaluation, results are saved in "eval-results-bk/your_model_id/result
|
|
101 |
## Results Format
|
102 |
The results are structured in JSON as follows:
|
103 |
```python
|
104 |
-
{
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
"
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
|
|
122 |
}
|
123 |
-
}
|
124 |
-
}
|
125 |
```
|
126 |
For additional queries or model submissions, please contact [email protected].
|
127 |
"""
|
|
|
33 |
E10 = Task("E10", "E10", "E10 Humanlike %")
|
34 |
E10_ci = Task("E10_ci", "E10_ci", "E10 CI")
|
35 |
|
|
|
36 |
|
37 |
|
38 |
# Your leaderboard name
|
|
|
55 |
To measure the similarity between human and model responses, we utilize the Jensen-Shannon (JS) divergence. This method allows us to compare the two binomial distributions (one from human responses and one from model responses) for each stimulus.
|
56 |
The similarity is quantified by calculating 1 minus the JS divergence, where a value closer to 1 indicates higher similarity.
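As a worked illustration of this metric (assuming the base-2 formulation, which bounds the divergence between 0 and 1): if 70% of human participants but 60% of a model's responses choose the same option for a stimulus, the JS divergence between the two binomial distributions is about 0.008, so the reported similarity for that stimulus is roughly 0.99.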
|
57 |
|
|
|
58 |
|
59 |
## Model Submissions and Reproducibility
|
60 |
You can submit your model for evaluation whether or not it is hosted on the Hugging Face model hub (though hosting it on the Hugging Face Hub is recommended).
|
|
|
85 |
## Results Format
|
86 |
The results are structured in JSON as follows:
|
87 |
```python
|
88 |
+
{
|
89 |
+
"config": {
|
90 |
+
"model_dtype": "BF16",
|
91 |
+
"model_name": "mistralai/Mistral-7B-Instruct-v0.1",
|
92 |
+
"model_sha": ""
|
93 |
+
},
|
94 |
+
"results": {
|
95 |
+
"overall_js_divergence": 0.6129438385008659,
|
96 |
+
"overall_confidence_interval": [
|
97 |
+
0.5937234777290732,
|
98 |
+
0.6317188731175192
|
99 |
+
],
|
100 |
+
"E9": 0.7768461816966632,
|
101 |
+
"E9_ci": [
|
102 |
+
0.7474754730701578,
|
103 |
+
0.8058680968641126
|
104 |
+
],
|
105 |
+
...
|
106 |
+
}
|
107 |
}
|
|
|
|
|
108 |
```
|
109 |
For additional queries or model submissions, please contact [email protected].
|
110 |
"""
|
src/display/utils.py
CHANGED
@@ -45,7 +45,7 @@ auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub Licen
|
|
45 |
auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
|
46 |
auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
|
47 |
auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
|
48 |
-
auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
|
49 |
# Dummy column for the search bar (hidden by the custom CSS)
|
50 |
auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
|
51 |
|
|
|
45 |
auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
|
46 |
auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
|
47 |
auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
|
48 |
+
# auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
|
49 |
# Dummy column for the search bar (hidden by the custom CSS)
|
50 |
auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
|
51 |
|
src/envs.py
CHANGED
@@ -53,7 +53,6 @@ API = HfApi(token=TOKEN)
|
|
53 |
|
54 |
DATASET_PATH = "./src/datasets/Material_Llama2_0603.xlsx" #experiment data
|
55 |
PROMPT_PATH = "./src/datasets/prompt.xlsx" #prompt for each experiment
|
56 |
-
HEM_PATH = 'vectara/hallucination_evaluation_model'
|
57 |
HUMAN_DATA = "./src/datasets/human_data_coding.csv" #experiment data
|
58 |
ITEM_4_DATA = "./src/datasets/associataion_dataset.csv" #database
|
59 |
ITEM_5_DATA = "./src/datasets/Items_5.csv" #verbs needed for experiment 5
|
@@ -61,5 +60,4 @@ ITEM_5_DATA = "./src/datasets/Items_5.csv" #experiment 5 need verb words
|
|
61 |
# SYSTEM_PROMPT = "You are a chat bot answering questions using data. You must stick to the answers provided solely by the text in the passage provided."
|
62 |
SYSTEM_PROMPT = "You are participating in a psycholinguistic experiment. You will complete a task on English language use. Please respond to the questions directly, without using introductory phrases (e.g., Sure or OK) or special formats at the beginning of your responses."
|
63 |
'''prompt'''
|
64 |
-
# USER_PROMPT = "You are asked the question 'Provide a concise summary of the following passage, covering the core pieces of information described': "
|
65 |
USER_PROMPT = ""
|
|
|
53 |
|
54 |
DATASET_PATH = "./src/datasets/Material_Llama2_0603.xlsx" #experiment data
|
55 |
PROMPT_PATH = "./src/datasets/prompt.xlsx" #prompt for each experiment
|
|
|
56 |
HUMAN_DATA = "./src/datasets/human_data_coding.csv" #experiment data
|
57 |
ITEM_4_DATA = "./src/datasets/associataion_dataset.csv" #database
|
58 |
ITEM_5_DATA = "./src/datasets/Items_5.csv" #verbs needed for experiment 5
|
|
|
60 |
# SYSTEM_PROMPT = "You are a chat bot answering questions using data. You must stick to the answers provided solely by the text in the passage provided."
|
61 |
SYSTEM_PROMPT = "You are participating in a psycholinguistic experiment. You will complete a task on English language use. Please respond to the questions directly, without using introductory phrases (e.g., Sure or OK) or special formats at the beginning of your responses."
|
62 |
'''prompt'''
|
|
|
63 |
USER_PROMPT = ""
|
src/leaderboard/read_evals.py
CHANGED
@@ -85,16 +85,15 @@ class EvalResult:
|
|
85 |
if isinstance(v, (int, float)) and not math.isnan(v):
|
86 |
accs.append(np.around(v * 100, decimals=0))
|
87 |
elif isinstance(v, list):
|
88 |
-
# handle each numeric value in the list
|
89 |
accs.extend([np.around(x * 100, decimals=0) for x in v if
|
90 |
isinstance(x, (int, float)) and not math.isnan(x)])
|
91 |
else:
|
92 |
# skip NaN values and values that do not meet the criteria
|
93 |
accs.append(None)
|
94 |
|
95 |
-
|
96 |
accs = np.array([x for x in accs if x is not None])
|
97 |
-
|
98 |
accs = accs[accs != None]
|
99 |
|
100 |
results[task.benchmark] = accs
|
@@ -168,7 +167,7 @@ class EvalResult:
|
|
168 |
utils.AutoEvalColumn.architecture.name: self.architecture,
|
169 |
utils.AutoEvalColumn.model.name: formatting.make_clickable_model(self.full_model),
|
170 |
utils.AutoEvalColumn.dummy.name: self.full_model,
|
171 |
-
utils.AutoEvalColumn.revision.name: self.revision,
|
172 |
utils.AutoEvalColumn.license.name: self.license,
|
173 |
utils.AutoEvalColumn.likes.name: self.likes,
|
174 |
utils.AutoEvalColumn.params.name: self.num_params,
|
|
|
85 |
if isinstance(v, (int, float)) and not math.isnan(v):
|
86 |
accs.append(np.around(v * 100, decimals=0))
|
87 |
elif isinstance(v, list):
|
|
|
88 |
accs.extend([np.around(x * 100, decimals=0) for x in v if
|
89 |
isinstance(x, (int, float)) and not math.isnan(x)])
|
90 |
else:
|
91 |
# skip NaN values and values that do not meet the criteria
|
92 |
accs.append(None)
|
93 |
|
94 |
+
|
95 |
accs = np.array([x for x in accs if x is not None])
|
96 |
+
|
97 |
accs = accs[accs != None]
|
98 |
|
99 |
results[task.benchmark] = accs
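The loop above flattens a mixture of scalar scores and `[low, high]` confidence-interval lists into one array of rounded percentages, dropping NaNs. A simplified standalone illustration (it omits the `None` placeholder and filtering step kept in the class):

```python
import math
import numpy as np

raw = {"E1": 0.78, "E1_ci": [0.74, 0.81], "E2": float("nan")}
accs = []
for v in raw.values():
    if isinstance(v, (int, float)) and not math.isnan(v):
        accs.append(np.around(v * 100, decimals=0))
    elif isinstance(v, list):
        accs.extend(np.around(x * 100, decimals=0) for x in v
                    if isinstance(x, (int, float)) and not math.isnan(x))
print(np.array(accs))  # [78. 74. 81.] -- the NaN score is dropped
```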
|
|
|
167 |
utils.AutoEvalColumn.architecture.name: self.architecture,
|
168 |
utils.AutoEvalColumn.model.name: formatting.make_clickable_model(self.full_model),
|
169 |
utils.AutoEvalColumn.dummy.name: self.full_model,
|
170 |
+
# utils.AutoEvalColumn.revision.name: self.revision,
|
171 |
utils.AutoEvalColumn.license.name: self.license,
|
172 |
utils.AutoEvalColumn.likes.name: self.likes,
|
173 |
utils.AutoEvalColumn.params.name: self.num_params,
|
src/populate.py
CHANGED
@@ -19,7 +19,6 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
|
|
19 |
print("all results:",df.to_string())
|
20 |
# exit()
|
21 |
try:
|
22 |
-
df = df.sort_values(by=[utils.AutoEvalColumn.hallucination_rate.name], ascending=True)
|
23 |
df = df[cols].round(decimals=2)
|
24 |
# filter out if any of the benchmarks have not been produced
|
25 |
df = df[formatting.has_no_nan_values(df, benchmark_cols)]
|
|
|
19 |
print("all results:",df.to_string())
|
20 |
# exit()
|
21 |
try:
|
|
|
22 |
df = df[cols].round(decimals=2)
|
23 |
# filter out if any of the benchmarks have not been produced
|
24 |
df = df[formatting.has_no_nan_values(df, benchmark_cols)]
|