XufengDuan committed on
Commit ad27ecb
Parent: 86c17df

update scripts

app.py CHANGED
@@ -1,5 +1,4 @@
1
  import logging
2
-
3
  import gradio as gr
4
  import pandas as pd
5
  from apscheduler.schedulers.background import BackgroundScheduler
@@ -9,7 +8,6 @@ from main_backend import PENDING_STATUS, RUNNING_STATUS, FINISHED_STATUS, FAILED
9
  from src.backend import sort_queue
10
  from src.envs import EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, RESULTS_REPO
11
  import src.backend.manage_requests as manage_requests
12
-
13
  import socket
14
  import src.display.about as about
15
  from src.display.css_html_js import custom_css
@@ -21,12 +19,11 @@ import os
21
  import datetime
22
  import spacy_transformers
23
  import pprint
 
24
 
25
  pp = pprint.PrettyPrinter(width=80)
26
-
27
  TOKEN = os.environ.get("H4_TOKEN", None)
28
  print("TOKEN", TOKEN)
29
- import src.backend.run_eval_suite as run_eval_suite
30
 
31
  def ui_snapshot_download(repo_id, local_dir, repo_type, tqdm_class, etag_timeout):
32
  try:
@@ -45,7 +42,8 @@ def init_space():
45
  # sync model_type with open-llm-leaderboard
46
  ui_snapshot_download(repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30)
47
  ui_snapshot_download(repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30)
48
- raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, utils.COLS, utils.BENCHMARK_COLS)
 
49
 
50
  finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(EVAL_REQUESTS_PATH, utils.EVAL_COLS)
51
  return original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
 
1
  import logging
 
2
  import gradio as gr
3
  import pandas as pd
4
  from apscheduler.schedulers.background import BackgroundScheduler
 
8
  from src.backend import sort_queue
9
  from src.envs import EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, RESULTS_REPO
10
  import src.backend.manage_requests as manage_requests
 
11
  import socket
12
  import src.display.about as about
13
  from src.display.css_html_js import custom_css
 
19
  import datetime
20
  import spacy_transformers
21
  import pprint
22
+ import src.backend.run_eval_suite as run_eval_suite
23
 
24
  pp = pprint.PrettyPrinter(width=80)
 
25
  TOKEN = os.environ.get("H4_TOKEN", None)
26
  print("TOKEN", TOKEN)
 
27
 
28
  def ui_snapshot_download(repo_id, local_dir, repo_type, tqdm_class, etag_timeout):
29
  try:
 
42
  # sync model_type with open-llm-leaderboard
43
  ui_snapshot_download(repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30)
44
  ui_snapshot_download(repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30)
45
+
46
+ original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, utils.COLS, utils.BENCHMARK_COLS)
47
 
48
  finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(EVAL_REQUESTS_PATH, utils.EVAL_COLS)
49
  return original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
src/backend/model_operations.py CHANGED
@@ -2,7 +2,7 @@ import os
2
  import time
3
  from datetime import datetime
4
  import logging
5
- from pathlib import Path
6
  import requests
7
  import json
8
 
@@ -135,12 +135,12 @@ class SummaryGenerator:
135
  # prompt = {}
136
  # for index, row in tqdm(df_prompt.iterrows(), total=df_prompt.shape[0]):
137
  # prompt['E' + row['Item']] = row['Prompt']
138
- xls = pd.ExcelFile(dataset)
139
  sheet_names = xls.sheet_names
140
  # sheet_names = df.sheetnames
141
- print(f"Total: {len(sheet_names)}")
142
- print(sheet_names)
143
-
144
  Experiment_ID, Questions_ID, Item_ID, Condition, User_prompt, Response, Factor_2, Stimuli_1 = [], [], [], [], [] ,[], [], []
145
  exit_outer_loop = False # bad model
146
  for i, sheet_name in enumerate(sheet_names, start=1):
@@ -150,17 +150,17 @@ class SummaryGenerator:
150
  # if i > 2 and i ==1:
151
  # continue
152
  print(i, sheet_name)
153
- df_sheet = pd.read_excel(xls, sheet_name=sheet_name)
154
-
155
  # Assume the first column is 'Prompt0', but use the column name here to avoid hard-coding
156
- if 'Prompt0' in df_sheet.columns:
157
- prompt_column = df_sheet['Prompt0']
158
- else:
159
  # If the 'Prompt0' column does not exist, skip this sheet or handle it differently
160
- continue
161
  if i == 3 :
162
- word1_list = df_sheet['Stimuli-2']
163
- word2_list = df_sheet['Stimuli-3']
164
  V2_column = []
165
  for jj in range(len(word1_list)):
166
  V2_column.append(word1_list[jj] + '_' + word2_list[jj])
@@ -175,17 +175,17 @@ class SummaryGenerator:
175
  Item_column = df_sheet["Item"]
176
  Condition_column = df_sheet["Condition"]
177
  Stimuli_1_column = df_sheet["Stimuli-1"]
178
- if 'Stimuli-2' in df_sheet.columns:
179
  Stimuli_2_column = df_sheet["Stimuli-2"]
180
 
181
  for j, prompt_value in enumerate(tqdm(prompt_column, desc=f"Processing {sheet_name}"), start=0):
182
  if exit_outer_loop:
183
  break
184
- ID = 'E' + str(i)
185
  # q_ID = ID + '_' + str(j)
186
-
187
  # print(ID, q_ID, prompt_value)
188
- system_prompt = envs.SYSTEM_PROMPT
189
  _user_prompt = prompt_value
190
  for ii in range(10):
191
  # user_prompt = f"{envs.USER_PROMPT}\nPassage:\n{_source}"
@@ -193,7 +193,7 @@ class SummaryGenerator:
193
  try:
194
  '''Invoke the model'''
195
  print(ID,'-',ii)
196
-
197
  _response = self.generate_summary(system_prompt, _user_prompt)
198
  # print(f"Finish index {index}")
199
  break
@@ -212,17 +212,24 @@ class SummaryGenerator:
212
  print(f"Quota has reached, wait for {wait_time}")
213
  time.sleep(wait_time)
214
  else:
215
- print(f"Error at index {i}: {e}")
216
- wait_time = 3600
217
- time.sleep(wait_time)
218
- try:
219
- _response = self.generate_summary(system_prompt, _user_prompt)
220
- break
221
- except Exception as e:
222
- exceptions.append(e)
223
  print(f"Error at index {i}: {e}")
224
- exit_outer_loop = True # set the flag to True so the outermost loop is exited
225
- break # break out of the current while loop
226
 
227
  if exit_outer_loop:
228
  break
@@ -272,9 +279,9 @@ class SummaryGenerator:
272
  Experiment_ID.append(ID)
273
  Questions_ID.append(q_column[j])
274
  User_prompt.append(_user_prompt)
275
-
276
  Response.append(_response2)
277
-
278
  Factor_2.append(V2_column[j])
279
  Stimuli_1.append(Stimuli_2_column[j])
280
  Item_ID.append(Item_column[j])
@@ -286,18 +293,18 @@ class SummaryGenerator:
286
  User_prompt.append(_user_prompt)
287
  Response.append(_response1)
288
 
289
-
290
-
291
  Factor_2.append(V2_column[j])
292
  Stimuli_1.append(Stimuli_1_column[j])
293
  Item_ID.append(Item_column[j])
294
  Condition.append(Condition_column[j])
295
-
296
  else:
297
  Experiment_ID.append(ID)
298
  Questions_ID.append(q_column[j])
299
  User_prompt.append(_user_prompt)
300
-
301
  Response.append(_response)
302
  if i == 6:
303
  Factor_2.append(Condition_column[j])
@@ -309,7 +316,7 @@ class SummaryGenerator:
309
  Condition.append(Condition_column[j])
310
  print(_response)
311
 
312
-
313
  # exit()
314
 
315
  # Sleep to prevent hitting rate limits too frequently
@@ -322,14 +329,14 @@ class SummaryGenerator:
322
  print(f'Save summaries to {save_path}')
323
  fpath = Path(save_path)
324
  fpath.parent.mkdir(parents=True, exist_ok=True)
325
- self.summaries_df.to_csv(fpath)
326
 
327
  self.exceptions = exceptions
328
  # self._compute_avg_length()
329
  # self._compute_answer_rate()
330
 
331
  return self.summaries_df
332
-
333
  def generate_summary(self, system_prompt: str, user_prompt: str):
334
  # Using Together AI API
335
  using_together_api = False
@@ -388,28 +395,115 @@ class SummaryGenerator:
388
  result = ''
389
  print(result)
390
  return result
391
 
392
- # Using OpenAI API
393
- elif 'gpt' in self.model_id.lower():
394
- response = litellm.completion(
395
- model=self.model_id.replace('openai/',''),
396
- messages=[{"role": "system", "content": system_prompt},
397
- {"role": "user", "content": user_prompt}],
398
- # temperature=0.0,
399
- max_tokens=50,
400
- api_key = os.getenv('OpenAI_key')
401
- )
402
- result = response['choices'][0]['message']['content']
403
- # print()
404
- print(result)
405
  return result
406
-
407
- # Using Google AI API for Gemini models
408
  elif 'gemini' in self.model_id.lower():
409
  genai.configure(api_key=os.getenv('GOOGLE_AI_API_KEY'))
410
  generation_config = {
411
  "temperature": 0,
412
- "top_p": 0.95, # cannot change
413
  "top_k": 0,
414
  "max_output_tokens": 50,
415
  # "response_mime_type": "application/json",
@@ -432,101 +526,35 @@ class SummaryGenerator:
432
  "threshold": "BLOCK_NONE"
433
  },
434
  ]
435
- model = genai.GenerativeModel(model_name="gemini-1.5-pro-latest" if "gemini-1.5-pro" in self.model_id.lower() else self.model_id.lower().split('google/')[-1],
436
- generation_config=generation_config,
437
- system_instruction=system_prompt,
438
- safety_settings=safety_settings)
 
 
439
  convo = model.start_chat(history=[])
440
  convo.send_message(user_prompt)
441
  # print(convo.last)
442
  result = convo.last.text
443
  print(result)
444
  return result
445
-
446
- # Using HF API or download checkpoints
447
- elif self.local_model is None:
448
- # print(self.model_id)
449
- # print(self.api_base)
450
- # mistralai/Mistral-7B-Instruct-v0.1
451
- # https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.1
452
- try: # try use HuggingFace API
453
- # response = litellm.completion(
454
- # model="huggingface/"+'command-r-plus' if 'command' in self.model_id else self.model_id,
455
- # messages=[{"role": "system", "content": system_prompt},
456
- # {"role": "user", "content": user_prompt}],
457
- # temperature=0.0,
458
- # max_tokens=1024,
459
- # api_base= "https://api-inference.huggingface.co/models/" + self.model_id,
460
- # )
461
- # self.model_id = 'command-r-plus' if 'command' in self.model_id else self.model_id
462
- # response = litellm.completion(
463
- # model="huggingface/" + self.model_id,
464
- # # mistralai/Mistral-7B-Instruct-v0.1",
465
- # messages=[{"role": "system", "content": system_prompt},
466
- # {"role": "user", "content": user_prompt}],
467
- # #temperature=0.0,
468
- # max_tokens=1024,
469
- # api_base="https://api-inference.huggingface.co/models/" + self.model_id)
470
- # print("模型返回结果",response)
471
- # print("模型返回结果结束")
472
- # # exit()
473
- # result = response['choices'][0]['message']['content']
474
- # print(result)
475
- from huggingface_hub import InferenceClient
476
- print("token_for_request:",envs.TOKEN)
477
- print(self.model_id)
478
- client = InferenceClient(self.model_id,api_key=envs.TOKEN,headers={"X-use-cache": "false"})
479
- messages = [{"role": "system", "content": system_prompt},{"role": "user", "content": user_prompt}]
480
- # outputs = client.chat_completion(messages, max_tokens=50)
481
- result = None
482
- while result is None:
483
- outputs = client.chat_completion(messages, max_tokens=50)
484
- result = outputs['choices'][0]['message']['content']
485
-
486
- if result is None:
487
- time.sleep(1) # Optional: Add a small delay before retrying
488
-
489
- return result
490
- # exit()
491
- except: # fail to call api. run it locally.
492
- self.tokenizer = AutoTokenizer.from_pretrained(self.model_id, trust_remote_code=True)
493
- print("Tokenizer loaded")
494
- self.local_model = AutoModelForCausalLM.from_pretrained(self.model_id, trust_remote_code=True, device_map="auto", torch_dtype="auto", cache_dir='/home/paperspace/cache')
495
- print("Local model loaded")
496
  # exit()
497
  # Using local model
498
- if self.local_model: # cannot call API. using local model
499
- messages=[
500
- {"role": "system", "content": system_prompt}, # gemma-1.1 does not accept system role
501
- {"role": "user", "content": user_prompt}
502
- ]
503
- try: # some models support pipeline
504
- pipe = pipeline(
505
- "text-generation",
506
- model=self.local_model,
507
- tokenizer=self.tokenizer,
508
- )
509
-
510
- generation_args = {
511
- "max_new_tokens": 50,
512
- "return_full_text": False,
513
- #"temperature": 0.0,
514
- "do_sample": False,
515
- }
516
 
517
- output = pipe(messages, **generation_args)
518
- result = output[0]['generated_text']
519
- print(result)
520
- except:
521
- prompt = self.tokenizer.apply_chat_template(messages,add_generation_prompt=True, tokenize=False)
522
- print(prompt)
523
- input_ids = self.tokenizer(prompt, return_tensors="pt").to('cuda')
524
- with torch.no_grad():
525
- outputs = self.local_model.generate(**input_ids, max_new_tokens=50, do_sample=True, pad_token_id=self.tokenizer.eos_token_id)
526
- result = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
527
- result = result.replace(prompt[0], '')
528
- print(result)
529
- return result
530
 
531
  def _compute_avg_length(self):
532
  """
@@ -607,7 +635,7 @@ class EvaluationModel:
607
  for i in range(len(summaries_df["Experiment"])):
608
  # vote_1_1, vote_1_2, vote_1_3 = 0, 0, 0
609
  # print()
610
- if pd.isna(summaries_df["Response"][i]):
611
  output.append("Other")
612
  continue
613
  rs = summaries_df["Response"][i].strip().lower()
@@ -627,7 +655,7 @@ class EvaluationModel:
627
  output.append("Spiky")
628
  else:
629
  output.append("Other")
630
-
631
 
632
  '''Exp2'''
633
 
@@ -647,12 +675,12 @@ class EvaluationModel:
647
  break
648
  if male == 0 and female == 0 :
649
  output.append("Other")
650
-
651
  '''Exp3'''
652
  elif summaries_df["Experiment"][i] == "E3":
653
  # rs = summaries_df["Response"][i].strip()
654
  print("E3", rs)
655
- if pd.isna(summaries_df["Factor 2"][i]):
656
  output.append("Other")
657
  else:
658
  if summaries_df["Factor 2"][i].strip() == "LS":
@@ -668,9 +696,9 @@ class EvaluationModel:
668
  elif "3" in rs:
669
  output.append("Long")
670
  else:
671
- output.append("Other")
672
  '''Exp4'''
673
-
674
  elif summaries_df["Experiment"][i] == "E4":
675
  # rs = summaries_df["Response"][i].strip()
676
  target = summaries_df["Factor 2"][i].strip().lower()
@@ -704,8 +732,8 @@ class EvaluationModel:
704
  verb = item2verb2[item_id].lower()
705
  sentence = sti2.replace("...","") + " " + rs.replace(sti2, "")
706
  print("E5", verb, sentence)
707
-
708
-
709
  doc = nlp1(sentence.replace(" "," "))
710
  # print(doc)
711
  # print()
@@ -745,8 +773,8 @@ class EvaluationModel:
745
 
746
  elif summaries_df["Experiment"][i] == "E6":
747
  sentence = summaries_df["Stimuli 1"][i].strip().lower()
748
- print("E6", sentence)
749
- doc = nlp1(sentence)
750
  subject = "None"
751
  obj = "None"
752
  # Traverse the dependency relations to find the subject and object
@@ -767,9 +795,9 @@ class EvaluationModel:
767
  output.append("NP")
768
  else:
769
  print(rs, subject, obj, "Other")
770
- output.append("Other")
 
771
 
772
-
773
 
774
 
775
  '''Exp7'''
@@ -786,7 +814,7 @@ class EvaluationModel:
786
  '''Exp8'''
787
  elif summaries_df["Experiment"][i] == "E8":
788
  # rs = summaries_df["Response"][i].strip()
789
-
790
  if "something is wrong with the question" in rs:
791
  output.append("1")
792
  else:
@@ -795,7 +823,7 @@ class EvaluationModel:
795
  '''Exp9'''
796
  elif summaries_df["Experiment"][i] == "E9":
797
  male, female = 0, 0
798
-
799
  # rs = summaries_df["Response"][i].strip()
800
  if "because" in rs:
801
  rs = rs.replace("because because","because").split("because")[1]
@@ -847,8 +875,8 @@ class EvaluationModel:
847
  # '''LLM'''
848
  # self.data = pd.DataFrame(list(zip(summaries_df["Experiment"], summaries_df["Question_ID"], summaries_df["Item"], summaries_df["Response"], summaries_df["Factor 2"], summaries_df["Stimuli 1"], output)),
849
  # columns=["Experiment", "Question_ID", "Item", "Response", "Factor 2", "Simulate 1","Coding"])
850
- print(self.data.head())
851
-
852
  return self.data
853
  def code_results_llm(self, summaries_df):
854
  '''code results from LLM's response'''
@@ -878,7 +906,7 @@ class EvaluationModel:
878
  for i in range(len(summaries_df["Experiment"])):
879
  # vote_1_1, vote_1_2, vote_1_3 = 0, 0, 0
880
  # print()
881
- if pd.isna(summaries_df["Response"][i]):
882
  output.append("Other")
883
  continue
884
  rs = summaries_df["Response"][i].strip().lower()
@@ -893,7 +921,7 @@ class EvaluationModel:
893
  output.append("Spiky")
894
  else:
895
  output.append("Other")
896
-
897
 
898
  '''Exp2'''
899
 
@@ -913,13 +941,13 @@ class EvaluationModel:
913
  break
914
  if male == 0 and female == 0 :
915
  output.append("Other")
916
-
917
  '''Exp3'''
918
  elif summaries_df["Experiment"][i] == "E3":
919
  # rs = summaries_df["Response"][i].strip()
920
  print("E3", rs)
921
  rs = rs.replace('"', '')
922
- pair = summaries_df["Factor 2"][i]
923
  word1, word2 = pair.split('_')
924
 
925
  if rs == word1:
@@ -980,8 +1008,8 @@ class EvaluationModel:
980
  verb = item2verb2[item_id].lower()
981
  sentence = sti2.replace("...","") + " " + rs.replace(sti2, "")
982
  print("E5", verb, sentence)
983
-
984
-
985
  doc = nlp1(sentence.replace(" "," "))
986
  # print(doc)
987
  # print()
@@ -1021,8 +1049,8 @@ class EvaluationModel:
1021
 
1022
  elif summaries_df["Experiment"][i] == "E6":
1023
  sentence = summaries_df["Stimuli 1"][i].strip().lower()
1024
- print("E6", sentence)
1025
- doc = nlp1(sentence)
1026
  subject = "None"
1027
  obj = "None"
1028
  # Traverse the dependency relations to find the subject and object
@@ -1043,9 +1071,9 @@ class EvaluationModel:
1043
  output.append("NP")
1044
  else:
1045
  print(rs, subject, obj, "Other")
1046
- output.append("Other")
 
1047
 
1048
-
1049
 
1050
 
1051
  '''Exp7'''
@@ -1072,7 +1100,7 @@ class EvaluationModel:
1072
  '''Exp9'''
1073
  elif summaries_df["Experiment"][i] == "E9":
1074
  male, female = 0, 0
1075
-
1076
  # rs = summaries_df["Response"][i].strip()
1077
  if "because" in rs:
1078
  rs = rs.replace("because because","because").split("because")[1]
@@ -1125,14 +1153,14 @@ class EvaluationModel:
1125
  '''LLM'''
1126
  self.data = pd.DataFrame(list(zip(summaries_df["Experiment"], summaries_df["Question_ID"], summaries_df["Item"], summaries_df["Response"], summaries_df["Factor 2"], summaries_df["Stimuli 1"], output)),
1127
  columns=["Experiment", "Question_ID", "Item", "Response", "Factor 2", "Simulate 1","Coding"])
1128
- print(self.data.head())
1129
-
1130
  return self.data
1131
-
1132
 
1133
-
1134
-
1135
-
 
1136
 
1137
 
1138
  def calculate_js_divergence(self, file_path_1, file_path_2):
@@ -1225,7 +1253,7 @@ class EvaluationModel:
1225
  print("avg_js_divergence:", avg_js_divergence)
1226
 
1227
  return avg_js_divergence
1228
-
1229
 
1230
  def evaluate_humanlike(self, summaries_df: object, human_data_path: object, result_save_path: object) -> object:
1231
  '''
@@ -1272,19 +1300,19 @@ class EvaluationModel:
1272
 
1273
 
1274
 
1275
-
1276
 
1277
 
1278
 
1279
 
1280
-
1281
-
1282
-
1283
-
1284
 
1285
 
1286
-
1287
-
1288
 
1289
 
1290
 
 
2
  import time
3
  from datetime import datetime
4
  import logging
5
+ from pathlib import Path
6
  import requests
7
  import json
8
 
 
135
  # prompt = {}
136
  # for index, row in tqdm(df_prompt.iterrows(), total=df_prompt.shape[0]):
137
  # prompt['E' + row['Item']] = row['Prompt']
138
+ xls = pd.ExcelFile(dataset)
139
  sheet_names = xls.sheet_names
140
  # sheet_names = df.sheetnames
141
+ print(f"Total: {len(sheet_names)}")
142
+ print(sheet_names)
143
+
144
  Experiment_ID, Questions_ID, Item_ID, Condition, User_prompt, Response, Factor_2, Stimuli_1 = [], [], [], [], [] ,[], [], []
145
  exit_outer_loop = False # bad model
146
  for i, sheet_name in enumerate(sheet_names, start=1):
 
150
  # if i > 2 and i ==1:
151
  # continue
152
  print(i, sheet_name)
153
+ df_sheet = pd.read_excel(xls, sheet_name=sheet_name)
154
+
155
  # Assume the first column is 'Prompt0', but use the column name here to avoid hard-coding
156
+ if 'Prompt0' in df_sheet.columns:
157
+ prompt_column = df_sheet['Prompt0']
158
+ else:
159
  # If the 'Prompt0' column does not exist, skip this sheet or handle it differently
160
+ continue
161
  if i == 3 :
162
+ word1_list = df_sheet['Stimuli-2']
163
+ word2_list = df_sheet['Stimuli-3']
164
  V2_column = []
165
  for jj in range(len(word1_list)):
166
  V2_column.append(word1_list[jj] + '_' + word2_list[jj])
 
175
  Item_column = df_sheet["Item"]
176
  Condition_column = df_sheet["Condition"]
177
  Stimuli_1_column = df_sheet["Stimuli-1"]
178
+ if 'Stimuli-2' in df_sheet.columns:
179
  Stimuli_2_column = df_sheet["Stimuli-2"]
180
 
181
  for j, prompt_value in enumerate(tqdm(prompt_column, desc=f"Processing {sheet_name}"), start=0):
182
  if exit_outer_loop:
183
  break
184
+ ID = 'E' + str(i)
185
  # q_ID = ID + '_' + str(j)
186
+
187
  # print(ID, q_ID, prompt_value)
188
+ system_prompt = envs.SYSTEM_PROMPT
189
  _user_prompt = prompt_value
190
  for ii in range(10):
191
  # user_prompt = f"{envs.USER_PROMPT}\nPassage:\n{_source}"
 
193
  try:
194
  '''Invoke the model'''
195
  print(ID,'-',ii)
196
+
197
  _response = self.generate_summary(system_prompt, _user_prompt)
198
  # print(f"Finish index {index}")
199
  break
 
212
  print(f"Quota has reached, wait for {wait_time}")
213
  time.sleep(wait_time)
214
  else:
215
+ max_retries = 30
216
+ retries = 0
217
+ wait_time = 120
218
+
219
+ while retries < max_retries:
 
 
 
220
  print(f"Error at index {i}: {e}")
221
+ time.sleep(wait_time)
222
+ try:
223
+ _response = self.generate_summary(system_prompt, _user_prompt)
224
+ break
225
+ except Exception as e:
226
+ exceptions.append(e)
227
+ retries += 1
228
+ print(f"Retry {retries}/{max_retries} failed at index {i}: {e}")
229
+ if retries >= max_retries:
230
+ exit_outer_loop = True
231
+ break
232
+
233
 
234
  if exit_outer_loop:
235
  break
 
279
  Experiment_ID.append(ID)
280
  Questions_ID.append(q_column[j])
281
  User_prompt.append(_user_prompt)
282
+
283
  Response.append(_response2)
284
+
285
  Factor_2.append(V2_column[j])
286
  Stimuli_1.append(Stimuli_2_column[j])
287
  Item_ID.append(Item_column[j])
 
293
  User_prompt.append(_user_prompt)
294
  Response.append(_response1)
295
 
296
+
297
+
298
  Factor_2.append(V2_column[j])
299
  Stimuli_1.append(Stimuli_1_column[j])
300
  Item_ID.append(Item_column[j])
301
  Condition.append(Condition_column[j])
302
+
303
  else:
304
  Experiment_ID.append(ID)
305
  Questions_ID.append(q_column[j])
306
  User_prompt.append(_user_prompt)
307
+
308
  Response.append(_response)
309
  if i == 6:
310
  Factor_2.append(Condition_column[j])
 
316
  Condition.append(Condition_column[j])
317
  print(_response)
318
 
319
+
320
  # exit()
321
 
322
  # Sleep to prevent hitting rate limits too frequently
 
329
  print(f'Save summaries to {save_path}')
330
  fpath = Path(save_path)
331
  fpath.parent.mkdir(parents=True, exist_ok=True)
332
+ self.summaries_df.to_csv(fpath)
333
 
334
  self.exceptions = exceptions
335
  # self._compute_avg_length()
336
  # self._compute_answer_rate()
337
 
338
  return self.summaries_df
339
+
340
  def generate_summary(self, system_prompt: str, user_prompt: str):
341
  # Using Together AI API
342
  using_together_api = False
 
395
  result = ''
396
  print(result)
397
  return result
398
+ if self.local_model: # cannot call API. using local model
399
+ messages=[
400
+ {"role": "system", "content": system_prompt}, # gemma-1.1 does not accept system role
401
+ {"role": "user", "content": user_prompt}
402
+ ]
403
+ try: # some models support pipeline
404
+ pipe = pipeline(
405
+ "text-generation",
406
+ model=self.local_model,
407
+ tokenizer=self.tokenizer,
408
+ )
409
 
410
+ generation_args = {
411
+ "max_new_tokens": 50,
412
+ "return_full_text": False,
413
+ #"temperature": 0.0,
414
+ "do_sample": False,
415
+ }
416
+
417
+ output = pipe(messages, **generation_args)
418
+ result = output[0]['generated_text']
419
+ print(result)
420
+ except:
421
+ prompt = self.tokenizer.apply_chat_template(messages,add_generation_prompt=True, tokenize=False)
422
+ print(prompt)
423
+ input_ids = self.tokenizer(prompt, return_tensors="pt").to('cuda')
424
+ with torch.no_grad():
425
+ outputs = self.local_model.generate(**input_ids, max_new_tokens=50, do_sample=True, pad_token_id=self.tokenizer.eos_token_id)
426
+ result = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
427
+ result = result.replace(prompt[0], '')
428
+ print(result)
429
  return result
430
+
431
+
432
+ elif self.local_model is None:
433
+ # print(self.model_id)
434
+ # print(self.api_base)
435
+ # mistralai/Mistral-7B-Instruct-v0.1
436
+ # https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.1
437
+ # Using HF API or download checkpoints
438
+ try: # try use HuggingFace API
439
+ from huggingface_hub import InferenceClient
440
+ print("token_for_request:",envs.TOKEN)
441
+ print(self.model_id)
442
+ client = InferenceClient(self.model_id,api_key=envs.TOKEN,headers={"X-use-cache": "false"})
443
+ messages = [{"role": "system", "content": system_prompt},{"role": "user", "content": user_prompt}]
444
+ # outputs = client.chat_completion(messages, max_tokens=50)
445
+ result = None
446
+ while result is None:
447
+ outputs = client.chat_completion(messages, max_tokens=50)
448
+ result = outputs['choices'][0]['message']['content']
449
+
450
+ if result is None:
451
+ time.sleep(1) # Optional: Add a small delay before retrying
452
+
453
+ return result
454
+
455
+ except Exception as e:
456
+ print(f"Error with TOKEN: {envs.TOKEN}, trying with TOKEN1")
457
+ try:
458
+ client = InferenceClient(self.model_id, api_key=envs.TOKEN1, headers={"X-use-cache": "false"})
459
+ messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}]
460
+ result = None
461
+ while result is None:
462
+ outputs = client.chat_completion(messages, max_tokens=50)
463
+ result = outputs['choices'][0]['message']['content']
464
+
465
+ if result is None:
466
+ time.sleep(1) # Optional: Add a small delay before retrying
467
+
468
+ return result
469
+ except Exception as e:
470
+ print(f"Error with TOKEN1: {envs.TOKEN1}")
471
+ raise e
472
+
473
+ # except: # fail to call api. run it locally.
474
+ # self.tokenizer = AutoTokenizer.from_pretrained(self.model_id, trust_remote_code=True)
475
+ # print("Tokenizer loaded")
476
+ # self.local_model = AutoModelForCausalLM.from_pretrained(self.model_id, trust_remote_code=True, device_map="auto", torch_dtype="auto", cache_dir='/home/paperspace/cache')
477
+ # print("Local model loaded")
478
+ # response = litellm.completion(
479
+ # model="huggingface/"+'command-r-plus' if 'command' in self.model_id else self.model_id,
480
+ # messages=[{"role": "system", "content": system_prompt},
481
+ # {"role": "user", "content": user_prompt}],
482
+ # temperature=0.0,
483
+ # max_tokens=1024,
484
+ # api_base= "https://api-inference.huggingface.co/models/" + self.model_id,
485
+ # )
486
+ # self.model_id = 'command-r-plus' if 'command' in self.model_id else self.model_id
487
+ # response = litellm.completion(
488
+ # model="huggingface/" + self.model_id,
489
+ # # mistralai/Mistral-7B-Instruct-v0.1",
490
+ # messages=[{"role": "system", "content": system_prompt},
491
+ # {"role": "user", "content": user_prompt}],
492
+ # #temperature=0.0,
493
+ # max_tokens=1024,
494
+ # api_base="https://api-inference.huggingface.co/models/" + self.model_id)
495
+ # print("模型返回结果",response)
496
+ # print("模型返回结果结束")
497
+ # # exit()
498
+ # result = response['choices'][0]['message']['content']
499
+ # print(result)
500
+ # exit()
501
+ # Using Google AI API for Gemini models
502
  elif 'gemini' in self.model_id.lower():
503
  genai.configure(api_key=os.getenv('GOOGLE_AI_API_KEY'))
504
  generation_config = {
505
  "temperature": 0,
506
+ "top_p": 0.95, # cannot change
507
  "top_k": 0,
508
  "max_output_tokens": 50,
509
  # "response_mime_type": "application/json",
 
526
  "threshold": "BLOCK_NONE"
527
  },
528
  ]
529
+ model = genai.GenerativeModel(
530
+ model_name="gemini-1.5-pro-latest" if "gemini-1.5-pro" in self.model_id.lower() else
531
+ self.model_id.lower().split('google/')[-1],
532
+ generation_config=generation_config,
533
+ system_instruction=system_prompt,
534
+ safety_settings=safety_settings)
535
  convo = model.start_chat(history=[])
536
  convo.send_message(user_prompt)
537
  # print(convo.last)
538
  result = convo.last.text
539
  print(result)
540
  return result
541
+ # Using OpenAI API
542
+ elif 'gpt' in self.model_id.lower():
543
+ response = litellm.completion(
544
+ model=self.model_id.replace('openai/',''),
545
+ messages=[{"role": "system", "content": system_prompt},
546
+ {"role": "user", "content": user_prompt}],
547
+ # temperature=0.0,
548
+ max_tokens=50,
549
+ api_key = os.getenv('OpenAI_key')
550
+ )
551
+ result = response['choices'][0]['message']['content']
552
+ # print()
553
+ print(result)
554
+ return result
555
  # exit()
556
  # Using local model
557

558
 
559
  def _compute_avg_length(self):
560
  """
 
635
  for i in range(len(summaries_df["Experiment"])):
636
  # vote_1_1, vote_1_2, vote_1_3 = 0, 0, 0
637
  # print()
638
+ if pd.isna(summaries_df["Response"][i]):
639
  output.append("Other")
640
  continue
641
  rs = summaries_df["Response"][i].strip().lower()
 
655
  output.append("Spiky")
656
  else:
657
  output.append("Other")
658
+
659
 
660
  '''Exp2'''
661
 
 
675
  break
676
  if male == 0 and female == 0 :
677
  output.append("Other")
678
+
679
  '''Exp3'''
680
  elif summaries_df["Experiment"][i] == "E3":
681
  # rs = summaries_df["Response"][i].strip()
682
  print("E3", rs)
683
+ if pd.isna(summaries_df["Factor 2"][i]):
684
  output.append("Other")
685
  else:
686
  if summaries_df["Factor 2"][i].strip() == "LS":
 
696
  elif "3" in rs:
697
  output.append("Long")
698
  else:
699
+ output.append("Other")
700
  '''Exp4'''
701
+
702
  elif summaries_df["Experiment"][i] == "E4":
703
  # rs = summaries_df["Response"][i].strip()
704
  target = summaries_df["Factor 2"][i].strip().lower()
 
732
  verb = item2verb2[item_id].lower()
733
  sentence = sti2.replace("...","") + " " + rs.replace(sti2, "")
734
  print("E5", verb, sentence)
735
+
736
+
737
  doc = nlp1(sentence.replace(" "," "))
738
  # print(doc)
739
  # print()
 
773
 
774
  elif summaries_df["Experiment"][i] == "E6":
775
  sentence = summaries_df["Stimuli 1"][i].strip().lower()
776
+ print("E6", sentence)
777
+ doc = nlp1(sentence)
778
  subject = "None"
779
  obj = "None"
780
  # Traverse the dependency relations to find the subject and object
 
795
  output.append("NP")
796
  else:
797
  print(rs, subject, obj, "Other")
798
+ output.append("Other")
799
+
800
 
 
801
 
802
 
803
  '''Exp7'''
 
814
  '''Exp8'''
815
  elif summaries_df["Experiment"][i] == "E8":
816
  # rs = summaries_df["Response"][i].strip()
817
+
818
  if "something is wrong with the question" in rs:
819
  output.append("1")
820
  else:
 
823
  '''Exp9'''
824
  elif summaries_df["Experiment"][i] == "E9":
825
  male, female = 0, 0
826
+
827
  # rs = summaries_df["Response"][i].strip()
828
  if "because" in rs:
829
  rs = rs.replace("because because","because").split("because")[1]
 
875
  # '''LLM'''
876
  # self.data = pd.DataFrame(list(zip(summaries_df["Experiment"], summaries_df["Question_ID"], summaries_df["Item"], summaries_df["Response"], summaries_df["Factor 2"], summaries_df["Stimuli 1"], output)),
877
  # columns=["Experiment", "Question_ID", "Item", "Response", "Factor 2", "Simulate 1","Coding"])
878
+ print(self.data.head())
879
+
880
  return self.data
881
  def code_results_llm(self, summaries_df):
882
  '''code results from LLM's response'''
 
906
  for i in range(len(summaries_df["Experiment"])):
907
  # vote_1_1, vote_1_2, vote_1_3 = 0, 0, 0
908
  # print()
909
+ if pd.isna(summaries_df["Response"][i]):
910
  output.append("Other")
911
  continue
912
  rs = summaries_df["Response"][i].strip().lower()
 
921
  output.append("Spiky")
922
  else:
923
  output.append("Other")
924
+
925
 
926
  '''Exp2'''
927
 
 
941
  break
942
  if male == 0 and female == 0 :
943
  output.append("Other")
944
+
945
  '''Exp3'''
946
  elif summaries_df["Experiment"][i] == "E3":
947
  # rs = summaries_df["Response"][i].strip()
948
  print("E3", rs)
949
  rs = rs.replace('"', '')
950
+ pair = summaries_df["Factor 2"][i]
951
  word1, word2 = pair.split('_')
952
 
953
  if rs == word1:
 
1008
  verb = item2verb2[item_id].lower()
1009
  sentence = sti2.replace("...","") + " " + rs.replace(sti2, "")
1010
  print("E5", verb, sentence)
1011
+
1012
+
1013
  doc = nlp1(sentence.replace(" "," "))
1014
  # print(doc)
1015
  # print()
 
1049
 
1050
  elif summaries_df["Experiment"][i] == "E6":
1051
  sentence = summaries_df["Stimuli 1"][i].strip().lower()
1052
+ print("E6", sentence)
1053
+ doc = nlp1(sentence)
1054
  subject = "None"
1055
  obj = "None"
1056
  # Traverse the dependency relations to find the subject and object
 
1071
  output.append("NP")
1072
  else:
1073
  print(rs, subject, obj, "Other")
1074
+ output.append("Other")
1075
+
1076
 
 
1077
 
1078
 
1079
  '''Exp7'''
 
1100
  '''Exp9'''
1101
  elif summaries_df["Experiment"][i] == "E9":
1102
  male, female = 0, 0
1103
+
1104
  # rs = summaries_df["Response"][i].strip()
1105
  if "because" in rs:
1106
  rs = rs.replace("because because","because").split("because")[1]
 
1153
  '''LLM'''
1154
  self.data = pd.DataFrame(list(zip(summaries_df["Experiment"], summaries_df["Question_ID"], summaries_df["Item"], summaries_df["Response"], summaries_df["Factor 2"], summaries_df["Stimuli 1"], output)),
1155
  columns=["Experiment", "Question_ID", "Item", "Response", "Factor 2", "Simulate 1","Coding"])
1156
+ print(self.data.head())
1157
+
1158
  return self.data
 
1159
 
1160
+
1161
+
1162
+
1163
+
1164
 
1165
 
1166
  def calculate_js_divergence(self, file_path_1, file_path_2):
 
1253
  print("avg_js_divergence:", avg_js_divergence)
1254
 
1255
  return avg_js_divergence
1256
+
1257
 
1258
  def evaluate_humanlike(self, summaries_df: object, human_data_path: object, result_save_path: object) -> object:
1259
  '''
 
1300
 
1301
 
1302
 
 
1303
 
1304
 
1305
 
1306

1307
 
1308
 
1309
+
1310
+
1311
+
1312
+
1313
+
1314
+
1315
+
1316
 
1317
 
1318
 
src/display/about.py CHANGED
@@ -33,15 +33,13 @@ An improved version (HHEM v2) is integrated into the [Vectara platform](https://
33
  LLM_BENCHMARKS_TEXT = """
34
  ## Introduction
35
 
36
- The Hughes Hallucination Evaluation Model (HHEM) Leaderboard is dedicated to assessing the frequency of hallucinations in document summaries generated by Large Language Models (LLMs).
37
-
38
- Hallucinations refer to instances where a model introduces factually incorrect or unrelated content in its summaries.
39
 
40
  ## How it works
41
 
42
- Using [Vectara](https://vectara.com)'s HHEM, we measure the occurrence of hallucinations in generated summaries.
43
- Given a source document and a summary generated by an LLM, HHEM outputs a hallucination score between 0 and 1, with 0 indicating complete hallucination and 1 representing perfect factual consistency.
44
- The model card for HHEM can be found [here](https://huggingface.co/vectara/hallucination_evaluation_model).
45
 
46
  ## Evaluation Dataset
47
 
@@ -49,10 +47,8 @@ Our evaluation dataset consists of 1006 documents from multiple public datasets,
49
  We generate summaries for each of these documents using submitted LLMs and compute hallucination scores for each pair of document and generated summary. (Check the prompt we used [here](https://huggingface.co/spaces/vectara/Hallucination-evaluation-leaderboard))
50
 
51
  ## Metrics Explained
52
- - Hallucination Rate: Percentage of summaries with a hallucination score below 0.5
53
- - Factual Consistency Rate: The complement of the hallucination rate, expressed as a percentage.
54
- - Answer Rate: Percentage of summaries that are non-empty. This is either the model refuses to generate a response or throws an error due to various reasons. (e.g. the model believes that the document includes inappropriate content)
55
- - Average Summary Length: The average word count of generated summaries
56
 
57
  ## Note on non-Hugging Face models
58
  On the HHEM leaderboard, there are currently models such as GPT variants that are not available on the Hugging Face model hub. We ran the evaluations for these models ourselves and uploaded the results to the leaderboard.
@@ -61,18 +57,13 @@ If you would like to submit your model that is not available on the Hugging Face
61
  ## Model Submissions and Reproducibility
62
  You can submit your model for evaluation, whether it's hosted on the Hugging Face model hub or not. (Though it is recommended to host your model on the Hugging Face)
63
 
64
- ### For models not available on the Hugging Face model hub:
65
- 1) Access generated summaries used for evaluation [here](https://github.com/vectara/hallucination-leaderboard) in "leaderboard_summaries.csv".
66
- 2) The text generation prompt is available under "Prompt Used" section in the repository's README.
67
- 3) Details on API Integration for evaluations are under "API Integration Details".
68
-
69
  ### For models available on the Hugging Face model hub:
70
  To replicate the evaluation result for a Hugging Face model:
71
 
72
  1) Clone the Repository
73
  ```python
74
  git lfs install
75
- git clone https://huggingface.co/spaces/vectara/leaderboard
76
  ```
77
  2) Install the Requirements
78
  ```python
 
33
  LLM_BENCHMARKS_TEXT = """
34
  ## Introduction
35
 
36
+ This study assesses how similar model responses are to human responses in language use across ten psycholinguistic tasks. Each task consists of multiple stimuli, and each stimulus has both expected and unexpected responses.
37
+ To quantify the similarity, we collected responses from 2000 human participants, creating a binomial distribution for each stimulus within each task. The same stimuli were then presented to a language model, generating another binomial distribution for comparison.
 
38
 
39
  ## How it works
40
 
41
+ To measure the similarity between human and model responses, we utilize the Jensen-Shannon (JS) divergence. This method allows us to compare the two binomial distributions (one from human responses and one from model responses) for each stimulus.
42
+ The similarity is quantified by calculating 1 minus the JS divergence, where a value closer to 1 indicates higher similarity.
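For illustration, here is a minimal sketch of the per-stimulus humanlike score described above (1 minus the JS divergence between the human and model response distributions). The distributions below are hypothetical; the leaderboard's own computation lives in `calculate_js_divergence` in `src/backend/model_operations.py`.
```python
import numpy as np

def js_divergence(p, q, base=2):
    # Jensen-Shannon divergence between two discrete distributions (0 = identical).
    p = np.asarray(p, dtype=float); p = p / p.sum()
    q = np.asarray(q, dtype=float); q = q / q.sum()
    m = 0.5 * (p + q)
    def kl(a, b):
        mask = a > 0
        return float(np.sum(a[mask] * np.log(a[mask] / b[mask]) / np.log(base)))
    return 0.5 * kl(p, m) + 0.5 * kl(q, m)

human = [0.80, 0.20]  # hypothetical: 80% expected vs. 20% unexpected human responses
model = [0.65, 0.35]  # hypothetical model response distribution for the same stimulus
print(1 - js_divergence(human, model))  # closer to 1 means more humanlike
```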
 
43
 
44
  ## Evaluation Dataset
45
 
 
47
  We generate summaries for each of these documents using submitted LLMs and compute hallucination scores for each pair of document and generated summary. (Check the prompt we used [here](https://huggingface.co/spaces/vectara/Hallucination-evaluation-leaderboard))
48
 
49
  ## Metrics Explained
50
+ - Individual Task Similarity: For each psycholinguistic task, we calculate the humanlike score for each stimulus, providing a measure of how closely the model’s responses resemble those of humans.
51
+ - Average Similarity: The average of the humanlike scores across all stimuli and tasks, giving an overall indication of the model’s performance in mimicking human language use.
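As a rough illustration of how these two metrics relate, the sketch below (hypothetical task names and scores, not the leaderboard's exact aggregation) averages per-stimulus humanlike scores into task-level similarities and one overall score.
```python
import numpy as np

# Hypothetical per-stimulus humanlike scores (1 - JS divergence), grouped by task.
scores_by_task = {
    "E1": [0.91, 0.84, 0.88],
    "E2": [0.76, 0.81],
}

# Individual Task Similarity: mean humanlike score within each task.
task_similarity = {task: float(np.mean(s)) for task, s in scores_by_task.items()}

# Average Similarity: mean over all stimuli across all tasks.
average_similarity = float(np.mean([x for s in scores_by_task.values() for x in s]))

print(task_similarity, average_similarity)
```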
 
 
52
 
53
  ## Note on non-Hugging Face models
54
  On the HHEM leaderboard, there are currently models such as GPT variants that are not available on the Hugging Face model hub. We ran the evaluations for these models ourselves and uploaded the results to the leaderboard.
 
57
  ## Model Submissions and Reproducibility
58
  You can submit your model for evaluation, whether it's hosted on the Hugging Face model hub or not. (Though it is recommended to host your model on the Hugging Face)
59

60
  ### For models available on the Hugging Face model hub:
61
  To replicate the evaluation result for a Hugging Face model:
62
 
63
  1) Clone the Repository
64
  ```python
65
  git lfs install
66
+ git clone https://huggingface.co/spaces/Simondon/HumanLikeness
67
  ```
68
  2) Install the Requirements
69
  ```python
src/display/formatting.py CHANGED
@@ -1,5 +1,6 @@
1
  import os
2
  from datetime import datetime, timezone
 
3
 
4
  from huggingface_hub import HfApi
5
  from huggingface_hub.hf_api import ModelInfo
@@ -29,7 +30,8 @@ def styled_message(message):
29
 
30
 
31
  def has_no_nan_values(df, columns):
32
- return df[columns].notna().all(axis=1)
 
33
 
34
 
35
  def has_nan_values(df, columns):
 
1
  import os
2
  from datetime import datetime, timezone
3
+ import numpy as np
4
 
5
  from huggingface_hub import HfApi
6
  from huggingface_hub.hf_api import ModelInfo
 
30
 
31
 
32
  def has_no_nan_values(df, columns):
33
+ return df.iloc[:, 2].apply(lambda x: not any(np.isnan(val) for val in x))
34
+
35
 
36
 
37
  def has_nan_values(df, columns):
src/envs.py CHANGED
@@ -6,12 +6,8 @@ from huggingface_hub import HfApi
6
  # replace this with our token
7
  # TOKEN = os.environ.get("HF_TOKEN", None)
8
  TOKEN = os.getenv("H4_TOKEN")
9
- print("token:", TOKEN)
10
- # print(TOKEN)
11
- # OWNER = "vectara"
12
- # REPO_ID = f"{OWNER}/Humanlike"
13
- # QUEUE_REPO = f"{OWNER}/requests"
14
- # RESULTS_REPO = f"{OWNER}/results"
15
 
16
 
17
  OWNER = "Simondon" # Change to your org - don't forget to create a results and request dataset, with the correct format!
 
6
  # replace this with our token
7
  # TOKEN = os.environ.get("HF_TOKEN", None)
8
  TOKEN = os.getenv("H4_TOKEN")
9
+ TOKEN1 = os.getenv("H4_TOKEN1")
10
+ # print("H4_token:", TOKEN)
 
 
 
 
11
 
12
 
13
  OWNER = "Simondon" # Change to your org - don't forget to create a results and request dataset, with the correct format!
src/leaderboard/read_evals.py CHANGED
@@ -155,23 +155,11 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
155
  model_result_filepaths = []
156
  print("results_path", results_path)
157
  for root, _, files in os.walk(results_path):
158
- # We should only have json files in model results
159
  print("file",files)
160
-
161
- # if not files or any([not f.endswith(".json") for f in files]):
162
-
163
- # continue
164
  for f in files:
165
  if f.endswith(".json"):
166
-
167
- # Sort the files by date
168
- # try:
169
- # files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
170
- # except dateutil.parser._parser.ParserError:
171
- # files = [files[-1]]
172
-
173
  model_result_filepaths.extend([os.path.join(root, f)])
174
- print("model_result_filepaths", model_result_filepaths)
175
  # exit()
176
  eval_results = {}
177
  for model_result_filepath in model_result_filepaths:
 
155
  model_result_filepaths = []
156
  print("results_path", results_path)
157
  for root, _, files in os.walk(results_path):
 
158
  print("file",files)
 
 
 
 
159
  for f in files:
160
  if f.endswith(".json"):
161
  model_result_filepaths.extend([os.path.join(root, f)])
162
+ print("model_result_filepaths:", model_result_filepaths)
163
  # exit()
164
  eval_results = {}
165
  for model_result_filepath in model_result_filepaths:
src/populate.py CHANGED
@@ -11,19 +11,19 @@ import src.leaderboard.read_evals as read_evals
11
  def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
12
  print(results_path, requests_path)
13
  raw_data = read_evals.get_raw_eval_results(results_path, requests_path)
14
- print("raw_data:",raw_data)
15
  all_data_json = [v.to_dict() for v in raw_data]
16
 
17
- print(all_data_json)
18
  df = pd.DataFrame.from_records(all_data_json)
19
- print(df)
20
  # exit()
21
  df = df.sort_values(by=[utils.AutoEvalColumn.hallucination_rate.name], ascending=True)
22
  df = df[cols].round(decimals=2)
23
 
24
  # filter out if any of the benchmarks have not been produced
25
  df = df[formatting.has_no_nan_values(df, benchmark_cols)]
26
- return raw_data, df
27
 
28
 
29
  def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
 
11
  def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
12
  print(results_path, requests_path)
13
  raw_data = read_evals.get_raw_eval_results(results_path, requests_path)
14
+ #print("raw_data:",raw_data)
15
  all_data_json = [v.to_dict() for v in raw_data]
16
 
17
+ #print(all_data_json)
18
  df = pd.DataFrame.from_records(all_data_json)
19
+ print("all results:",df)
20
  # exit()
21
  df = df.sort_values(by=[utils.AutoEvalColumn.hallucination_rate.name], ascending=True)
22
  df = df[cols].round(decimals=2)
23
 
24
  # filter out if any of the benchmarks have not been produced
25
  df = df[formatting.has_no_nan_values(df, benchmark_cols)]
26
+ return df
27
 
28
 
29
  def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]: