XufengDuan committed d24f6e8 (parent: c0bd00c): update scripts

Files changed:
- src/backend/evaluate_model.py +21 -38
- src/backend/model_operations.py +187 -740
- src/backend/util.py +1 -79
- src/display/about.py +19 -36
- src/display/utils.py +1 -1
- src/envs.py +0 -2
- src/leaderboard/read_evals.py +3 -4
- src/populate.py +0 -1
src/backend/evaluate_model.py
CHANGED
@@ -5,7 +5,7 @@ import csv
 
 import src.envs as envs
 
-from src.backend.model_operations import
+from src.backend.model_operations import ResponseGenerator, EvaluationModel
 import src.backend.util as util
 
 logging.basicConfig(level=logging.INFO,
@@ -26,7 +26,7 @@ class Evaluator:
         limit (int): Limit on the number of items to process.
         write_out (bool): Whether to write results to a file.
         output_base_path (str): Base path for output files.
-
+        response_generator (ResponseGenerator): Instance for generating summaries.
         eval_model (EvaluationModel): Instance for evaluating summaries.
     """
     def __init__(self, model, revision, precision, batch_size,
@@ -56,8 +56,8 @@ class Evaluator:
         self.write_out = write_out
         self.output_base_path = output_base_path
         try:
-            self.
-            self.eval_model = EvaluationModel(
+            self.response_generator = ResponseGenerator(model, revision)
+            self.eval_model = EvaluationModel()
         except Exception as e:
             logging.error(f"Error initializing Evaluator: {e}")
             raise
@@ -81,10 +81,10 @@ class Evaluator:
             # print(envs.DATASET_PATH)
             # print(df.shape)
             # print(df.iloc[-1])
-            self.
+            self.generated_responses_df = self.response_generator.generate_response(envs.DATASET_PATH, df_prompt, save_path=f"./generation_results/{self.model}.csv")
             # exit()
-            #
-            # answer_rate = self.
+            # avg_response_len = self.response_generator.avg_length
+            # answer_rate = self.response_generator.answer_rate
             envs.API.upload_file(
                 path_or_fileobj=f"./generation_results/{self.model}.csv",
                 path_in_repo=f"{self.model}.csv",
@@ -93,7 +93,7 @@ class Evaluator:
             )
 
             '''Start evaluating the model's results'''
-            self.humanlike = self.eval_model.evaluate_humanlike(self.
+            self.humanlike = self.eval_model.evaluate_humanlike(self.generated_responses_df, envs.HUMAN_DATA, f"./generation_results/{self.model}.csv")
 
             all_results = self.humanlike
             # Prepare individual experiment scores and CIs
@@ -111,23 +111,6 @@ class Evaluator:
                 overall_ci=all_results['overall']['confidence_interval'],
                 **experiment_results  # Unpack the experiment results
             )
-
-            '''Original metrics'''
-
-            # self.hallucination_scores, self.eval_results = self.eval_model.evaluate_hallucination(
-            #     self.generated_summaries_df)
-            # factual_consistency_rate = self.eval_model.compute_factual_consistency_rate()
-            # hallucination_rate = self.eval_model.hallucination_rate
-            # factual_consistency_rate = 0
-            # answer_rate = 0
-            # avg_summary_len = 0
-            #
-            # results = util.format_results(model_name=self.model, revision=self.revision,
-            #                               precision=self.precision,
-            #                               factual_consistency_rate=factual_consistency_rate,
-            #                               hallucination_rate=self.humanlike,
-            #                               answer_rate=answer_rate,
-            #                               avg_summary_len=avg_summary_len)
             return results
         except FileNotFoundError:
             logging.error(f"File not found: {envs.DATASET_PATH}")
@@ -145,28 +128,28 @@ class Evaluator:
             logging.error(f"Need to first download the results from google drive to the learderboard folder")
             raise
 
-
+        source_response_df = self.generated_responses_df[["user_prompt", "response"]]
 
-        # #update
+        # #update leaderboard_responses.csv
         # #first remove previous results for the current model
-        # existing_df = pd.read_csv(os.path.join(working_path, '
+        # existing_df = pd.read_csv(os.path.join(working_path, 'leaderboard_responses.csv'), encoding='utf-8', sep="\t")
         # mask = existing_df['model'] == self.model
        # existing_df = existing_df[~mask]
        # # get new result
-
-
-
-        print('
+        leaderboard_responses_df = source_response_df
+        leaderboard_responses_df.insert(2, "model", [self.model]*leaderboard_responses_df.shape[0])
+        leaderboard_responses_df.to_csv(os.path.join(working_path, 'leaderboard_responses.csv'), mode='a', index=False, header=False)
+        print('leaderboard_responses.csv has been updated')
 
-        # update
+        # update leaderboard_responses_with_scores.csv
        # BUG: get error when opening the file
-        # existing_df = pd.read_csv(os.path.join(working_path, '
+        # existing_df = pd.read_csv(os.path.join(working_path, 'leaderboard_responses_with_scores.csv'),
        #                           encoding='utf-8', sep=",", on_bad_lines='warn', quotechar='"', quoting=2)
        # print(existing_df.shape)
        # mask = existing_df['model'] == self.model
        # existing_df = existing_df[~mask]
        # get new result
-
-
-
-        print('
+        leaderboard_responses_with_scores_df = pd.DataFrame.from_dict(self.eval_results)
+        leaderboard_responses_with_scores_df.insert(3, "model", [self.model]*leaderboard_responses_with_scores_df.shape[0])
+        leaderboard_responses_with_scores_df.to_csv(os.path.join(working_path, 'leaderboard_responses_with_scores.csv'), mode='a', index=False, header=False)
+        print('leaderboard_responses_with_scores.csv has been updated')
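Taken together, the evaluate_model.py changes swap the old summary/hallucination pipeline for a response-generation plus human-likeness pipeline. A minimal sketch of the new wiring, assuming the same envs constants and prompt DataFrame (df_prompt) used in the diff; the run_eval wrapper itself is illustrative and not part of the commit:

import pandas as pd

import src.envs as envs
from src.backend.model_operations import ResponseGenerator, EvaluationModel


def run_eval(model_id: str, revision: str, df_prompt: pd.DataFrame) -> dict:
    # Illustrative driver only; the real logic lives in Evaluator.evaluate().
    save_path = f"./generation_results/{model_id}.csv"

    # 1. Generate (or reload) the model's responses for the prompt set.
    generator = ResponseGenerator(model_id, revision)
    responses_df = generator.generate_response(envs.DATASET_PATH, df_prompt, save_path=save_path)

    # 2. Score human-likeness against the human reference data.
    #    Note: the diff constructs EvaluationModel with no arguments, although the
    #    class further down still declares __init__(self, model_path).
    eval_model = EvaluationModel()
    return eval_model.evaluate_humanlike(responses_df, envs.HUMAN_DATA, save_path)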
src/backend/model_operations.py
CHANGED
@@ -6,7 +6,7 @@ from pathlib import Path
 import requests
 import json
 
-import numpy as np
+# import numpy as np
 import pandas as pd
 import spacy
 from sentence_transformers import CrossEncoder
@@ -43,7 +43,7 @@ subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"], check=Tr
 # Load spacy model for word tokenization
 # nlp = spacy.load("en_core_web_sm")
 try:
-    nlp1 = spacy.load("
+    nlp1 = spacy.load("en_core_web_trf")
 except OSError:
     print("Could not load the model; continuing with the rest of the processing.")
 
@@ -55,22 +55,6 @@ logging.basicConfig(level=logging.INFO,
 
 
 
-# os.environ["HUGGINGFACE_API_KEY"] = envs.TOKEN
-
-def load_evaluation_model(model_path):
-    """Load the evaluation model from the given path
-
-    Args:
-        model_path (str): Path to the evaluation model
-
-    Returns:
-        CrossEncoder: The evaluation model
-    """
-    # model = CrossEncoder(model_path)
-    model = ""
-    return model
-
-
 class ModelLoadingException(Exception):
     """Exception raised for errors in loading a model.
 
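The last two hunks above switch the tokenizer to spaCy's transformer pipeline and drop the unused load_evaluation_model helper; the load itself is still only guarded by a bare try/except OSError that prints a notice. A more defensive variant is sketched below; the fallback to en_core_web_sm and the spacy.cli.download call are illustrative additions, not part of this commit:

import spacy


def load_spacy_pipeline(preferred: str = "en_core_web_trf",
                        fallback: str = "en_core_web_sm"):
    # Try the preferred pipeline first, then the smaller fallback; attempt a
    # download once for each before giving up.
    for name in (preferred, fallback):
        try:
            return spacy.load(name)
        except OSError:
            try:
                spacy.cli.download(name)
                return spacy.load(name)
            except Exception:
                continue
    raise OSError("No English spaCy pipeline could be loaded.")


nlp1 = load_spacy_pipeline()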
@@ -85,21 +69,21 @@ class ModelLoadingException(Exception):
|
|
85 |
super().__init__(f"{messages} id={model_id} revision={revision}")
|
86 |
|
87 |
|
88 |
-
class
|
89 |
-
"""A class to generate
|
90 |
|
91 |
Attributes:
|
92 |
model (str): huggingface/{model_id}
|
93 |
api_base (str): https://api-inference.huggingface.co/models/{model_id}
|
94 |
-
|
95 |
revision (str): Model revision.
|
96 |
-
avg_length (float): Average length of
|
97 |
-
answer_rate (float): Rate of non-empty
|
98 |
"""
|
99 |
|
100 |
def __init__(self, model_id, revision):
|
101 |
"""
|
102 |
-
Initializes the
|
103 |
|
104 |
Args:
|
105 |
model_id (str): Identifier for the model.
|
@@ -108,29 +92,28 @@ class SummaryGenerator:
|
|
108 |
self.model_id = model_id
|
109 |
self.model = f"huggingface/{model_id}"
|
110 |
self.api_base = f"https://api-inference.huggingface.co/models/{model_id}"
|
111 |
-
self.
|
112 |
self.revision = revision
|
113 |
self.avg_length = None
|
114 |
self.answer_rate = None
|
115 |
self.exceptions = None
|
116 |
self.local_model = None
|
117 |
|
118 |
-
def
|
119 |
-
"""Generate
|
120 |
-
Modify here to pull the model's generated results
|
121 |
Args:
|
122 |
dataset (DataFrame): DataFrame containing source docs.
|
123 |
|
124 |
Returns:
|
125 |
-
|
126 |
"""
|
127 |
exceptions = []
|
128 |
if (save_path is not None) and os.path.exists(save_path):
|
129 |
'''If the file already exists, read the previously generated test responses'''
|
130 |
-
self.
|
131 |
-
# print(self.
|
132 |
|
133 |
-
print(f'Loaded generated
|
134 |
else:
|
135 |
'''If the test file does not exist, call the specified model to generate it'''
|
136 |
# prompt = {}
|
@@ -193,9 +176,9 @@ class SummaryGenerator:
|
|
193 |
while True:
|
194 |
try:
|
195 |
'''Call the model'''
|
196 |
-
print(ID,'-',j,'-',ii)
|
197 |
|
198 |
-
_response = self.
|
199 |
# print(f"Finish index {index}")
|
200 |
break
|
201 |
except Exception as e:
|
@@ -221,7 +204,7 @@ class SummaryGenerator:
|
|
221 |
print(f"Error at index {i}: {e}")
|
222 |
time.sleep(wait_time)
|
223 |
try:
|
224 |
-
_response = self.
|
225 |
break
|
226 |
except Exception as ee:
|
227 |
exceptions.append(ee)
|
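Both hunks above route the request through self.send_request(system_prompt, _user_prompt), the method name introduced by this commit, inside the existing retry loop. The retry pattern itself is unchanged; a compact sketch of that pattern follows, with the helper name, attempt count, and back-off value being illustrative rather than the commit's exact values:

import time


def call_with_retries(request_fn, max_attempts: int = 5, wait_time: float = 3.0):
    # Retry a flaky API call a bounded number of times, then re-raise.
    last_error = None
    for _ in range(max_attempts):
        try:
            return request_fn()
        except Exception as e:  # the diff accumulates these in `exceptions`
            last_error = e
            time.sleep(wait_time)
    raise last_error


# Roughly how it would be used inside the generation loop:
# _response = call_with_retries(lambda: self.send_request(system_prompt, _user_prompt))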
@@ -236,45 +219,46 @@ class SummaryGenerator:
|
|
236 |
break
|
237 |
if i == 5:
|
238 |
#print(_response)
|
|
|
|
|
|
|
|
|
|
|
|
|
239 |
|
240 |
def extract_responses(text, trigger_words=None):
|
241 |
if trigger_words is None:
|
242 |
trigger_words = ["sure", "okay", "yes"]
|
243 |
|
244 |
try:
|
|
|
245 |
sentences = text.split('\n')
|
246 |
-
|
247 |
sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
|
248 |
-
|
249 |
sentences = [sentence.split(':', 1)[-1].strip() if ':' in sentence else sentence for
|
250 |
sentence in sentences]
|
251 |
-
|
|
|
|
|
|
|
|
|
252 |
_response1 = sentences[1].strip() if len(sentences) > 1 else None
|
253 |
_response2 = sentences[2].strip() if len(sentences) > 2 else None
|
254 |
else:
|
255 |
_response1 = sentences[0].strip() if len(sentences) > 0 else None
|
256 |
_response2 = sentences[1].strip() if len(sentences) > 1 else None
|
257 |
|
|
|
258 |
except Exception as e:
|
259 |
print(f"Error occurred: {e}")
|
260 |
_response1, _response2 = None, None
|
261 |
|
|
|
262 |
|
263 |
return _response1, _response2
|
264 |
|
265 |
_response1, _response2 = extract_responses(_response)
|
266 |
-
# if _response == None:
|
267 |
-
# _response1, _response2 = "", ""
|
268 |
-
# else:
|
269 |
-
# try:
|
270 |
-
# import re
|
271 |
-
# _response1,_response2 = re.split(r'\n\s*\n', _response.strip())
|
272 |
-
# except:
|
273 |
-
# _response1 = _response.split('\n\n')
|
274 |
-
# if len(_response) == 2:
|
275 |
-
# _response1, _response2 = _response[0], _response[1]
|
276 |
-
# else:
|
277 |
-
# _response1, _response2 = _response[0], ""
|
278 |
|
279 |
Experiment_ID.append(ID)
|
280 |
Questions_ID.append(q_column[j])
|
@@ -309,30 +293,26 @@ class SummaryGenerator:
|
|
309 |
Stimuli_1.append(Stimuli_1_column[j])
|
310 |
Item_ID.append(Item_column[j])
|
311 |
Condition.append(Condition_column[j])
|
312 |
-
#print(_response)
|
313 |
-
|
314 |
-
|
315 |
-
# exit()
|
316 |
|
317 |
# Sleep to prevent hitting rate limits too frequently
|
318 |
time.sleep(1)
|
319 |
|
320 |
-
self.
|
321 |
-
|
322 |
|
323 |
if save_path is not None:
|
324 |
-
print(f'Save
|
325 |
fpath = Path(save_path)
|
326 |
fpath.parent.mkdir(parents=True, exist_ok=True)
|
327 |
-
self.
|
328 |
|
329 |
self.exceptions = exceptions
|
330 |
# self._compute_avg_length()
|
331 |
# self._compute_answer_rate()
|
332 |
|
333 |
-
return self.
|
334 |
|
335 |
-
def
|
336 |
# Using Together AI API
|
337 |
using_together_api = False
|
338 |
together_ai_api_models = ['mixtral', 'dbrx', 'wizardlm']
|
@@ -354,17 +334,9 @@ class SummaryGenerator:
|
|
354 |
"model": self.model_id,
|
355 |
# "max_tokens": 4096,
|
356 |
'max_new_tokens': 100,
|
357 |
-
# "
|
358 |
# 'repetition_penalty': 1.1 if 'mixtral' in self.model_id.lower() else 1
|
359 |
}
|
360 |
-
# if 'mixtral' in self.model_id.lower():
|
361 |
-
# # payload['prompt'] = user_prompt
|
362 |
-
# # payload['prompt'] = "Write a summary of the following passage:\nPassage:\n" + user_prompt.split('Passage:\n')[-1] + '\n\nSummary:'
|
363 |
-
# payload['prompt'] = 'You must stick to the passage provided. Provide a concise summary of the following passage, covering the core pieces of information described:\nPassage:\n' + user_prompt.split('Passage:\n')[-1] + '\n\nSummary:'
|
364 |
-
# print(payload)
|
365 |
-
# else:
|
366 |
-
# payload['messages'] = [{"role": "system", "content": system_prompt},
|
367 |
-
# {"role": "user", "content": user_prompt}]
|
368 |
payload['messages'] = [{"role": "system", "content": system_prompt},
|
369 |
{"role": "user", "content": user_prompt}]
|
370 |
headers = {
|
@@ -462,82 +434,13 @@ class SummaryGenerator:
|
|
462 |
continue
|
463 |
|
464 |
raise Exception("All tokens failed.")
|
465 |
-
|
466 |
-
# print(self.api_base)
|
467 |
-
# mistralai/Mistral-7B-Instruct-v0.1
|
468 |
-
# https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.1
|
469 |
-
# Using HF API or download checkpoints
|
470 |
-
# try: # try use HuggingFace API
|
471 |
-
# from huggingface_hub import InferenceClient
|
472 |
-
# print("token_for_request:",envs.TOKEN)
|
473 |
-
# print(self.model_id)
|
474 |
-
# client = InferenceClient(self.model_id,api_key=envs.TOKEN,headers={"X-use-cache": "false"})
|
475 |
-
# messages = [{"role": "system", "content": system_prompt},{"role": "user", "content": user_prompt}]
|
476 |
-
# # outputs = client.chat_completion(messages, max_tokens=100)
|
477 |
-
# result = None
|
478 |
-
# while result is None:
|
479 |
-
# outputs = client.chat_completion(messages, max_tokens=100)
|
480 |
-
# result = outputs['choices'][0]['message']['content']
|
481 |
-
#
|
482 |
-
# if result is None:
|
483 |
-
# time.sleep(1) # Optional: Add a small delay before retrying
|
484 |
-
#
|
485 |
-
# return result
|
486 |
-
#
|
487 |
-
# except Exception as e:
|
488 |
-
# print(f"Error with TOKEN: {envs.TOKEN}, trying with TOKEN1")
|
489 |
-
# try:
|
490 |
-
# client = InferenceClient(self.model_id, api_key=envs.TOKEN1, headers={"X-use-cache": "false"})
|
491 |
-
# messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}]
|
492 |
-
# result = None
|
493 |
-
# while result is None:
|
494 |
-
# outputs = client.chat_completion(messages, max_tokens=100)
|
495 |
-
# result = outputs['choices'][0]['message']['content']
|
496 |
-
#
|
497 |
-
# if result is None:
|
498 |
-
# time.sleep(1) # Optional: Add a small delay before retrying
|
499 |
-
#
|
500 |
-
# return result
|
501 |
-
# except Exception as ee:
|
502 |
-
# print(f"Error with TOKEN1: {envs.TOKEN1}")
|
503 |
-
# raise ee
|
504 |
-
|
505 |
-
|
506 |
-
# except: # fail to call api. run it locally.
|
507 |
-
# self.tokenizer = AutoTokenizer.from_pretrained(self.model_id, trust_remote_code=True)
|
508 |
-
# print("Tokenizer loaded")
|
509 |
-
# self.local_model = AutoModelForCausalLM.from_pretrained(self.model_id, trust_remote_code=True, device_map="auto", torch_dtype="auto", cache_dir='/home/paperspace/cache')
|
510 |
-
# print("Local model loaded")
|
511 |
-
# response = litellm.completion(
|
512 |
-
# model="huggingface/"+'command-r-plus' if 'command' in self.model_id else self.model_id,
|
513 |
-
# messages=[{"role": "system", "content": system_prompt},
|
514 |
-
# {"role": "user", "content": user_prompt}],
|
515 |
-
# temperature=0.0,
|
516 |
-
# max_tokens=1024,
|
517 |
-
# api_base= "https://api-inference.huggingface.co/models/" + self.model_id,
|
518 |
-
# )
|
519 |
-
# self.model_id = 'command-r-plus' if 'command' in self.model_id else self.model_id
|
520 |
-
# response = litellm.completion(
|
521 |
-
# model="huggingface/" + self.model_id,
|
522 |
-
# # mistralai/Mistral-7B-Instruct-v0.1",
|
523 |
-
# messages=[{"role": "system", "content": system_prompt},
|
524 |
-
# {"role": "user", "content": user_prompt}],
|
525 |
-
# #temperature=0.0,
|
526 |
-
# max_tokens=1024,
|
527 |
-
# api_base="https://api-inference.huggingface.co/models/" + self.model_id)
|
528 |
-
# print("模型返回结果",response)
|
529 |
-
# print("模型返回结果结束")
|
530 |
-
# # exit()
|
531 |
-
# result = response['choices'][0]['message']['content']
|
532 |
-
# print(result)
|
533 |
-
# exit()
|
534 |
-
# Using Google AI API for Gemini models
|
535 |
elif 'gemini' in self.model_id.lower():
|
536 |
genai.configure(api_key=os.getenv('GOOGLE_AI_API_KEY'))
|
537 |
generation_config = {
|
538 |
-
"temperature": 0,
|
539 |
-
"top_p": 0.95, # cannot change
|
540 |
-
"top_k": 0,
|
541 |
"max_output_tokens": 100,
|
542 |
# "response_mime_type": "application/json",
|
543 |
}
|
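For Gemini models the hunk above keeps genai.configure(...) and trims the generation_config down to max_output_tokens (temperature, top_p and top_k are commented out in the new version). Below is a sketch of how such a config is typically passed to the google-generativeai client; the GenerativeModel/generate_content calls and the model name are assumptions, since the diff only shows the configuration step:

import os

import google.generativeai as genai

genai.configure(api_key=os.getenv("GOOGLE_AI_API_KEY"))

generation_config = {
    "max_output_tokens": 100,  # the only key the commit keeps active
}

# Hypothetical model name; the repository selects the model elsewhere.
model = genai.GenerativeModel("gemini-1.5-flash", generation_config=generation_config)
reply = model.generate_content("Say hello in one short sentence.")
print(reply.text)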
@@ -589,58 +492,28 @@ class SummaryGenerator:
|
|
589 |
# Using local model
|
590 |
|
591 |
|
592 |
-
def _compute_avg_length(self):
|
593 |
-
"""
|
594 |
-
Compute the average length of non-empty summaries using SpaCy.
|
595 |
-
"""
|
596 |
-
total_word_count = 0
|
597 |
-
total_count = 0
|
598 |
-
|
599 |
-
for summary in self.summaries_df['summary']:
|
600 |
-
if util.is_summary_valid(summary):
|
601 |
-
doc = nlp1(summary)
|
602 |
-
words = [token.text for token in doc if token.is_alpha]
|
603 |
-
total_word_count += len(words)
|
604 |
-
total_count += 1
|
605 |
-
|
606 |
-
self.avg_length = 0 if total_count == 0 else total_word_count / total_count
|
607 |
-
|
608 |
-
def _compute_answer_rate(self):
|
609 |
-
"""
|
610 |
-
Compute the rate of non-empty summaries.
|
611 |
-
"""
|
612 |
-
valid_count = sum(1 for summary in self.summaries_df['summary']
|
613 |
-
if util.is_summary_valid(summary))
|
614 |
-
|
615 |
-
total_count = len(self.summaries_df)
|
616 |
-
|
617 |
-
self.answer_rate = 0 if total_count == 0 else valid_count / total_count
|
618 |
|
619 |
|
620 |
class EvaluationModel:
|
621 |
-
"""A class to evaluate generated
|
622 |
|
623 |
Attributes:
|
624 |
model (CrossEncoder): The evaluation model.
|
625 |
-
scores (list): List of
|
626 |
-
|
627 |
-
|
|
|
628 |
"""
|
629 |
|
630 |
def __init__(self, model_path):
|
631 |
"""
|
632 |
-
Initializes the EvaluationModel
|
633 |
-
|
634 |
-
Args:
|
635 |
-
model_path (str): Path to the CrossEncoder model.
|
636 |
"""
|
637 |
-
self.model = load_evaluation_model(model_path)
|
638 |
self.scores = []
|
639 |
-
self.factual_consistency_rate = None
|
640 |
-
self.hallucination_rate = None
|
641 |
self.humanlike_score = None
|
642 |
|
643 |
-
def
|
644 |
'''code results from LLM's response'''
|
645 |
output = []
|
646 |
'''database for Exp4'''
|
@@ -661,28 +534,27 @@ class EvaluationModel:
|
|
661 |
Stimuli1[item5['ID'][j]] = item5['Stimuli-1'][j]
|
662 |
Stimuli2[item5['ID'][j]] = item5['Stimuli-2'][j]
|
663 |
|
664 |
-
|
665 |
male_keyword = ["he", "his", "himself"]
|
666 |
female_keyword = ["she", "her", "herself"]
|
667 |
-
print(len(
|
668 |
-
for i in range(len(
|
|
|
669 |
# vote_1_1, vote_1_2, vote_1_3 = 0, 0, 0
|
670 |
# print()
|
671 |
-
if pd.isna(
|
672 |
output.append("Other")
|
673 |
continue
|
674 |
-
rs =
|
675 |
-
|
676 |
-
|
677 |
-
|
678 |
-
|
679 |
-
|
680 |
-
rs =
|
|
|
681 |
'''Exp1'''
|
682 |
-
|
683 |
-
|
684 |
-
print("E1", rs)
|
685 |
-
rs = rs.replace('"','')
|
686 |
if rs == "round":
|
687 |
# vote_1_1 += 1
|
688 |
output.append("Round")
|
@@ -691,13 +563,12 @@ class EvaluationModel:
|
|
691 |
else:
|
692 |
output.append("Other")
|
693 |
|
694 |
-
|
695 |
'''Exp2'''
|
696 |
-
|
697 |
-
elif
|
698 |
-
# rs =
|
699 |
rs = rs.split(' ')
|
700 |
-
print("E2", rs)
|
701 |
male, female = 0, 0
|
702 |
for word in rs:
|
703 |
if word in female_keyword and male == 0:
|
@@ -708,323 +579,63 @@ class EvaluationModel:
|
|
708 |
male = 1
|
709 |
output.append("Male")
|
710 |
break
|
711 |
-
if male == 0 and female == 0
|
712 |
output.append("Other")
|
713 |
|
714 |
'''Exp3'''
|
715 |
-
|
716 |
-
|
717 |
-
# rs
|
718 |
-
|
719 |
-
|
720 |
-
output.append("Other")
|
721 |
-
else:
|
722 |
-
if summaries_df["Factor 2"][i].strip() == "LS":
|
723 |
-
if "2" in rs:
|
724 |
-
output.append("Long")
|
725 |
-
elif "3" in rs:
|
726 |
-
output.append("Short")
|
727 |
-
else:
|
728 |
-
output.append("Other")
|
729 |
-
if summaries_df["Factor 2"][i].strip() == "SL":
|
730 |
-
if "2" in rs:
|
731 |
-
output.append("Short")
|
732 |
-
elif "3" in rs:
|
733 |
-
output.append("Long")
|
734 |
-
else:
|
735 |
-
output.append("Other")
|
736 |
-
'''Exp4'''
|
737 |
-
|
738 |
-
elif summaries_df["Experiment"][i] == "E4":
|
739 |
-
# rs = summaries_df["Response"][i].strip()
|
740 |
-
target = summaries_df["Factor 2"][i].strip().lower()
|
741 |
-
pair = target + "_" + rs
|
742 |
-
print("E4:", pair)
|
743 |
-
if pair in wordpair2code.keys():
|
744 |
-
output.append(wordpair2code[pair])
|
745 |
-
else:
|
746 |
-
output.append("Other")
|
747 |
-
|
748 |
-
'''Exp5'''
|
749 |
-
elif summaries_df["Experiment"][i] == "E5" or summaries_df["Experiment"][i] == "E51":
|
750 |
-
# sentence = summaries_df["Response"][i].strip()
|
751 |
-
item_id = summaries_df["Item"][i]
|
752 |
-
question_id = summaries_df["Question_ID"][i]
|
753 |
-
|
754 |
-
sti1, sti2 = "", ""
|
755 |
-
|
756 |
-
if summaries_df["Experiment"][i] == "E51":
|
757 |
-
sti1 = Stimuli1[question_id[0:-1]].lower().replace("...", "")
|
758 |
-
sti2 = Stimuli2[question_id[0:-1]].lower().replace("...", "")
|
759 |
-
verb = item2verb1[item_id].lower()
|
760 |
|
761 |
-
|
762 |
-
print("E5", verb, sentence)
|
763 |
-
if summaries_df["Experiment"][i] == "E5":
|
764 |
-
sti1 = Stimuli1[question_id].lower().replace("...", "")
|
765 |
-
# print(sti1)
|
766 |
-
sti2 = Stimuli2[question_id].lower().replace("...", "")
|
767 |
-
|
768 |
-
verb = item2verb2[item_id].lower()
|
769 |
-
sentence = sti2.replace("...","") + " " + rs.replace(sti2, "")
|
770 |
-
print("E5", verb, sentence)
|
771 |
-
|
772 |
-
|
773 |
-
doc = nlp1(sentence.replace(" "," "))
|
774 |
-
# print(doc)
|
775 |
-
# print()
|
776 |
-
verb_token = None
|
777 |
-
for token in doc:
|
778 |
-
# print(token.lemma_)
|
779 |
-
if token.lemma_ == verb:
|
780 |
-
verb_token = token
|
781 |
-
break
|
782 |
-
# exit()
|
783 |
-
if verb_token is None:
|
784 |
output.append("Other")
|
785 |
-
print("E5 The target verb is missing from the sentence.")
|
786 |
else:
|
787 |
-
|
788 |
-
|
789 |
-
|
790 |
-
for child in verb_token.children:
|
791 |
-
print(child)
|
792 |
-
if (child.dep_ == 'dative' and child.pos_ == "ADP") or (child.text == "to" and child.dep_ == 'prep' and child.pos_ == "ADP"):
|
793 |
-
pobj = child.text
|
794 |
-
if child.dep_ == 'dative':
|
795 |
-
dative = child.text
|
796 |
-
print("E5", pobj, dative)
|
797 |
-
# exit()
|
798 |
-
|
799 |
-
if pobj:
|
800 |
-
output.append("PO")
|
801 |
-
elif dative:
|
802 |
-
output.append("DO")
|
803 |
-
else:
|
804 |
-
print("Other", sentence, pobj, dative)
|
805 |
-
# exit()
|
806 |
output.append("Other")
|
807 |
-
|
808 |
-
|
809 |
-
|
810 |
-
|
811 |
-
|
812 |
-
|
813 |
-
|
814 |
-
|
815 |
-
|
816 |
-
|
817 |
-
|
818 |
-
if token.dep_ == "nsubj":
|
819 |
-
subject = token.text
|
820 |
-
elif token.dep_ == "dobj":
|
821 |
-
obj = token.text
|
822 |
-
print("E6", subject, obj)
|
823 |
-
if subject in rs and obj in rs:
|
824 |
-
print(rs, subject, obj, "Other")
|
825 |
-
output.append("Other")
|
826 |
-
elif subject in rs:
|
827 |
-
print(rs, subject, obj, "VP")
|
828 |
-
output.append("VP")
|
829 |
-
elif obj in rs:
|
830 |
-
print(rs, subject, obj, "NP")
|
831 |
-
output.append("NP")
|
832 |
-
else:
|
833 |
-
print(rs, subject, obj, "Other")
|
834 |
-
output.append("Other")
|
835 |
-
|
836 |
-
|
837 |
-
|
838 |
-
|
839 |
-
'''Exp7'''
|
840 |
-
elif summaries_df["Experiment"][i] == "E7":
|
841 |
-
# rs = summaries_df["Response"][i].strip().lower()
|
842 |
-
print("E7",rs)
|
843 |
-
if rs == "no":
|
844 |
-
output.append("0")
|
845 |
-
elif rs == "yes":
|
846 |
-
output.append("1")
|
847 |
-
else:
|
848 |
-
output.append("Other")
|
849 |
-
|
850 |
-
'''Exp8'''
|
851 |
-
elif summaries_df["Experiment"][i] == "E8":
|
852 |
-
# rs = summaries_df["Response"][i].strip()
|
853 |
-
|
854 |
-
if "something is wrong with the question" in rs:
|
855 |
-
output.append("1")
|
856 |
-
else:
|
857 |
-
output.append("0")
|
858 |
-
|
859 |
-
'''Exp9'''
|
860 |
-
elif summaries_df["Experiment"][i] == "E9":
|
861 |
-
male, female = 0, 0
|
862 |
-
|
863 |
-
# rs = summaries_df["Response"][i].strip()
|
864 |
-
if "because" in rs:
|
865 |
-
rs = rs.replace("because because","because").split("because")[1]
|
866 |
-
else:
|
867 |
-
rs = rs
|
868 |
-
condition = summaries_df["Factor 2"][i].strip()
|
869 |
-
rs = rs.split(" ")
|
870 |
-
for w in rs:
|
871 |
-
if w in male_keyword and female != 1:
|
872 |
-
male = 1
|
873 |
-
break
|
874 |
-
if w in female_keyword and male != 1:
|
875 |
-
female = 1
|
876 |
-
break
|
877 |
-
print("E9", "condition", condition, "male", male, "female", female)
|
878 |
-
if male == 0 and female == 0:
|
879 |
-
output.append('Other')
|
880 |
-
else:
|
881 |
-
if male == 1 and female==0:
|
882 |
-
if condition == "MF":
|
883 |
-
output.append("Subject")
|
884 |
-
elif condition == "FM":
|
885 |
-
output.append("Object")
|
886 |
-
else:
|
887 |
-
output.append("Other")
|
888 |
-
elif female == 1 and male ==0:
|
889 |
-
if condition == "MF":
|
890 |
-
output.append("Object")
|
891 |
-
elif condition == "FM":
|
892 |
-
output.append("Subject")
|
893 |
else:
|
894 |
output.append("Other")
|
895 |
|
896 |
-
'''Exp10'''
|
897 |
-
elif summaries_df["Experiment"][i] == "E10":
|
898 |
-
# rs = summaries_df["Response"][i].strip()
|
899 |
-
if rs == "yes":
|
900 |
-
output.append("1")
|
901 |
-
else:
|
902 |
-
output.append("0")
|
903 |
-
else:
|
904 |
-
print("can;t find the Exp:", summaries_df["Experiment"][i])
|
905 |
-
output.append("NA")
|
906 |
-
# print(output)
|
907 |
-
# exit()
|
908 |
-
'''human'''
|
909 |
-
self.data = pd.DataFrame(list(zip(summaries_df["Experiment"], summaries_df["Question_ID"], summaries_df["Item"], summaries_df["Response"], summaries_df["Factor 2"], summaries_df["Stimuli 1"], summaries_df["Coding"], output)),
|
910 |
-
columns=["Experiment", "Question_ID", "Item", "Response", "Factor 2", "Simulate 1","Original_Coding","Coding"])
|
911 |
-
# '''LLM'''
|
912 |
-
# self.data = pd.DataFrame(list(zip(summaries_df["Experiment"], summaries_df["Question_ID"], summaries_df["Item"], summaries_df["Response"], summaries_df["Factor 2"], summaries_df["Stimuli 1"], output)),
|
913 |
-
# columns=["Experiment", "Question_ID", "Item", "Response", "Factor 2", "Simulate 1","Coding"])
|
914 |
-
print(self.data.head())
|
915 |
-
|
916 |
-
return self.data
|
917 |
-
def code_results_llm(self, summaries_df):
|
918 |
-
'''code results from LLM's response'''
|
919 |
-
output = []
|
920 |
-
'''database for Exp4'''
|
921 |
-
item4 = pd.read_csv(envs.ITEM_4_DATA)
|
922 |
-
wordpair2code = {}
|
923 |
-
for j in range(len(item4['Coding'])):
|
924 |
-
wordpair2code[item4['Pair'][j]] = item4['Coding'][j]
|
925 |
-
'''verb for Exp5'''
|
926 |
-
item5 = pd.read_csv(envs.ITEM_5_DATA)
|
927 |
-
# item corresponding to verb, same item id corresponding to verb pair
|
928 |
-
item2verb2 = {}
|
929 |
-
item2verb1 = {}
|
930 |
-
|
931 |
-
Stimuli1, Stimuli2 = {}, {}
|
932 |
-
for j in range(len(item5['Item'])):
|
933 |
-
item2verb1[item5['Item'][j]] = item5['Verb1'][j]
|
934 |
-
item2verb2[item5['Item'][j]] = item5['Verb2'][j]
|
935 |
-
Stimuli1[item5['ID'][j]] = item5['Stimuli-1'][j]
|
936 |
-
Stimuli2[item5['ID'][j]] = item5['Stimuli-2'][j]
|
937 |
-
|
938 |
-
|
939 |
-
male_keyword = ["he", "his", "himself"]
|
940 |
-
female_keyword = ["she", "her", "herself"]
|
941 |
-
print(len(summaries_df["Experiment"]))
|
942 |
-
for i in range(len(summaries_df["Experiment"])):
|
943 |
-
# vote_1_1, vote_1_2, vote_1_3 = 0, 0, 0
|
944 |
-
# print()
|
945 |
-
# data cleaning
|
946 |
-
if pd.isna(summaries_df["Response"][i]):
|
947 |
-
output.append("Other")
|
948 |
-
continue
|
949 |
-
rs = summaries_df["Response"][i].strip().lower()
|
950 |
-
sentences = rs.split('\n')
|
951 |
-
sentences = [sentence.split(':', 1)[-1].strip() if ':' in sentence else sentence
|
952 |
-
for sentence in sentences]
|
953 |
-
rs = [sentence.strip() for sentence in sentences if sentence.strip()]
|
954 |
-
rs = '\n'.join(rs)
|
955 |
-
rs = rs.replace('[', '').replace(']','').replace('.','')
|
956 |
-
'''Exp1'''
|
957 |
-
# the period and comma will affect the result
|
958 |
-
if summaries_df["Experiment"][i] == "E1":
|
959 |
-
print("E1", rs)
|
960 |
-
rs = rs.replace('"', '') # Remove any unnecessary quotation marks
|
961 |
-
rs_cleaned = rs.replace(',', '') # Remove periods and commas
|
962 |
-
|
963 |
-
# Use 'contains' instead of 'equals' for keyword matching to avoid issues caused by punctuation
|
964 |
-
if "round" in rs_cleaned:
|
965 |
-
output.append("Round")
|
966 |
-
elif "spiky" in rs_cleaned:
|
967 |
-
output.append("Spiky")
|
968 |
-
else:
|
969 |
-
output.append("Other")
|
970 |
-
|
971 |
-
|
972 |
-
'''Exp2'''
|
973 |
-
|
974 |
-
elif summaries_df["Experiment"][i] == "E2":
|
975 |
-
rs = rs.split(' ')
|
976 |
-
print("E2", rs)
|
977 |
-
male, female = 0, 0
|
978 |
-
for word in rs:
|
979 |
-
if word in female_keyword and male == 0:
|
980 |
-
female = 1
|
981 |
-
output.append("Female")
|
982 |
-
break
|
983 |
-
if word in male_keyword and female == 0:
|
984 |
-
male = 1
|
985 |
-
output.append("Male")
|
986 |
-
break
|
987 |
-
if male == 0 and female == 0 :
|
988 |
-
output.append("Other")
|
989 |
-
|
990 |
-
'''Exp3'''
|
991 |
-
elif summaries_df["Experiment"][i] == "E3":
|
992 |
-
# rs = summaries_df["Response"][i].strip()
|
993 |
-
print("E3", rs)
|
994 |
-
rs = rs.replace('"', '').lower().replace(".","")
|
995 |
-
pair = summaries_df["Factor 2"][i]
|
996 |
-
word1, word2 = pair.split('_')
|
997 |
-
|
998 |
-
if rs == word1:
|
999 |
-
if len(word1) > len(word2):
|
1000 |
-
output.append("Long")
|
1001 |
-
else:
|
1002 |
-
output.append("Short")
|
1003 |
-
elif rs == word2:
|
1004 |
-
if len(word1) > len(word2):
|
1005 |
-
output.append("Short")
|
1006 |
-
else:
|
1007 |
-
output.append("Long")
|
1008 |
-
else:
|
1009 |
-
output.append("Other")
|
1010 |
-
|
1011 |
'''Exp4'''
|
1012 |
|
1013 |
-
elif
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1014 |
try:
|
1015 |
meaning_word = rs.split(";")[4].replace(" ", '')
|
1016 |
except IndexError:
|
1017 |
-
|
1018 |
-
|
|
|
|
|
|
|
1019 |
except Exception as e:
|
1020 |
print(f"Unexpected error: {e}")
|
1021 |
output.append("Other")
|
1022 |
continue
|
1023 |
-
|
1024 |
-
|
1025 |
-
target = summaries_df["Factor 2"][i].strip().lower()
|
1026 |
pair = target + "_" + meaning_word
|
1027 |
-
print("E4:", pair)
|
1028 |
|
1029 |
if pair in wordpair2code.keys():
|
1030 |
output.append(wordpair2code[pair])
|
@@ -1032,31 +643,30 @@ class EvaluationModel:
|
|
1032 |
output.append("Other")
|
1033 |
|
1034 |
'''Exp5'''
|
1035 |
-
elif
|
1036 |
-
# sentence =
|
1037 |
-
item_id =
|
1038 |
-
question_id =
|
1039 |
|
1040 |
sti1, sti2 = "", ""
|
1041 |
|
1042 |
-
if
|
1043 |
sti1 = Stimuli1[question_id[0:-1]].lower().replace("...", "")
|
1044 |
-
sti2 = Stimuli2[question_id[0:-1]].lower().replace("...", "")
|
1045 |
verb = item2verb1[item_id].lower()
|
1046 |
|
1047 |
sentence = sti1 + " " + rs.replace(sti1, "")
|
1048 |
-
print("E5", verb, sentence)
|
1049 |
-
if
|
1050 |
-
sti1 = Stimuli1[question_id].lower().replace("...", "")
|
1051 |
-
|
1052 |
sti2 = Stimuli2[question_id].lower().replace("...", "")
|
1053 |
|
1054 |
verb = item2verb2[item_id].lower()
|
1055 |
-
sentence = sti2
|
1056 |
-
print("E5", verb, sentence)
|
1057 |
|
1058 |
-
|
1059 |
-
doc = nlp1(sentence.replace(" "," "))
|
1060 |
# print(doc)
|
1061 |
# print()
|
1062 |
verb_token = None
|
@@ -1066,102 +676,94 @@ class EvaluationModel:
|
|
1066 |
verb_token = token
|
1067 |
break
|
1068 |
# exit()
|
1069 |
-
|
1070 |
-
|
1071 |
-
|
1072 |
-
|
1073 |
-
pobj, dative = None, None
|
1074 |
-
# print(verb_token.children)
|
1075 |
-
# exit()
|
1076 |
for child in verb_token.children:
|
1077 |
-
print(child)
|
1078 |
-
if (child.dep_ == 'dative' and child.pos_ == "ADP") or (
|
|
|
1079 |
pobj = child.text
|
1080 |
if child.dep_ == 'dative':
|
1081 |
dative = child.text
|
1082 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1083 |
# exit()
|
|
|
|
|
1084 |
|
1085 |
-
if pobj:
|
1086 |
-
output.append("PO")
|
1087 |
-
elif dative:
|
1088 |
-
output.append("DO")
|
1089 |
-
else:
|
1090 |
-
print("Other", sentence, pobj, dative)
|
1091 |
-
# exit()
|
1092 |
-
output.append("Other")
|
1093 |
|
1094 |
'''Exp6'''
|
1095 |
|
1096 |
-
elif
|
1097 |
-
sentence =
|
1098 |
-
print("E6", sentence)
|
1099 |
doc = nlp1(sentence)
|
1100 |
subject = "None"
|
1101 |
obj = "None"
|
|
|
|
|
1102 |
for token in doc:
|
1103 |
if token.dep_ == "nsubj":
|
1104 |
subject = token.text
|
1105 |
elif token.dep_ == "dobj":
|
1106 |
obj = token.text
|
1107 |
-
print("E6", subject, obj)
|
1108 |
if subject in rs and obj in rs:
|
1109 |
-
print(rs, subject, obj, "Other")
|
1110 |
output.append("Other")
|
1111 |
elif subject in rs:
|
1112 |
-
print(rs, subject, obj, "VP")
|
1113 |
output.append("VP")
|
1114 |
elif obj in rs:
|
1115 |
-
print(rs, subject, obj, "NP")
|
1116 |
output.append("NP")
|
1117 |
else:
|
1118 |
-
print(rs, subject, obj, "Other")
|
1119 |
output.append("Other")
|
1120 |
|
1121 |
-
|
1122 |
-
|
1123 |
-
|
1124 |
'''Exp7'''
|
1125 |
-
elif
|
1126 |
-
|
1127 |
rs = rs.replace(".", "").replace(",", "").lower()
|
1128 |
-
print("E7", rs)
|
1129 |
-
|
1130 |
-
|
1131 |
-
|
1132 |
-
|
1133 |
-
|
1134 |
-
|
1135 |
-
|
1136 |
-
output.append("0")
|
1137 |
-
found = True
|
1138 |
-
break
|
1139 |
-
elif word == "yes":
|
1140 |
-
output.append("1")
|
1141 |
-
found = True
|
1142 |
-
break
|
1143 |
-
if not found:
|
1144 |
output.append("Other")
|
1145 |
|
1146 |
'''Exp8'''
|
1147 |
-
elif
|
1148 |
-
# rs =
|
1149 |
-
print("E8",rs)
|
1150 |
if "something is wrong with the question" in rs:
|
1151 |
output.append("1")
|
1152 |
else:
|
1153 |
output.append("0")
|
1154 |
|
1155 |
'''Exp9'''
|
1156 |
-
elif
|
1157 |
male, female = 0, 0
|
1158 |
|
1159 |
-
# rs =
|
1160 |
if "because" in rs:
|
1161 |
-
rs = rs.replace("because because","because").split("because")[1]
|
1162 |
else:
|
1163 |
rs = rs
|
1164 |
-
condition =
|
1165 |
rs = rs.split(" ")
|
1166 |
for w in rs:
|
1167 |
if w in male_keyword and female != 1:
|
@@ -1170,18 +772,18 @@ class EvaluationModel:
|
|
1170 |
if w in female_keyword and male != 1:
|
1171 |
female = 1
|
1172 |
break
|
1173 |
-
print("E9", "condition", condition, "male", male, "female", female)
|
1174 |
-
if
|
1175 |
output.append('Other')
|
1176 |
else:
|
1177 |
-
if male == 1 and female==0:
|
1178 |
if condition == "MF":
|
1179 |
output.append("Subject")
|
1180 |
elif condition == "FM":
|
1181 |
output.append("Object")
|
1182 |
else:
|
1183 |
output.append("Other")
|
1184 |
-
elif female == 1 and male ==0:
|
1185 |
if condition == "MF":
|
1186 |
output.append("Object")
|
1187 |
elif condition == "FM":
|
@@ -1190,28 +792,28 @@ class EvaluationModel:
|
|
1190 |
output.append("Other")
|
1191 |
|
1192 |
'''Exp10'''
|
1193 |
-
elif
|
1194 |
-
#
|
1195 |
-
rs = rs.replace(".", "")
|
1196 |
-
|
1197 |
-
|
1198 |
-
# Check if the response contains "yes"
|
1199 |
-
if "yes" in rs:
|
1200 |
output.append("1")
|
1201 |
else:
|
1202 |
output.append("0")
|
1203 |
else:
|
1204 |
-
print("can
|
1205 |
output.append("NA")
|
1206 |
# print(output)
|
1207 |
# exit()
|
1208 |
'''human'''
|
1209 |
-
# self.data = pd.DataFrame(list(zip(
|
1210 |
# columns=["Experiment", "Question_ID", "Item", "Response", "Factor 2", "Simulate 1","Original_Coding","Coding"])
|
1211 |
'''LLM'''
|
1212 |
-
|
1213 |
-
|
1214 |
-
|
|
|
|
|
|
|
1215 |
|
1216 |
return self.data
|
1217 |
|
@@ -1332,55 +934,8 @@ class EvaluationModel:
|
|
1332 |
|
1333 |
return all_results
|
1334 |
|
1335 |
-
|
1336 |
-
|
1337 |
-
# # Extract the relevant columns for JS divergence calculation
|
1338 |
-
# human_responses = human_df[['Question_ID', 'Coding']]
|
1339 |
-
# llm_responses = llm_df[['Question_ID', 'Coding']]
|
1340 |
-
#
|
1341 |
-
# # Get unique Question_IDs present in both datasets
|
1342 |
-
# common_question_ids = set(human_responses['Question_ID']).intersection(set(llm_responses['Question_ID']))
|
1343 |
-
#
|
1344 |
-
# # Initialize a list to store JS divergence for each Question_ID
|
1345 |
-
# js_divergence_list = []
|
1346 |
-
# js_divergence ={}
|
1347 |
-
#
|
1348 |
-
# # Calculate JS divergence for each common Question_ID
|
1349 |
-
# for q_id in common_question_ids:
|
1350 |
-
# # Get response distributions for the current Question_ID in both datasets
|
1351 |
-
# human_dist = human_responses[human_responses['Question_ID'] == q_id]['Coding'].value_counts(normalize=True)
|
1352 |
-
# llm_dist = llm_responses[llm_responses['Question_ID'] == q_id]['Coding'].value_counts(normalize=True)
|
1353 |
-
#
|
1354 |
-
# # Reindex the distributions to have the same index, filling missing values with 0
|
1355 |
-
# all_responses = set(human_dist.index).union(set(llm_dist.index))
|
1356 |
-
# human_dist = human_dist.reindex(all_responses, fill_value=0)
|
1357 |
-
# llm_dist = llm_dist.reindex(all_responses, fill_value=0)
|
1358 |
-
#
|
1359 |
-
# # Calculate JS divergence and add to the list
|
1360 |
-
# js_div = jensenshannon(human_dist, llm_dist, base=2)
|
1361 |
-
# experiment_id = q_id.split('_')[1]
|
1362 |
-
# if experiment_id not in js_divergence:
|
1363 |
-
# js_divergence[experiment_id] = []
|
1364 |
-
# js_divergence[experiment_id].append(js_div)
|
1365 |
-
#
|
1366 |
-
# js_divergence_list.append(js_div)
|
1367 |
-
# #js_divergence[q_id] = js_div
|
1368 |
-
#
|
1369 |
-
#
|
1370 |
-
#
|
1371 |
-
# # Calculate the average JS divergence
|
1372 |
-
# # JS per experiment
|
1373 |
-
# avg_js_divergence_per_experiment = {exp: 1- np.nanmean(divs) for exp, divs in js_divergence.items()}
|
1374 |
-
# print(avg_js_divergence_per_experiment)
|
1375 |
-
#
|
1376 |
-
# # JS overall
|
1377 |
-
# avg_js_divergence = 1 - np.nanmean(js_divergence_list)
|
1378 |
-
# print("avg_js_divergence:", avg_js_divergence)
|
1379 |
-
#
|
1380 |
-
# return avg_js_divergence
|
1381 |
-
|
1382 |
-
|
1383 |
-
def evaluate_humanlike(self, summaries_df: object, human_data_path: object, result_save_path: object) -> object:
|
1384 |
'''
|
1385 |
evaluate humanlike score
|
1386 |
1. code the result
|
@@ -1401,7 +956,7 @@ class EvaluationModel:
|
|
1401 |
|
1402 |
'''coding llm data'''
|
1403 |
save_path = result_save_path.replace('.csv','_coding.csv')
|
1404 |
-
self.llm_df = self.code_results_llm(
|
1405 |
|
1406 |
|
1407 |
|
@@ -1412,7 +967,7 @@ class EvaluationModel:
|
|
1412 |
self.llm_df.to_csv(fpath)
|
1413 |
|
1414 |
envs.API.upload_file(
|
1415 |
-
path_or_fileobj=
|
1416 |
path_in_repo=f"{save_path.replace('generation_results/','')}",#
|
1417 |
repo_id=envs.RESULTS_REPO,
|
1418 |
repo_type="dataset",
|
@@ -1426,111 +981,3 @@ class EvaluationModel:
|
|
1426 |
|
1427 |
|
1428 |
|
1429 |
-
|
1430 |
-
|
1431 |
-
|
1432 |
-
|
1433 |
-
|
1434 |
-
|
1435 |
-
|
1436 |
-
|
1437 |
-
|
1438 |
-
|
1439 |
-
|
1440 |
-
|
1441 |
-
|
1442 |
-
|
1443 |
-
|
1444 |
-
def evaluate_hallucination(self, summaries_df):
|
1445 |
-
"""
|
1446 |
-
Evaluate the hallucination rate in summaries. Updates the 'scores' attribute
|
1447 |
-
of the instance with the computed scores.
|
1448 |
-
|
1449 |
-
Args:
|
1450 |
-
summaries_df (DataFrame): DataFrame containing source docs and summaries.
|
1451 |
-
|
1452 |
-
Returns:
|
1453 |
-
list: List of hallucination scores. Also updates the 'scores' attribute of the instance.
|
1454 |
-
"""
|
1455 |
-
hem_scores = []
|
1456 |
-
sources = []
|
1457 |
-
summaries = []
|
1458 |
-
source_summary_pairs = util.create_pairs(summaries_df)
|
1459 |
-
'''Evaluate the model's results'''
|
1460 |
-
for doc, summary in tqdm(source_summary_pairs, desc="Evaluating Humanlikeness"):
|
1461 |
-
if util.is_summary_valid(summary):
|
1462 |
-
try:
|
1463 |
-
summary = summary.replace('<bos>','').replace('<eos>','')
|
1464 |
-
score = self.model.predict([doc, summary])# [0]
|
1465 |
-
if not isinstance(score, float):
|
1466 |
-
try:
|
1467 |
-
score = score.item()
|
1468 |
-
except:
|
1469 |
-
logging.warning(f"Score type mismatch: Expected float, got {type(score)}.")
|
1470 |
-
continue
|
1471 |
-
hem_scores.append(score)
|
1472 |
-
sources.append(doc)
|
1473 |
-
summaries.append(summary)
|
1474 |
-
except Exception as e:
|
1475 |
-
logging.error(f"Error while running HEM: {e}")
|
1476 |
-
raise
|
1477 |
-
|
1478 |
-
self.scores = hem_scores
|
1479 |
-
eval_results = {'source': sources, 'summary': summaries, 'HEM scores': hem_scores}
|
1480 |
-
return hem_scores, eval_results
|
1481 |
-
# for doc, summary in tqdm(source_summary_pairs, desc="Evaluating hallucinations"):
|
1482 |
-
# if util.is_summary_valid(summary):
|
1483 |
-
# try:
|
1484 |
-
# # summary_pieces = summary.split('\n')
|
1485 |
-
# # summary = summary_pieces[0] if len(summary_pieces[0].strip()) > 0 else summary_pieces[1]
|
1486 |
-
# summary = summary.replace('<bos>','').replace('<eos>','')
|
1487 |
-
# # print([doc, summary])
|
1488 |
-
# # print(self.model.predict([doc, summary]))
|
1489 |
-
# score = self.model.predict([doc, summary])# [0]
|
1490 |
-
# if not isinstance(score, float):
|
1491 |
-
# try:
|
1492 |
-
# score = score.item()
|
1493 |
-
# except:
|
1494 |
-
# logging.warning(f"Score type mismatch: Expected float, got {type(score)}.")
|
1495 |
-
# continue
|
1496 |
-
# hem_scores.append(score)
|
1497 |
-
# sources.append(doc)
|
1498 |
-
# summaries.append(summary)
|
1499 |
-
# except Exception as e:
|
1500 |
-
# logging.error(f"Error while running HEM: {e}")
|
1501 |
-
# raise
|
1502 |
-
|
1503 |
-
# self.scores = hem_scores
|
1504 |
-
# eval_results = {'source': sources, 'summary': summaries, 'HEM scores': hem_scores}
|
1505 |
-
# return hem_scores, eval_results
|
1506 |
-
|
1507 |
-
|
1508 |
-
def compute_factual_consistency_rate(self, threshold=0.5):
|
1509 |
-
"""
|
1510 |
-
Compute the factual consistency rate of the evaluated summaries based on
|
1511 |
-
the previously calculated scores. This method relies on the 'scores'
|
1512 |
-
attribute being populated, typically via the 'evaluate_hallucination' method.
|
1513 |
-
|
1514 |
-
Returns:
|
1515 |
-
float: Factual Consistency Rate. Also updates the 'factual_consistency_rate'
|
1516 |
-
and 'hallucination_rate' attributes of the instance.
|
1517 |
-
|
1518 |
-
Raises:
|
1519 |
-
ValueError: If scores have not been calculated prior to calling this method.
|
1520 |
-
"""
|
1521 |
-
if not self.scores:
|
1522 |
-
error_msg = "Scores not calculated. Call evaluate_hallucination() first."
|
1523 |
-
logging.error(error_msg)
|
1524 |
-
raise ValueError(error_msg)
|
1525 |
-
|
1526 |
-
# Use threshold of 0.5 to compute factual_consistency_rate
|
1527 |
-
num_above_threshold = sum(score >= threshold for score in self.scores)
|
1528 |
-
num_total = len(self.scores)
|
1529 |
-
|
1530 |
-
if not num_total:
|
1531 |
-
raise ValueError("No scores available to compute factual consistency rate.")
|
1532 |
-
|
1533 |
-
self.factual_consistency_rate = (num_above_threshold / num_total) * 100
|
1534 |
-
self.hallucination_rate = 100 - self.factual_consistency_rate
|
1535 |
-
|
1536 |
-
return self.factual_consistency_rate
|
|
|
6 |
import requests
|
7 |
import json
|
8 |
|
9 |
+
# import numpy as np
|
10 |
import pandas as pd
|
11 |
import spacy
|
12 |
from sentence_transformers import CrossEncoder
|
|
|
43 |
# Load spacy model for word tokenization
|
44 |
# nlp = spacy.load("en_core_web_sm")
|
45 |
try:
|
46 |
+
nlp1 = spacy.load("en_core_web_trf")
|
47 |
except OSError:
|
48 |
print("无法加载模型,继续执行其他处理。")
|
49 |
|
|
|
55 |
|
56 |
|
57 |
|
58 |
class ModelLoadingException(Exception):
|
59 |
"""Exception raised for errors in loading a model.
|
60 |
|
|
|
69 |
super().__init__(f"{messages} id={model_id} revision={revision}")
|
70 |
|
71 |
|
72 |
+
class ResponseGenerator:
|
73 |
+
"""A class to generate responses using a causal language model.
|
74 |
|
75 |
Attributes:
|
76 |
model (str): huggingface/{model_id}
|
77 |
api_base (str): https://api-inference.huggingface.co/models/{model_id}
|
78 |
+
responses_df (DataFrame): DataFrame to store generated responses.
|
79 |
revision (str): Model revision.
|
80 |
+
avg_length (float): Average length of responses.
|
81 |
+
answer_rate (float): Rate of non-empty responses.
|
82 |
"""
|
83 |
|
84 |
def __init__(self, model_id, revision):
|
85 |
"""
|
86 |
+
Initializes the ResponseGenerator with a model.
|
87 |
|
88 |
Args:
|
89 |
model_id (str): Identifier for the model.
|
|
|
92 |
self.model_id = model_id
|
93 |
self.model = f"huggingface/{model_id}"
|
94 |
self.api_base = f"https://api-inference.huggingface.co/models/{model_id}"
|
95 |
+
self.responses_df = pd.DataFrame()
|
96 |
self.revision = revision
|
97 |
self.avg_length = None
|
98 |
self.answer_rate = None
|
99 |
self.exceptions = None
|
100 |
self.local_model = None
|
101 |
|
102 |
+
def generate_response(self, dataset, df_prompt, save_path=None):
|
103 |
+
"""Generate responses for a given DataFrame of source docs.
|
|
|
104 |
Args:
|
105 |
dataset (DataFrame): DataFrame containing source docs.
|
106 |
|
107 |
Returns:
|
108 |
+
responses_df (DataFrame): Generated responses by the model.
|
109 |
"""
|
110 |
exceptions = []
|
111 |
if (save_path is not None) and os.path.exists(save_path):
|
112 |
'''If the file already exists, read the previously generated test responses'''
|
113 |
+
self.responses_df = pd.read_csv(save_path)
|
114 |
+
# print(self.responses_df['Experiment'])
|
115 |
|
116 |
+
print(f'Loaded generated responses from {save_path}')
|
117 |
else:
|
118 |
'''If the test file does not exist, call the specified model to generate it'''
|
119 |
# prompt = {}
|
|
|
176 |
while True:
|
177 |
try:
|
178 |
'''Call the model'''
|
179 |
+
print(self.model_id.lower(),'-',ID,'-',j,'-',ii)
|
180 |
|
181 |
+
_response = self.send_request(system_prompt, _user_prompt)
|
182 |
# print(f"Finish index {index}")
|
183 |
break
|
184 |
except Exception as e:
|
|
|
204 |
print(f"Error at index {i}: {e}")
|
205 |
time.sleep(wait_time)
|
206 |
try:
|
207 |
+
_response = self.send_request(system_prompt, _user_prompt)
|
208 |
break
|
209 |
except Exception as ee:
|
210 |
exceptions.append(ee)
|
|
|
219 |
break
|
220 |
if i == 5:
|
221 |
#print(_response)
|
222 |
+
# For E5, the responses might be in the following formats:
|
223 |
+
# "Sure\n\nThe first sentence of the response\n\nThe second sentence of the response"
|
224 |
+
# "The first sentence of the response\n\nThe second sentence of the response"
|
225 |
+
# "XXX: The first sentence of the response\n\nXXX: The second sentence of the response"
|
226 |
+
# "Sure\n\nXXX: The first sentence of the response\n\nXXX: The second sentence of the response"
|
227 |
+
# "Sure\n\nThe first sentence of the response\n\nThe second sentence of the response\n\n"
|
228 |
|
229 |
def extract_responses(text, trigger_words=None):
|
230 |
if trigger_words is None:
|
231 |
trigger_words = ["sure", "okay", "yes"]
|
232 |
|
233 |
try:
|
234 |
+
# Split the text into sentences
|
235 |
sentences = text.split('\n')
|
236 |
+
# Remove empty sentences
|
237 |
sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
|
238 |
+
# Remove the first sentence if it has a : in it,
|
239 |
sentences = [sentence.split(':', 1)[-1].strip() if ':' in sentence else sentence for
|
240 |
sentence in sentences]
|
241 |
+
# Remove empty sentences
|
242 |
+
sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
|
243 |
+
# Remove the first sentence if it is a trigger word
|
244 |
+
if any(sentences[0].lower().startswith(word) for word in trigger_words) and len(
|
245 |
+
sentences) > 2:
|
246 |
_response1 = sentences[1].strip() if len(sentences) > 1 else None
|
247 |
_response2 = sentences[2].strip() if len(sentences) > 2 else None
|
248 |
else:
|
249 |
_response1 = sentences[0].strip() if len(sentences) > 0 else None
|
250 |
_response2 = sentences[1].strip() if len(sentences) > 1 else None
|
251 |
|
252 |
+
|
253 |
except Exception as e:
|
254 |
print(f"Error occurred: {e}")
|
255 |
_response1, _response2 = None, None
|
256 |
|
257 |
+
print(_response1), print(_response2)
|
258 |
|
259 |
return _response1, _response2
|
260 |
|
261 |
_response1, _response2 = extract_responses(_response)
|
262 |
|
263 |
Experiment_ID.append(ID)
|
264 |
Questions_ID.append(q_column[j])
|
|
|
293 |
Stimuli_1.append(Stimuli_1_column[j])
|
294 |
Item_ID.append(Item_column[j])
|
295 |
Condition.append(Condition_column[j])
|
296 |
|
297 |
# Sleep to prevent hitting rate limits too frequently
|
298 |
time.sleep(1)
|
299 |
|
300 |
+
self.responses_df = pd.DataFrame(list(zip(Experiment_ID, Questions_ID, Item_ID, Condition, User_prompt, Response, Factor_2, Stimuli_1)),
|
301 |
+
columns=["Experiment", "Question_ID", "Item", "Condition", "User_prompt", "Response","Factor 2","Stimuli 1"])
|
302 |
|
303 |
if save_path is not None:
|
304 |
+
print(f'Save responses to {save_path}')
|
305 |
fpath = Path(save_path)
|
306 |
fpath.parent.mkdir(parents=True, exist_ok=True)
|
307 |
+
self.responses_df.to_csv(fpath)
|
308 |
|
309 |
self.exceptions = exceptions
|
310 |
# self._compute_avg_length()
|
311 |
# self._compute_answer_rate()
|
312 |
|
313 |
+
return self.responses_df
|
314 |
|
315 |
+
def send_request(self, system_prompt: str, user_prompt: str):
|
316 |
# Using Together AI API
|
317 |
using_together_api = False
|
318 |
together_ai_api_models = ['mixtral', 'dbrx', 'wizardlm']
|
|
|
334 |
"model": self.model_id,
|
335 |
# "max_tokens": 4096,
|
336 |
'max_new_tokens': 100,
|
337 |
+
# "a": 0.0,
|
338 |
# 'repetition_penalty': 1.1 if 'mixtral' in self.model_id.lower() else 1
|
339 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
340 |
payload['messages'] = [{"role": "system", "content": system_prompt},
|
341 |
{"role": "user", "content": user_prompt}]
|
342 |
headers = {
|
|
|
434 |
continue
|
435 |
|
436 |
raise Exception("All tokens failed.")
|
437 |
+
|
438 |
elif 'gemini' in self.model_id.lower():
|
439 |
genai.configure(api_key=os.getenv('GOOGLE_AI_API_KEY'))
|
440 |
generation_config = {
|
441 |
+
# "temperature": 0,
|
442 |
+
# "top_p": 0.95, # cannot change
|
443 |
+
# "top_k": 0,
|
444 |
"max_output_tokens": 100,
|
445 |
# "response_mime_type": "application/json",
|
446 |
}
|
|
|
492 |
# Using local model
|
493 |
|
494 |
|
495 |
|
496 |
|
497 |
class EvaluationModel:
|
498 |
+
"""A class to evaluate generated responses.
|
499 |
|
500 |
Attributes:
|
501 |
model (CrossEncoder): The evaluation model.
|
502 |
+
scores (list): List of scores for the responses.
|
503 |
+
humanlike_score (float): Human-likeness score
|
504 |
+
|
505 |
+
|
506 |
"""
|
507 |
|
508 |
def __init__(self, model_path):
|
509 |
"""
|
510 |
+
Initializes the EvaluationModel.
|
|
|
|
|
|
|
511 |
"""
|
512 |
+
# self.model = load_evaluation_model(model_path)
|
513 |
self.scores = []
|
|
|
|
|
514 |
self.humanlike_score = None
|
515 |
|
516 |
+
def code_results_llm(self, responses_df):
|
517 |
'''code results from LLM's response'''
|
518 |
output = []
|
519 |
'''database for Exp4'''
|
|
|
534 |
Stimuli1[item5['ID'][j]] = item5['Stimuli-1'][j]
|
535 |
Stimuli2[item5['ID'][j]] = item5['Stimuli-2'][j]
|
536 |
|
|
|
537 |
male_keyword = ["he", "his", "himself"]
|
538 |
female_keyword = ["she", "her", "herself"]
|
539 |
+
#print(len(responses_df["Experiment"]))
|
540 |
+
for i in range(len(responses_df["Experiment"])):
|
541 |
+
print(i, "/", len(responses_df["Experiment"]))
|
542 |
# vote_1_1, vote_1_2, vote_1_3 = 0, 0, 0
|
543 |
# print()
|
544 |
+
if pd.isna(responses_df["Response"][i]):
|
545 |
output.append("Other")
|
546 |
continue
|
547 |
+
rs = responses_df["Response"][i].strip().lower()
|
548 |
+
rs = rs.replace('"', '').replace(" ", " ").replace('.', '')
|
549 |
+
lines = rs.split("\n")
|
550 |
+
filtered_lines = [line for line in lines if line and not (line.endswith(":") or line.endswith(":"))]
|
551 |
+
filtered_lines = [r.split(':', 1)[-1].strip() if ':' in r else r for
|
552 |
+
r in filtered_lines]
|
553 |
+
rs = "\n".join(filtered_lines)
|
554 |
+
rs = rs.strip()
|
555 |
'''Exp1'''
|
556 |
+
if responses_df["Experiment"][i] == "E1":
|
557 |
+
#print("E1", rs)
|
|
|
|
|
558 |
if rs == "round":
|
559 |
# vote_1_1 += 1
|
560 |
output.append("Round")
|
|
|
563 |
else:
|
564 |
output.append("Other")
|
565 |
|
|
|
566 |
'''Exp2'''
|
567 |
+
|
568 |
+
elif responses_df["Experiment"][i] == "E2":
|
569 |
+
# rs = responses_df["Response"][i].strip()
|
570 |
rs = rs.split(' ')
|
571 |
+
#print("E2", rs)
|
572 |
male, female = 0, 0
|
573 |
for word in rs:
|
574 |
if word in female_keyword and male == 0:
|
|
|
579 |
male = 1
|
580 |
output.append("Male")
|
581 |
break
|
582 |
+
if male == 0 and female == 0:
|
583 |
output.append("Other")
|
584 |
|
585 |
'''Exp3'''
|
586 |
+
elif responses_df["Experiment"][i] == "E3":
|
587 |
+
# rs = responses_df["Response"][i].strip()
|
588 |
+
#print("E3", rs)
|
589 |
+
pair = responses_df["Factor 2"][i]
|
590 |
+
word1, word2 = pair.replace(".", "").split('_')
|
591 |
|
592 |
+
if responses_df["Item"][i] == 12:
|
593 |
output.append("Other")
|
|
|
594 |
else:
|
595 |
+
words = rs.split() # split the response into words
|
596 |
+
output = []
|
597 |
+
if any(word == word1 for word in words) and any(word == word2 for word in words):
|
598 |
output.append("Other")
|
599 |
+
else:
|
600 |
+
if any(word.lower() == word1.lower() for word in words):
|
601 |
+
if len(word1) > len(word2):
|
602 |
+
output.append("Long")
|
603 |
+
else:
|
604 |
+
output.append("Short")
|
605 |
+
elif any(word.lower() == word2.lower() for word in words):
|
606 |
+
if len(word1) > len(word2):
|
607 |
+
output.append("Short")
|
608 |
+
else:
|
609 |
+
output.append("Long")
|
610 |
else:
|
611 |
output.append("Other")
|
612 |
|
613 |
'''Exp4'''
|
614 |
|
615 |
+
elif responses_df["Experiment"][i] == "E4":
|
616 |
+
filtered_lines = [r.split(':', 1)[-1].strip() if ':' in r else r for r in rs.split("\n")]
|
617 |
+
filtered_lines = [r.split('-', 1)[-1].strip() if '-' in r else r for r in filtered_lines]
|
618 |
+
rs = "\n".join(filtered_lines)
|
619 |
+
|
620 |
+
filtered_lines = [r.split(':', 1)[-1].strip() if ':' in r else r for r in rs.split(";")]
|
621 |
+
filtered_lines = [r.split('-', 1)[-1].strip() if '-' in r else r for r in filtered_lines]
|
622 |
+
rs = ";".join(filtered_lines).strip()
|
623 |
try:
|
624 |
meaning_word = rs.split(";")[4].replace(" ", '')
|
625 |
except IndexError:
|
626 |
+
try:
|
627 |
+
meaning_word = rs.split("\n")[4].replace(" ", '')
|
628 |
+
except IndexError:
|
629 |
+
output.append("Other")
|
630 |
+
continue
|
631 |
except Exception as e:
|
632 |
print(f"Unexpected error: {e}")
|
633 |
output.append("Other")
|
634 |
continue
|
635 |
+
|
636 |
+
target = responses_df["Factor 2"][i].strip().lower()
|
|
|
637 |
pair = target + "_" + meaning_word
|
638 |
+
#print("E4:", pair)
|
639 |
|
640 |
if pair in wordpair2code.keys():
|
641 |
output.append(wordpair2code[pair])
|
|
|
643 |
output.append("Other")
|
644 |
|
645 |
'''Exp5'''
|
646 |
+
elif responses_df["Experiment"][i] == "E5" or responses_df["Experiment"][i] == "E51":
|
647 |
+
# sentence = responses_df["Response"][i].strip()
|
648 |
+
item_id = responses_df["Item"][i]
|
649 |
+
question_id = responses_df["Question_ID"][i]
|
650 |
|
651 |
sti1, sti2 = "", ""
|
652 |
|
653 |
+
if responses_df["Experiment"][i] == "E51":
|
654 |
sti1 = Stimuli1[question_id[0:-1]].lower().replace("...", "")
|
655 |
+
#sti2 = Stimuli2[question_id[0:-1]].lower().replace("...", "")
|
656 |
verb = item2verb1[item_id].lower()
|
657 |
|
658 |
sentence = sti1 + " " + rs.replace(sti1, "")
|
659 |
+
#print("E5", verb, sentence)
|
660 |
+
if responses_df["Experiment"][i] == "E5":
|
661 |
+
#sti1 = Stimuli1[question_id].lower().replace("...", "")
|
662 |
+
# print(sti1)
|
663 |
sti2 = Stimuli2[question_id].lower().replace("...", "")
|
664 |
|
665 |
verb = item2verb2[item_id].lower()
|
666 |
+
sentence = sti2 + " " + rs.replace(sti2, "")
|
667 |
+
#print("E5", verb, sentence)
|
668 |
|
669 |
+
doc = nlp1(sentence.replace(" ", " "))
|
|
|
670 |
# print(doc)
|
671 |
# print()
|
672 |
verb_token = None
|
|
|
676 |
verb_token = token
|
677 |
break
|
678 |
# exit()
|
679 |
+
pobj, dative = None, None
|
680 |
+
# print(verb_token.children)
|
681 |
+
# exit()
|
682 |
+
if verb_token is not None:
|
|
683 |
for child in verb_token.children:
|
684 |
+
# print(child)
|
685 |
+
if (child.dep_ == 'dative' and child.pos_ == "ADP") or (
|
686 |
+
child.text == "to" and child.dep_ == 'prep' and child.pos_ == "ADP"):
|
687 |
pobj = child.text
|
688 |
if child.dep_ == 'dative':
|
689 |
dative = child.text
|
690 |
+
|
691 |
+
# print("E5", pobj, dative)
|
692 |
+
# exit()
|
693 |
+
|
694 |
+
if pobj:
|
695 |
+
output.append("PO")
|
696 |
+
elif dative:
|
697 |
+
output.append("DO")
|
698 |
+
else:
|
699 |
+
# print("Other", sentence, pobj, dative)
|
700 |
# exit()
|
701 |
+
output.append("Other")
|
702 |
+
|
703 |
|
|
|
704 |
|
705 |
'''Exp6'''
|
706 |
|
707 |
+
elif responses_df["Experiment"][i] == "E6":
|
708 |
+
sentence = responses_df["Stimuli 1"][i].strip().lower()
|
709 |
+
#print("E6", sentence)
|
710 |
doc = nlp1(sentence)
|
711 |
subject = "None"
|
712 |
obj = "None"
|
713 |
+
|
714 |
+
|
715 |
for token in doc:
|
716 |
if token.dep_ == "nsubj":
|
717 |
subject = token.text
|
718 |
elif token.dep_ == "dobj":
|
719 |
obj = token.text
|
720 |
+
#print("E6", subject, obj)
|
721 |
if subject in rs and obj in rs:
|
722 |
+
#print(rs, subject, obj, "Other")
|
723 |
output.append("Other")
|
724 |
elif subject in rs:
|
725 |
+
#print(rs, subject, obj, "VP")
|
726 |
output.append("VP")
|
727 |
elif obj in rs:
|
728 |
+
#print(rs, subject, obj, "NP")
|
729 |
output.append("NP")
|
730 |
else:
|
731 |
+
#print(rs, subject, obj, "Other")
|
732 |
output.append("Other")
|
733 |
|
|
734 |
'''Exp7'''
|
735 |
+
elif responses_df["Experiment"][i] == "E7":
|
736 |
+
# rs = responses_df["Response"][i].strip().lower()
|
737 |
rs = rs.replace(".", "").replace(",", "").lower()
|
738 |
+
#print("E7", rs)
|
739 |
+
if "yes" in rs and "no" in rs:
|
740 |
+
output.append("Other")
|
741 |
+
elif "no" in rs:
|
742 |
+
output.append("0")
|
743 |
+
elif "yes" in rs:
|
744 |
+
output.append("1")
|
745 |
+
else:
|
|
|
746 |
output.append("Other")
|
747 |
|
748 |
'''Exp8'''
|
749 |
+
elif responses_df["Experiment"][i] == "E8":
|
750 |
+
# rs = responses_df["Response"][i].strip()
|
751 |
+
#print("E8", rs)
|
752 |
if "something is wrong with the question" in rs:
|
753 |
output.append("1")
|
754 |
else:
|
755 |
output.append("0")
|
756 |
|
757 |
'''Exp9'''
|
758 |
+
elif responses_df["Experiment"][i] == "E9":
|
759 |
male, female = 0, 0
|
760 |
|
761 |
+
# rs = responses_df["Response"][i].strip()
|
762 |
if "because" in rs:
|
763 |
+
rs = rs.replace("because because", "because").split("because")[1]
|
764 |
else:
|
765 |
rs = rs
|
766 |
+
condition = responses_df["Factor 2"][i].strip()
|
767 |
rs = rs.split(" ")
|
768 |
for w in rs:
|
769 |
if w in male_keyword and female != 1:
|
|
|
772 |
if w in female_keyword and male != 1:
|
773 |
female = 1
|
774 |
break
|
775 |
+
#print("E9", "condition", condition, "male", male, "female", female)
|
776 |
+
if male == 0 and female == 0:
|
777 |
output.append('Other')
|
778 |
else:
|
779 |
+
if male == 1 and female == 0:
|
780 |
if condition == "MF":
|
781 |
output.append("Subject")
|
782 |
elif condition == "FM":
|
783 |
output.append("Object")
|
784 |
else:
|
785 |
output.append("Other")
|
786 |
+
elif female == 1 and male == 0:
|
787 |
if condition == "MF":
|
788 |
output.append("Object")
|
789 |
elif condition == "FM":
|
|
|
792 |
output.append("Other")
|
793 |
|
794 |
'''Exp10'''
|
795 |
+
elif responses_df["Experiment"][i] == "E10":
|
796 |
+
# rs = responses_df["Response"][i].strip()
|
797 |
+
rs = rs.replace(".", "")
|
798 |
+
if rs == "yes":
|
|
799 |
output.append("1")
|
800 |
else:
|
801 |
output.append("0")
|
802 |
else:
|
803 |
+
#print("can;t find the Exp:", responses_df["Experiment"][i])
|
804 |
output.append("NA")
|
805 |
# print(output)
|
806 |
# exit()
|
807 |
'''human'''
|
808 |
+
# self.data = pd.DataFrame(list(zip(responses_df["Experiment"], responses_df["Question_ID"], responses_df["Item"], responses_df["Response"], responses_df["Factor 2"], responses_df["Stimuli 1"], responses_df["Coding"], output)),
|
809 |
# columns=["Experiment", "Question_ID", "Item", "Response", "Factor 2", "Simulate 1","Original_Coding","Coding"])
|
810 |
'''LLM'''
|
811 |
+
# print(len(output))
|
812 |
+
self.data = pd.DataFrame(list(
|
813 |
+
zip(responses_df["Experiment"], responses_df["Question_ID"], responses_df["Item"], responses_df["Response"],
|
814 |
+
responses_df["Factor 2"], responses_df["Stimuli 1"], output)),
|
815 |
+
columns=["Experiment", "Question_ID", "Item", "Response", "Factor 2", "Stimuli 1",
|
816 |
+
"Coding"])
|
817 |
|
818 |
return self.data
|
819 |
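The E5/E51 branch above decides between a prepositional-object ("PO") and a double-object ("DO") continuation purely from spaCy's dependency labels on the children of the target verb. A standalone sketch of that check follows; it assumes the `en_core_web_sm` pipeline is installed, and the verb-matching condition is an illustrative stand-in for the lines collapsed in this diff, not the repository's exact code.

```python
# Illustrative only: mirrors the PO/DO decision used for E5 above.
# Assumes `python -m spacy download en_core_web_sm` has been run.
import spacy

nlp = spacy.load("en_core_web_sm")

def classify_dative(sentence: str, verb_lemma: str) -> str:
    doc = nlp(sentence)
    # The exact verb-matching rule is collapsed in the diff; matching on the lemma is a guess.
    verb_token = next((t for t in doc if t.lemma_ == verb_lemma), None)
    if verb_token is None:
        return "Other"
    pobj, dative = None, None
    for child in verb_token.children:
        if (child.dep_ == "dative" and child.pos_ == "ADP") or (
                child.text == "to" and child.dep_ == "prep" and child.pos_ == "ADP"):
            pobj = child.text
        if child.dep_ == "dative":
            dative = child.text
    if pobj:
        return "PO"   # prepositional object: "... gave the ball to the boy"
    if dative:
        return "DO"   # double object: "... gave the boy the ball"
    return "Other"

print(classify_dative("The teacher gave the ball to the boy.", "give"))  # typically "PO"
print(classify_dative("The teacher gave the boy the ball.", "give"))     # typically "DO"
```

Because `pobj` is tested first, a sentence whose to-phrase attaches to the verb is coded "PO" even though spaCy may also label that preposition `dative`.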
|
|
|
934 |
|
935 |
return all_results
|
936 |
|
937 |
+
|
938 |
+
def evaluate_humanlike(self, responses_df: pd.DataFrame, human_data_path: object, result_save_path: str) -> object:
|
|
|
939 |
'''
|
940 |
evaluate humanlike score
|
941 |
1. code the result
|
|
|
956 |
|
957 |
'''coding llm data'''
|
958 |
save_path = result_save_path.replace('.csv','_coding.csv')
|
959 |
+
self.llm_df = self.code_results_llm(responses_df)
|
960 |
|
961 |
|
962 |
|
|
|
967 |
self.llm_df.to_csv(fpath)
|
968 |
|
969 |
envs.API.upload_file(
|
970 |
+
path_or_fileobj=save_path,#./generation_results/meta-llama/Llama-2-13b-chat-hf_coding.csv
|
971 |
path_in_repo=f"{save_path.replace('generation_results/','')}",#
|
972 |
repo_id=envs.RESULTS_REPO,
|
973 |
repo_type="dataset",
|
|
|
981 |
|
982 |
|
983 |
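The hunk that turns these coded responses into a score is collapsed in this diff, but the docstring of `evaluate_humanlike` and the description in `src/display/about.py` both state that the per-stimulus score is 1 minus the Jensen-Shannon divergence between the human and model response distributions. A minimal sketch of that idea, with illustrative names and counts that are not taken from the repository:

```python
import numpy as np

def js_divergence(p, q, base=2.0, eps=1e-12):
    """Jensen-Shannon divergence between two discrete distributions (in [0, 1] for base 2)."""
    p = np.asarray(p, dtype=float) + eps
    q = np.asarray(q, dtype=float) + eps
    p, q = p / p.sum(), q / q.sum()
    m = 0.5 * (p + q)
    kl = lambda a, b: float(np.sum(a * np.log(a / b))) / np.log(base)
    return 0.5 * kl(p, m) + 0.5 * kl(q, m)

def humanlike_similarity(human_counts, model_counts):
    """1 - JSD, so 1.0 means the model's coded responses are distributed like the humans'."""
    return 1.0 - js_divergence(human_counts, model_counts)

# e.g. a stimulus where 70% of humans but 60% of model samples picked the same option
print(round(humanlike_similarity([70, 30], [60, 40]), 3))  # ~0.992
```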
|
|
|
src/backend/util.py
CHANGED
@@ -1,81 +1,3 @@
|
|
1 |
-
def is_summary_valid(summary: str) -> bool:
|
2 |
-
"""
|
3 |
-
Checks if the summary is valid.
|
4 |
-
|
5 |
-
A summary is valid if it is not empty and contains at least five words.
|
6 |
-
|
7 |
-
Args:
|
8 |
-
summary (str): The summary to check.
|
9 |
-
|
10 |
-
Returns:
|
11 |
-
bool: True if the summary is valid, False otherwise.
|
12 |
-
"""
|
13 |
-
if isinstance(summary, str):
|
14 |
-
words = summary.split()
|
15 |
-
if len(words) >= 5:
|
16 |
-
return True
|
17 |
-
# print(summary)
|
18 |
-
return False
|
19 |
-
|
20 |
-
|
21 |
-
def create_pairs(df):
|
22 |
-
"""
|
23 |
-
Creates pairs of source and summary from the dataframe.
|
24 |
-
|
25 |
-
Args:
|
26 |
-
df (DataFrame): The dataframe containing source and summary columns.
|
27 |
-
|
28 |
-
Returns:
|
29 |
-
list: A list of pairs [source, summary].
|
30 |
-
"""
|
31 |
-
pairs = []
|
32 |
-
for _, row in df.iterrows():
|
33 |
-
pairs.append([row['source'], row['summary']])
|
34 |
-
|
35 |
-
return pairs
|
36 |
-
|
37 |
-
|
38 |
-
# def format_results(model_name: str, revision: str, precision: str,
|
39 |
-
# factual_consistency_rate: float, hallucination_rate: float,
|
40 |
-
# answer_rate: float, avg_summary_len: float) -> dict:
|
41 |
-
# """
|
42 |
-
# Formats the evaluation results into a structured dictionary.
|
43 |
-
#
|
44 |
-
# Args:
|
45 |
-
# model_name (str): The name of the evaluated model.
|
46 |
-
# revision (str): The revision hash of the model.
|
47 |
-
# precision (str): The precision with which the evaluation was run.
|
48 |
-
# factual_consistency_rate (float): The factual consistency rate.
|
49 |
-
# hallucination_rate (float): The hallucination rate.
|
50 |
-
# answer_rate (float): The answer rate.
|
51 |
-
# avg_summary_len (float): The average summary length.
|
52 |
-
#
|
53 |
-
# Returns:
|
54 |
-
# dict: A dictionary containing the structured evaluation results.
|
55 |
-
# """
|
56 |
-
# results = {
|
57 |
-
# "config": {
|
58 |
-
# "model_dtype": precision, # Precision with which you ran the evaluation
|
59 |
-
# "model_name": model_name, # Name of the model
|
60 |
-
# "model_sha": revision # Hash of the model
|
61 |
-
# },
|
62 |
-
# "results": {
|
63 |
-
# "hallucination_rate": {
|
64 |
-
# "hallucination_rate": round(hallucination_rate,3)
|
65 |
-
# },
|
66 |
-
# "factual_consistency_rate": {
|
67 |
-
# "factual_consistency_rate": round(factual_consistency_rate,1)
|
68 |
-
# },
|
69 |
-
# "answer_rate": {
|
70 |
-
# "answer_rate": round(answer_rate*100,1)
|
71 |
-
# },
|
72 |
-
# "average_summary_length": {
|
73 |
-
# "average_summary_length": round(avg_summary_len,1)
|
74 |
-
# },
|
75 |
-
# }
|
76 |
-
# }
|
77 |
-
#
|
78 |
-
# return results
|
79 |
|
80 |
def format_results(model_name: str, revision: str, precision: str, overall_js: float, overall_ci: tuple, **experiment_scores) -> dict:
|
81 |
"""
|
@@ -97,7 +19,7 @@ def format_results(model_name: str, revision: str, precision: str, overall_js: f
|
|
97 |
"config": {
|
98 |
"model_dtype": precision, # Precision with which you ran the evaluation
|
99 |
"model_name": model_name, # Name of the model
|
100 |
-
"model_sha": revision # Hash of the model
|
101 |
},
|
102 |
"results": {
|
103 |
"overall_js_divergence": overall_js, # Overall JS divergence
|
|
|
1 |
|
2 |
def format_results(model_name: str, revision: str, precision: str, overall_js: float, overall_ci: tuple, **experiment_scores) -> dict:
|
3 |
"""
|
|
|
19 |
"config": {
|
20 |
"model_dtype": precision, # Precision with which you ran the evaluation
|
21 |
"model_name": model_name, # Name of the model
|
22 |
+
#"model_sha": revision # Hash of the model
|
23 |
},
|
24 |
"results": {
|
25 |
"overall_js_divergence": overall_js, # Overall JS divergence
|
src/display/about.py
CHANGED
@@ -33,10 +33,6 @@ class Tasks(Enum):
|
|
33 |
E10 = Task("E10", "E10", "E10 Humanlike %")
|
34 |
E10_ci = Task("E10_ci", "E10_ci", "E10 CI")
|
35 |
|
36 |
-
# factual_consistency_rate = Task("factual_consistency_rate", "factual_consistency_rate", "Factual Consistency Rate (%)")
|
37 |
-
# answer_rate = Task("answer_rate", "answer_rate", "Answer Rate (%)")
|
38 |
-
# average_summary_length = Task("average_summary_length",
|
39 |
-
# "average_summary_length", "Average Summary Length")
|
40 |
|
41 |
|
42 |
# Your leaderboard name
|
@@ -59,18 +55,6 @@ To quantify the similarity, we collected responses from 2000 human participants,
|
|
59 |
To measure the similarity between human and model responses, we utilize the Jensen-Shannon (JS) divergence. This method allows us to compare the two binomial distributions (one from human responses and one from model responses) for each stimulus.
|
60 |
The similarity is quantified by calculating 1 minus the JS divergence, where a value closer to 1 indicates higher similarity.
|
61 |
|
62 |
-
## Evaluation Dataset
|
63 |
-
|
64 |
-
Our evaluation dataset consists of 1006 documents from multiple public datasets, primarily [CNN/Daily Mail Corpus](https://huggingface.co/datasets/cnn_dailymail/viewer/1.0.0/test).
|
65 |
-
We generate summaries for each of these documents using submitted LLMs and compute hallucination scores for each pair of document and generated summary. (Check the prompt we used [here](https://huggingface.co/spaces/vectara/Hallucination-evaluation-leaderboard))
|
66 |
-
|
67 |
-
## Metrics Explained
|
68 |
-
- Individual Task Similarity: For each psycholinguistic task, we calculate the humanlike score for each stimulus, providing a measure of how closely the model’s responses resemble those of humans.
|
69 |
-
- Average Similarity: The average of the humanlike scores across all stimuli and tasks, giving an overall indication of the model’s performance in mimicking human language use.
|
70 |
-
|
71 |
-
## Note on non-Hugging Face models
|
72 |
-
On the HHEM leaderboard, there are currently models, such as GPT variants, that are not available on the Hugging Face model hub. We ran the evaluations for these models ourselves and uploaded the results to the leaderboard.
|
73 |
-
If you would like to submit a model that is not available on the Hugging Face model hub, please contact us at [email protected].
|
74 |
|
75 |
## Model Submissions and Reproducibility
|
76 |
You can submit your model for evaluation whether or not it is hosted on the Hugging Face model hub (though hosting it on the Hugging Face Hub is recommended).
|
@@ -101,27 +85,26 @@ After the evaluation, results are saved in "eval-results-bk/your_model_id/result
|
|
101 |
## Results Format
|
102 |
The results are structured in JSON as follows:
|
103 |
```python
|
104 |
-
{
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
"
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
|
|
122 |
}
|
123 |
-
}
|
124 |
-
}
|
125 |
```
|
126 |
For additional queries or model submissions, please contact [email protected].
|
127 |
"""
|
|
|
33 |
E10 = Task("E10", "E10", "E10 Humanlike %")
|
34 |
E10_ci = Task("E10_ci", "E10_ci", "E10 CI")
|
35 |
|
|
|
36 |
|
37 |
|
38 |
# Your leaderboard name
|
|
|
55 |
To measure the similarity between human and model responses, we utilize the Jensen-Shannon (JS) divergence. This method allows us to compare the two binomial distributions (one from human responses and one from model responses) for each stimulus.
|
56 |
The similarity is quantified by calculating 1 minus the JS divergence, where a value closer to 1 indicates higher similarity.
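As a worked illustration of this metric (assuming the base-2 formulation, which bounds the divergence between 0 and 1): if 70% of human participants but 60% of a model's responses choose the same option for a stimulus, the JS divergence between the two binomial distributions is about 0.008, so the reported similarity for that stimulus is roughly 0.99.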
|
57 |
|
|
|
58 |
|
59 |
## Model Submissions and Reproducibility
|
60 |
You can submit your model for evaluation whether or not it is hosted on the Hugging Face model hub (though hosting it on the Hugging Face Hub is recommended).
|
|
|
85 |
## Results Format
|
86 |
The results are structured in JSON as follows:
|
87 |
```python
|
88 |
+
{
|
89 |
+
"config": {
|
90 |
+
"model_dtype": "BF16",
|
91 |
+
"model_name": "mistralai/Mistral-7B-Instruct-v0.1",
|
92 |
+
"model_sha": ""
|
93 |
+
},
|
94 |
+
"results": {
|
95 |
+
"overall_js_divergence": 0.6129438385008659,
|
96 |
+
"overall_confidence_interval": [
|
97 |
+
0.5937234777290732,
|
98 |
+
0.6317188731175192
|
99 |
+
],
|
100 |
+
"E9": 0.7768461816966632,
|
101 |
+
"E9_ci": [
|
102 |
+
0.7474754730701578,
|
103 |
+
0.8058680968641126
|
104 |
+
],
|
105 |
+
...
|
106 |
+
}
|
107 |
}
|
|
|
|
|
108 |
```
|
109 |
For additional queries or model submissions, please contact [email protected].
|
110 |
"""
|
src/display/utils.py
CHANGED
@@ -45,7 +45,7 @@ auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub Licen
|
|
45 |
auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
|
46 |
auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
|
47 |
auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
|
48 |
-
auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
|
49 |
# Dummy column for the search bar (hidden by the custom CSS)
|
50 |
auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
|
51 |
|
|
|
45 |
auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
|
46 |
auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
|
47 |
auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
|
48 |
+
# auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
|
49 |
# Dummy column for the search bar (hidden by the custom CSS)
|
50 |
auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
|
51 |
|
src/envs.py
CHANGED
@@ -53,7 +53,6 @@ API = HfApi(token=TOKEN)
|
|
53 |
|
54 |
DATASET_PATH = "./src/datasets/Material_Llama2_0603.xlsx" #experiment data
|
55 |
PROMPT_PATH = "./src/datasets/prompt.xlsx" #prompt for each experiment
|
56 |
-
HEM_PATH = 'vectara/hallucination_evaluation_model'
|
57 |
HUMAN_DATA = "./src/datasets/human_data_coding.csv" #experiment data
|
58 |
ITEM_4_DATA = "./src/datasets/associataion_dataset.csv" #database
|
59 |
ITEM_5_DATA = "./src/datasets/Items_5.csv" #verbs needed for experiment 5
|
@@ -61,5 +60,4 @@ ITEM_5_DATA = "./src/datasets/Items_5.csv" #experiment 5 need verb words
|
|
61 |
# SYSTEM_PROMPT = "You are a chat bot answering questions using data. You must stick to the answers provided solely by the text in the passage provided."
|
62 |
SYSTEM_PROMPT = "You are participating in a psycholinguistic experiment. You will complete a task on English language use. Please respond to the questions directly, without using introductory phrases (e.g., Sure or OK) or special formats at the beginning of your responses."
|
63 |
'''prompt'''
|
64 |
-
# USER_PROMPT = "You are asked the question 'Provide a concise summary of the following passage, covering the core pieces of information described': "
|
65 |
USER_PROMPT = ""
|
|
|
53 |
|
54 |
DATASET_PATH = "./src/datasets/Material_Llama2_0603.xlsx" #experiment data
|
55 |
PROMPT_PATH = "./src/datasets/prompt.xlsx" #prompt for each experiment
|
|
|
56 |
HUMAN_DATA = "./src/datasets/human_data_coding.csv" #experiment data
|
57 |
ITEM_4_DATA = "./src/datasets/associataion_dataset.csv" #database
|
58 |
ITEM_5_DATA = "./src/datasets/Items_5.csv" #verbs needed for experiment 5
|
|
|
60 |
# SYSTEM_PROMPT = "You are a chat bot answering questions using data. You must stick to the answers provided solely by the text in the passage provided."
|
61 |
SYSTEM_PROMPT = "You are participating in a psycholinguistic experiment. You will complete a task on English language use. Please respond to the questions directly, without using introductory phrases (e.g., Sure or OK) or special formats at the beginning of your responses."
|
62 |
'''prompt'''
|
|
|
63 |
USER_PROMPT = ""
|
src/leaderboard/read_evals.py
CHANGED
@@ -85,16 +85,15 @@ class EvalResult:
|
|
85 |
if isinstance(v, (int, float)) and not math.isnan(v):
|
86 |
accs.append(np.around(v * 100, decimals=0))
|
87 |
elif isinstance(v, list):
|
88 |
-
# handle each numeric value in the list
|
89 |
accs.extend([np.around(x * 100, decimals=0) for x in v if
|
90 |
isinstance(x, (int, float)) and not math.isnan(x)])
|
91 |
else:
|
92 |
# skip NaN values and values that do not meet the criteria
|
93 |
accs.append(None)
|
94 |
|
95 |
-
|
96 |
accs = np.array([x for x in accs if x is not None])
|
97 |
-
|
98 |
accs = accs[accs != None]
|
99 |
|
100 |
results[task.benchmark] = accs
|
@@ -168,7 +167,7 @@ class EvalResult:
|
|
168 |
utils.AutoEvalColumn.architecture.name: self.architecture,
|
169 |
utils.AutoEvalColumn.model.name: formatting.make_clickable_model(self.full_model),
|
170 |
utils.AutoEvalColumn.dummy.name: self.full_model,
|
171 |
-
utils.AutoEvalColumn.revision.name: self.revision,
|
172 |
utils.AutoEvalColumn.license.name: self.license,
|
173 |
utils.AutoEvalColumn.likes.name: self.likes,
|
174 |
utils.AutoEvalColumn.params.name: self.num_params,
|
|
|
85 |
if isinstance(v, (int, float)) and not math.isnan(v):
|
86 |
accs.append(np.around(v * 100, decimals=0))
|
87 |
elif isinstance(v, list):
|
|
|
88 |
accs.extend([np.around(x * 100, decimals=0) for x in v if
|
89 |
isinstance(x, (int, float)) and not math.isnan(x)])
|
90 |
else:
|
91 |
# skip NaN values and values that do not meet the criteria
|
92 |
accs.append(None)
|
93 |
|
94 |
+
|
95 |
accs = np.array([x for x in accs if x is not None])
|
96 |
+
|
97 |
accs = accs[accs != None]
|
98 |
|
99 |
results[task.benchmark] = accs
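The loop above flattens a mixture of scalar scores and `[low, high]` confidence-interval lists into one array of rounded percentages, dropping NaNs. A simplified standalone illustration (it omits the `None` placeholder and filtering step kept in the class):

```python
import math
import numpy as np

raw = {"E1": 0.78, "E1_ci": [0.74, 0.81], "E2": float("nan")}
accs = []
for v in raw.values():
    if isinstance(v, (int, float)) and not math.isnan(v):
        accs.append(np.around(v * 100, decimals=0))
    elif isinstance(v, list):
        accs.extend(np.around(x * 100, decimals=0) for x in v
                    if isinstance(x, (int, float)) and not math.isnan(x))
print(np.array(accs))  # [78. 74. 81.] -- the NaN score is dropped
```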
|
|
|
167 |
utils.AutoEvalColumn.architecture.name: self.architecture,
|
168 |
utils.AutoEvalColumn.model.name: formatting.make_clickable_model(self.full_model),
|
169 |
utils.AutoEvalColumn.dummy.name: self.full_model,
|
170 |
+
# utils.AutoEvalColumn.revision.name: self.revision,
|
171 |
utils.AutoEvalColumn.license.name: self.license,
|
172 |
utils.AutoEvalColumn.likes.name: self.likes,
|
173 |
utils.AutoEvalColumn.params.name: self.num_params,
|
src/populate.py
CHANGED
@@ -19,7 +19,6 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
|
|
19 |
print("all results:",df.to_string())
|
20 |
# exit()
|
21 |
try:
|
22 |
-
df = df.sort_values(by=[utils.AutoEvalColumn.hallucination_rate.name], ascending=True)
|
23 |
df = df[cols].round(decimals=2)
|
24 |
# filter out if any of the benchmarks have not been produced
|
25 |
df = df[formatting.has_no_nan_values(df, benchmark_cols)]
|
|
|
19 |
print("all results:",df.to_string())
|
20 |
# exit()
|
21 |
try:
|
|
|
22 |
df = df[cols].round(decimals=2)
|
23 |
# filter out if any of the benchmarks have not been produced
|
24 |
df = df[formatting.has_no_nan_values(df, benchmark_cols)]
|