File size: 12,415 Bytes
256a159 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 |
# flake8: noqa
import json
import random
from datasets import Dataset
from opencompass.datasets.base import BaseDataset
from opencompass.registry import LOAD_DATASET
@LOAD_DATASET.register_module()
class NeedleBenchATCDataset(BaseDataset):
    """Kinship-chain dataset whose relationship sentences are shuffled.

    Each sample is a chain of randomly picked names in which every name is
    stated to be a parent/grandparent of the next one. The sentences are
    shuffled and followed by a question asking for the eldest relative of
    the last name in the chain.
    """

    @staticmethod
    def load(
        path,
        num_needles: int,
        language: str,
        repeats: int,
    ):
        """Build the dataset.

        Args:
            path: Path to a JSON file mapping a language name to a
                comma-separated string of person names.
            num_needles: Number of names sampled per chain; a chain of
                ``num_needles`` names yields ``num_needles - 1`` sentences.
            language: Either ``'Chinese'`` or ``'English'``.
            repeats: Number of samples to generate.

        Returns:
            A ``Dataset`` with the columns ``prompt`` (shuffled story plus
            question) and ``answer``.

        Raises:
            ValueError: If ``language`` is not supported.
        """
        # Validate first: previously this check sat after the first use of
        # the language-specific tables and therefore could never be reached.
        if language not in ('Chinese', 'English'):
            raise ValueError('Unsupported language specified. '
                             "Please choose either 'Chinese' or 'English'.")

        with open(path, 'r', encoding='utf-8') as file:
            names_data = json.load(file)
        all_names = names_data[language].split(',')

        # The tables only depend on the language, so build them once
        # instead of once per repeat.
        if language == 'Chinese':
            relationship_terms = [
                '父亲', '母亲', '爸爸', '妈妈', '爷爷', '奶奶', '姥姥', '姥爷', '外公', '外婆'
            ]
            relationship_templates = [
                '{A}是{B}的{relationship}。',
                '{B}的{relationship}是{A}。',
                '{A}作为{B}的{relationship},对{B}的成长有重要影响。',
                '{A}不仅是{B}的{relationship},还是{B}的榜样。',
                '{B}是{A}所生的孩子。',
                '{A}对{B}来说,不只是一个{relationship},还是一个朋友。',
                '{A}在{B}的生命中扮演着{relationship}的角色。',
                '{B}把{A}视为其{relationship}。',
            ]
        else:
            relationship_terms = [
                'father', 'mother', 'dad', 'mom', 'grandfather',
                'grandmother', 'maternal grandmother',
                'maternal grandfather', 'paternal grandfather',
                'paternal grandmother'
            ]
            relationship_templates = [
                "{A} is {B}'s {relationship}.",
                "{B}'s {relationship} is {A}.",
                ("{A}, as {B}'s {relationship}, "
                 "has a significant impact on {B}'s upbringing."),
                ("{A} is not only {B}'s {relationship} "
                 "but also {B}'s role model."),
                '{B} is the child of {A}.',
                ('For {B}, {A} is not just a {relationship}, '
                 'but also a friend.'),
                ("{A} plays the role of {B}'s {relationship} "
                 "in {B}'s life."),
                '{B} considers {A} as their {relationship}.',
            ]

        def build_question(last_person):
            """Return the question (with worked examples) about last_person."""
            if language == 'Chinese':
                return (f"""
在上面提供的打乱的家族关系文本中,'{last_person}'的能够向上追溯到的最年长的亲人是谁?
例如:
例子1.如果张强的父亲是马克,除此以外提供的文本中没有更多关于亲属关系的信息,那么在提供的文本中张强能够向上追溯到的最年长的亲人就是马克。
例子2.如果李明的姥姥是张红,而张红的父亲是张强,除此以外提供的文本中没有更多关于亲属关系的信息,那么在提供的文本中李明能够向上追溯到的最年长的亲人就是张强。
例子3.如果小明是张红的曾孙女,张红的祖母是王华,王华的父亲是王刚,除此以外提供的文本中没有更多关于亲属关系的信息,那么小明能够向上追溯到的最年长的亲人就是王刚。
""")
            return (f"""
Given the scrambled family relationships described above, who is the eldest relative that '{last_person}' can trace back to in the context?
For example:
Example 1: If Zhang Qiang's father is Mark, and no further information about familial relationships is provided in the text, then the oldest relative Zhang Qiang can trace back to in the provided text is Mark.
Example 2: If Li Ming's grandmother is Zhang Hong, and Zhang Hong's father is Zhang Qiang, and no further information about familial relationships is provided in the text, then the oldest relative Li Ming can trace back to in the provided text is Zhang Qiang.
Example 3: If Xiao Ming is Zhang Hong's great-granddaughter, Zhang Hong's grandmother is Wang Hua, and Wang Hua's father is Wang Gang, and no further information about familial relationships is provided in the text, then the oldest relative Xiao Ming can trace back to in the provided text is Wang Gang."""
                    )

        prompts = []
        answers = []
        for _ in range(repeats):
            names = random.sample(all_names, num_needles)

            # One sentence per adjacent pair: the earlier name (A) is the
            # elder relative of the later name (B).
            fragments = []
            for elder, younger in zip(names, names[1:]):
                template = random.choice(relationship_templates)
                relation_term = random.choice(relationship_terms)
                fragments.append(
                    template.format(A=elder,
                                    B=younger,
                                    relationship=relation_term))

            # Keep one trailing empty fragment: the earlier implementation
            # split a '*'-terminated string, so the shuffled list had this
            # extra element. Retaining it keeps seeded runs reproducible.
            fragments.append('')
            random.shuffle(fragments)
            shuffled_story = ''.join(fragments)

            # Combine story and question.
            prompts.append(shuffled_story + ' ' + build_question(names[-1]))
            # The head of the chain is the expected answer. It is stored
            # twice, '*'-separated — presumably the format the evaluator
            # expects; confirm against the evaluator before changing.
            answers.append(names[0] + '*' + names[0])

        return Dataset.from_dict({
            'prompt': prompts,
            'answer': answers,
        })
@LOAD_DATASET.register_module()
class NeedleBenchATCOrderedDataset(BaseDataset):
    """Kinship-chain dataset whose relationship sentences stay in order.

    Each sample is a chain of randomly picked names in which every name is
    stated to be a parent/grandparent of the next one. The sentences are
    emitted in chain order (no shuffling) and followed by a question asking
    for the eldest relative of the last name in the chain.
    """

    @staticmethod
    def load(
        path,
        num_needles: int,
        language: str,
        repeats: int,
    ):
        """Build the dataset.

        Args:
            path: Path to a JSON file mapping a language name to a
                comma-separated string of person names.
            num_needles: Number of names sampled per chain; a chain of
                ``num_needles`` names yields ``num_needles - 1`` sentences.
            language: Either ``'Chinese'`` or ``'English'``.
            repeats: Number of samples to generate.

        Returns:
            A ``Dataset`` with the columns ``prompt`` (ordered story plus
            question) and ``answer``.

        Raises:
            ValueError: If ``language`` is not supported.
        """
        # Validate first: previously this check sat after the first use of
        # the language-specific tables and therefore could never be reached.
        if language not in ('Chinese', 'English'):
            raise ValueError('Unsupported language specified. '
                             "Please choose either 'Chinese' or 'English'.")

        with open(path, 'r', encoding='utf-8') as file:
            names_data = json.load(file)
        all_names = names_data[language].split(',')

        # The tables only depend on the language, so build them once
        # instead of once per repeat.
        if language == 'Chinese':
            relationship_terms = [
                '父亲', '母亲', '爸爸', '妈妈', '爷爷', '奶奶', '姥姥', '姥爷', '外公', '外婆'
            ]
            relationship_templates = [
                '{A}是{B}的{relationship}。',
                '{B}的{relationship}是{A}。',
                '{A}作为{B}的{relationship},对{B}的成长有重要影响。',
                '{A}不仅是{B}的{relationship},还是{B}的榜样。',
                '{B}是{A}所生的孩子。',
                '{A}对{B}来说,不只是一个{relationship},还是一个朋友。',
                '{A}在{B}的生命中扮演着{relationship}的角色。',
                '{B}把{A}视为其{relationship}。',
            ]
        else:
            relationship_terms = [
                'father', 'mother', 'dad', 'mom', 'grandfather',
                'grandmother', 'maternal grandmother',
                'maternal grandfather', 'paternal grandfather',
                'paternal grandmother'
            ]
            relationship_templates = [
                "{A} is {B}'s {relationship}.",
                "{B}'s {relationship} is {A}.",
                ("{A}, as {B}'s {relationship}, "
                 "has a significant impact on {B}'s upbringing."),
                ("{A} is not only {B}'s {relationship} "
                 "but also {B}'s role model."),
                '{B} is the child of {A}.',
                ('For {B}, {A} is not just a {relationship}, '
                 'but also a friend.'),
                ("{A} plays the role of {B}'s {relationship} "
                 "in {B}'s life."),
                '{B} considers {A} as their {relationship}.',
            ]

        def build_question(last_person):
            """Return the question (with worked examples) about last_person.

            NOTE(review): the wording still calls the text "scrambled"
            although this dataset keeps the sentences in order; the text is
            left unchanged because it is part of the emitted prompt.
            """
            if language == 'Chinese':
                return (f"""
在上面提供的打乱的家族关系文本中,'{last_person}'的能够向上追溯到的最年长的亲人是谁?
例如:
例子1.如果张强的父亲是马克,除此以外提供的文本中没有更多关于亲属关系的信息,那么在提供的文本中张强能够向上追溯到的最年长的亲人就是马克。
例子2.如果李明的姥姥是张红,而张红的父亲是张强,除此以外提供的文本中没有更多关于亲属关系的信息,那么在提供的文本中李明能够向上追溯到的最年长的亲人就是张强。
例子3.如果小明是张红的曾孙女,张红的祖母是王华,王华的父亲是王刚,除此以外提供的文本中没有更多关于亲属关系的信息,那么小明能够向上追溯到的最年长的亲人就是王刚。
""")
            return (f"""
Given the scrambled family relationships described above, who is the eldest relative that '{last_person}' can trace back to in the context?
For example:
Example 1: If Zhang Qiang's father is Mark, and no further information about familial relationships is provided in the text, then the oldest relative Zhang Qiang can trace back to in the provided text is Mark.
Example 2: If Li Ming's grandmother is Zhang Hong, and Zhang Hong's father is Zhang Qiang, and no further information about familial relationships is provided in the text, then the oldest relative Li Ming can trace back to in the provided text is Zhang Qiang.
Example 3: If Xiao Ming is Zhang Hong's great-granddaughter, Zhang Hong's grandmother is Wang Hua, and Wang Hua's father is Wang Gang, and no further information about familial relationships is provided in the text, then the oldest relative Xiao Ming can trace back to in the provided text is Wang Gang."""
                    )

        prompts = []
        answers = []
        for _ in range(repeats):
            names = random.sample(all_names, num_needles)

            # One sentence per adjacent pair: the earlier name (A) is the
            # elder relative of the later name (B).
            sentences = []
            for elder, younger in zip(names, names[1:]):
                template = random.choice(relationship_templates)
                relation_term = random.choice(relationship_terms)
                sentences.append(
                    template.format(A=elder,
                                    B=younger,
                                    relationship=relation_term))

            # Sentences are concatenated in chain order; no shuffling here.
            ordered_story = ''.join(sentences)

            # Combine story and question.
            prompts.append(ordered_story + ' ' + build_question(names[-1]))
            # The head of the chain is the expected answer. It is stored
            # twice, '*'-separated — presumably the format the evaluator
            # expects; confirm against the evaluator before changing.
            answers.append(names[0] + '*' + names[0])

        return Dataset.from_dict({
            'prompt': prompts,
            'answer': answers,
        })
|