File size: 2,893 Bytes
256a159 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 |
import json
from datasets import Dataset
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
@LOAD_DATASET.register_module()
class C3Dataset(BaseDataset):
    """Loader that flattens a C3-style JSON file into one row per question.

    Every question ends up with four dedicated choice columns; questions
    that ship fewer than four options are padded by repeating their first
    option.
    """

    @staticmethod
    def load(path: str):
        """Read ``path`` and return its questions as a ``Dataset``.

        The JSON document is read as a sequence of entries where
        ``entry[0]`` holds the passage paragraphs and ``entry[1]`` holds a
        list of question dicts with the keys ``'question'``, ``'choice'``
        and ``'answer'``.

        Args:
            path: Location of the UTF-8 encoded JSON file.

        Returns:
            A ``Dataset`` with the columns ``content``, ``question``,
            ``choice0`` .. ``choice3``, ``choices`` and ``label``, where
            ``label`` is the integer position of the answer inside the
            question's choice list.

        Raises:
            ValueError: If a question's answer is not among its choices.
        """
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Columns are filled directly, in the order the dataset exposes them.
        columns = {
            'content': [],
            'question': [],
            'choice0': [],
            'choice1': [],
            'choice2': [],
            'choice3': [],
            'choices': [],
            'label': [],
        }

        for row in data:
            # Each paragraph is concatenated as-is; paragraphs are then
            # separated by a single space.
            content_str = ' '.join(
                ''.join(paragraph) for paragraph in row[0])

            for question in row[1]:
                # Work on a copy so the parsed JSON is left untouched.
                choices = list(question['choice'])

                # Resolve the label before padding so it always refers to
                # an original option.
                label = choices.index(question['answer'])

                # Pad to four options using the first option as filler.
                # NOTE: lists longer than four are not truncated; 'choices'
                # keeps every option while only the first four get their
                # own column.
                missing = 4 - len(choices)
                if missing > 0:
                    choices += [choices[0]] * missing

                columns['content'].append(content_str)
                columns['question'].append(question['question'])
                columns['choice0'].append(choices[0])
                columns['choice1'].append(choices[1])
                columns['choice2'].append(choices[2])
                columns['choice3'].append(choices[3])
                columns['choices'].append(choices)
                columns['label'].append(label)

        return Dataset.from_dict(columns)
@LOAD_DATASET.register_module()
class C3Dataset_V2(BaseDataset):
    """Variant loader for C3-style JSON that emits letter labels.

    Questions with fewer than four options are padded with the literal
    placeholder ``'[NULL]'``.
    """

    @staticmethod
    def load(path: str):
        """Read ``path`` and return one record per question.

        The JSON document is read as a sequence of entries where
        ``entry[0]`` holds the passage paragraphs and ``entry[1]`` holds a
        list of question dicts with the keys ``'question'``, ``'choice'``
        and ``'answer'``.

        Args:
            path: Location of the UTF-8 encoded JSON file.

        Returns:
            A ``Dataset`` whose records carry ``content``, ``question``,
            ``choice0`` .. ``choice3`` and ``label``, where ``label`` is
            the letter (``'A'``-``'D'``) matching the answer's position.
        """
        with open(path, 'r', encoding='utf-8') as f:
            entries = json.load(f)

        records = []
        for entry in entries:
            # Paragraphs are joined without any separator in this variant.
            passage = ''.join(''.join(part) for part in entry[0])

            for item in entry[1]:
                options = item['choice']

                # Map the answer's position to its letter before padding.
                answer_pos = options.index(item['answer'])
                answer_letter = 'ABCD'[answer_pos]

                # Top the option list up to four entries with placeholders.
                shortfall = 4 - len(options)
                if shortfall > 0:
                    options.extend(['[NULL]'] * shortfall)

                records.append({
                    'content': passage,
                    'question': item['question'],
                    'choice0': options[0],
                    'choice1': options[1],
                    'choice2': options[2],
                    'choice3': options[3],
                    'label': answer_letter,
                })

        return Dataset.from_list(records)
|