|
import json |
|
|
|
from datasets import Dataset, load_dataset |
|
|
|
from opencompass.registry import LOAD_DATASET |
|
|
|
from .base import BaseDataset |
|
|
|
|
|
@LOAD_DATASET.register_module()
class CHIDDataset(BaseDataset):
    """CHID loader that expands every sample into one text per candidate."""

    @staticmethod
    def load(**kwargs):
        """Load the dataset and add a ``content{i}`` column per candidate.

        Args:
            **kwargs: Forwarded unchanged to ``datasets.load_dataset``.

        Returns:
            The loaded dataset after mapping; each example additionally
            holds ``content0``, ``content1``, ... where ``content{i}`` is
            the example's ``content`` with every ``#idiom#`` placeholder
            replaced by ``candidates[i]``.
        """
        placeholder = '#idiom#'

        def fill_candidates(row):
            # Read the template once; the loop below only adds new keys.
            template = row['content']
            for index, idiom in enumerate(row['candidates']):
                row[f'content{index}'] = template.replace(placeholder, idiom)
            return row

        return load_dataset(**kwargs).map(fill_candidates)
|
|
|
|
|
@LOAD_DATASET.register_module()
class CHIDDataset_V2(BaseDataset):
    """CHID loader that turns every sample into a lettered multiple choice."""

    @staticmethod
    def load(path):
        """Build a dataset from a file holding one JSON object per line.

        Every object must provide ``content`` (text containing ``#idiom#``
        placeholders), ``candidates`` (the options) and ``answer`` (the
        index of the correct option within ``candidates``).

        Args:
            path: Path of the UTF-8 encoded input file.

        Returns:
            A ``Dataset`` whose rows contain ``content`` (placeholders
            replaced by ``______``), one column per candidate named
            ``A``, ``B``, ... and ``answer`` (the letter of the correct
            candidate).

        Raises:
            IndexError: If ``answer`` does not index an existing candidate.
        """
        data = []
        with open(path, 'r', encoding='utf-8') as f:
            for raw_line in f:
                # Blank lines (e.g. a trailing newline at the end of the
                # file) are not JSON documents; skip them instead of
                # letting json.loads fail on them.
                if not raw_line.strip():
                    continue
                sample = json.loads(raw_line)
                candidates = sample['candidates']
                # One label per candidate, so the answer letter always
                # names an existing column and is not capped at 'G'.
                labels = [chr(ord('A') + i) for i in range(len(candidates))]
                item = {
                    'content': sample['content'].replace('#idiom#', '______')
                }
                item.update(zip(labels, candidates))
                item['answer'] = labels[sample['answer']]
                data.append(item)
        return Dataset.from_list(data)
|
|