|
import json |
|
|
|
from datasets import Dataset, DatasetDict |
|
|
|
from .base import BaseDataset |
|
|
|
|
|
class CommonsenseQADataset_CN(BaseDataset):
    """Loader for a JSON-lines multiple-choice QA dataset.

    Judging by the class name this targets the Chinese CommonsenseQA
    variant; the code itself only relies on each record having a
    ``choices`` mapping with a ``text`` list of at least five entries, plus
    ``question_concept`` and ``id`` fields.
    """

    @staticmethod
    def load(path):
        """Build a ``DatasetDict`` from the JSON-lines file at ``path``.

        Both the ``'train'`` and ``'validation'`` splits are populated from
        the same file. For every record the first five entries of
        ``choices['text']`` are copied into top-level columns ``'A'`` to
        ``'E'``, after which the ``question_concept``, ``id`` and
        ``choices`` columns are removed.

        Args:
            path: Path to a file holding one JSON object per line.

        Returns:
            A ``DatasetDict`` with ``'train'`` and ``'validation'`` entries.

        Raises:
            json.JSONDecodeError: If a non-blank line is not valid JSON.
            IndexError: If a record has fewer than five choice texts.
        """
        # Read and parse the file once: every split uses the same path, so
        # re-reading it per split only repeats work. UTF-8 is requested
        # explicitly so non-ASCII text does not depend on the platform's
        # default locale encoding. Whitespace-only lines (e.g. a trailing
        # newline at end of file) are skipped rather than fed to json.loads.
        with open(path, 'r', encoding='utf-8') as f:
            records = [json.loads(line) for line in f if line.strip()]

        def pre_process(example):
            # Flatten the five answer texts into columns 'A'..'E'.
            for i in range(5):
                example[chr(ord('A') + i)] = example['choices']['text'][i]
            return example

        datasetdict = DatasetDict()
        for split in ['train', 'validation']:
            # A separate Dataset is built per split, as in the original, so
            # the splits do not share one object.
            dataset = Dataset.from_list(records)
            dataset = dataset.map(pre_process).remove_columns(
                ['question_concept', 'id', 'choices'])
            datasetdict[split] = dataset

        return datasetdict
|
|