|
import json |
|
|
|
from datasets import Dataset |
|
|
|
from opencompass.registry import LOAD_DATASET |
|
|
|
from .base import BaseDataset |
|
|
|
|
|
@LOAD_DATASET.register_module()
class C3Dataset(BaseDataset):
    """C3 loader that flattens every question into a four-choice record."""

    @staticmethod
    def load(path: str):
        """Build a ``Dataset`` with one row per question found in ``path``.

        The JSON file is read as a sequence of entries. ``entry[0]`` holds
        the paragraphs of a passage and ``entry[1]`` holds the question
        dicts, each read through its ``'question'``, ``'choice'`` and
        ``'answer'`` keys.

        Args:
            path: Location of the UTF-8 encoded JSON file.

        Returns:
            A ``Dataset`` with the columns ``content``, ``question``,
            ``choice0`` .. ``choice3``, ``choices`` and ``label``, where
            ``label`` is the position of the answer within the choices.

        Raises:
            ValueError: If a question's answer is not among its choices.
        """
        with open(path, 'r', encoding='utf-8') as f:
            entries = json.load(f)

        option_columns = ('choice0', 'choice1', 'choice2', 'choice3')
        # Accumulate column-wise so the result can be handed straight to
        # ``Dataset.from_dict``; key order fixes the column order.
        columns = {
            'content': [],
            'question': [],
            'choice0': [],
            'choice1': [],
            'choice2': [],
            'choice3': [],
            'choices': [],
            'label': [],
        }

        for entry in entries:
            # Each paragraph is concatenated as-is; paragraphs are then
            # separated from one another by a single space.
            passage = ' '.join(''.join(paragraph) for paragraph in entry[0])

            for item in entry[1]:
                options = item['choice']
                answer_index = options.index(item['answer'])

                shortfall = 4 - len(options)
                if shortfall > 0:
                    # Pad short questions by repeating the first option so
                    # that ``choice0`` .. ``choice3`` always exist.
                    options.extend([options[0]] * shortfall)

                columns['content'].append(passage)
                columns['question'].append(item['question'])
                for slot, column_name in enumerate(option_columns):
                    columns[column_name].append(options[slot])
                columns['choices'].append(options)
                columns['label'].append(answer_index)

        return Dataset.from_dict(columns)
|
|
|
|
|
@LOAD_DATASET.register_module()
class C3Dataset_V2(BaseDataset):
    """C3 loader variant that labels each answer with a letter A-D."""

    @staticmethod
    def load(path: str):
        """Build a ``Dataset`` with one row per question found in ``path``.

        The JSON file is read as a sequence of entries. ``entry[0]`` holds
        the paragraphs of a passage and ``entry[1]`` holds the question
        dicts, each read through its ``'question'``, ``'choice'`` and
        ``'answer'`` keys.

        Args:
            path: Location of the UTF-8 encoded JSON file.

        Returns:
            A ``Dataset`` whose rows carry ``content``, ``question``,
            ``choice0`` .. ``choice3`` and ``label``, where ``label`` is
            the letter of ``'ABCD'`` at the answer's position.

        Raises:
            ValueError: If a question's answer is not among its choices.
            IndexError: If the answer sits beyond the fourth choice.
        """
        with open(path, 'r', encoding='utf-8') as f:
            entries = json.load(f)

        records = []
        for entry in entries:
            # Paragraphs are concatenated without any separator.
            passage = ''.join(''.join(paragraph) for paragraph in entry[0])

            for item in entry[1]:
                options = item['choice']
                # Resolve the label before padding, so the placeholder
                # text can never be matched as the answer.
                answer_letter = 'ABCD'[options.index(item['answer'])]

                shortfall = 4 - len(options)
                if shortfall > 0:
                    # Fill the missing slots with a placeholder option.
                    options.extend(['[NULL]'] * shortfall)

                records.append({
                    'content': passage,
                    'question': item['question'],
                    'choice0': options[0],
                    'choice1': options[1],
                    'choice2': options[2],
                    'choice3': options[3],
                    'label': answer_letter,
                })

        return Dataset.from_list(records)
|
|