TwT-6
/

api-demo

Model card Files Files and versions Community

File size: 2,893 Bytes

256a159

import json

from datasets import Dataset

from opencompass.registry import LOAD_DATASET

from .base import BaseDataset


@LOAD_DATASET.register_module()
class C3Dataset(BaseDataset):
    """Loader that flattens a C3-style JSON file into one row per question.

    Each top-level entry of the JSON file is indexed as ``entry[0]`` (the
    passage, an iterable of paragraphs) and ``entry[1]`` (an iterable of
    question dicts with the keys ``'question'``, ``'choice'`` and
    ``'answer'``).
    """

    @staticmethod
    def load(path: str):
        """Read the JSON file at ``path`` and build a ``Dataset``.

        The passage text is produced by concatenating the pieces of each
        paragraph and then joining the paragraphs with a single space.
        Questions with fewer than four choices are padded up to four by
        repeating their first choice.

        Args:
            path: Location of the UTF-8 encoded JSON file.

        Returns:
            A ``Dataset`` with the columns ``content``, ``question``,
            ``choice0`` .. ``choice3``, ``choices`` (the padded choice
            list) and ``label`` (integer index of the answer within the
            choices).

        Raises:
            ValueError: If a question's answer is not among its choices.
        """
        choice_keys = ('choice0', 'choice1', 'choice2', 'choice3')
        # Columns are declared in the order they appear in the dataset.
        columns = {
            'content': [],
            'question': [],
            'choice0': [],
            'choice1': [],
            'choice2': [],
            'choice3': [],
            'choices': [],
            'label': [],
        }

        with open(path, 'r', encoding='utf-8') as fp:
            entries = json.load(fp)

        for entry in entries:
            passage = ' '.join(''.join(paragraph) for paragraph in entry[0])

            for item in entry[1]:
                options = item['choice']
                # The label is resolved before padding, so it always
                # points at the first occurrence of the real answer.
                answer_idx = options.index(item['answer'])

                shortfall = 4 - len(options)
                if shortfall > 0:
                    # Use the first choice as the filler value.
                    options = options + [options[0]] * shortfall

                columns['content'].append(passage)
                columns['question'].append(item['question'])
                # zip stops after four entries, matching choice0..choice3.
                for key, option in zip(choice_keys, options):
                    columns[key].append(option)
                columns['choices'].append(options)
                columns['label'].append(answer_idx)

        return Dataset.from_dict(columns)


@LOAD_DATASET.register_module()
class C3Dataset_V2(BaseDataset):
    """Variant C3 loader that emits letter labels and '[NULL]' padding.

    Each top-level entry of the JSON file is indexed as ``entry[0]`` (the
    passage, an iterable of paragraphs) and ``entry[1]`` (an iterable of
    question dicts with the keys ``'question'``, ``'choice'`` and
    ``'answer'``).
    """

    @staticmethod
    def load(path: str):
        """Read the JSON file at ``path`` and build a ``Dataset``.

        The passage text is the concatenation of every paragraph piece with
        no separator. Questions with fewer than four choices are padded up
        to four with the literal string ``'[NULL]'``.

        Args:
            path: Location of the UTF-8 encoded JSON file.

        Returns:
            A ``Dataset`` whose records carry ``content``, ``question``,
            ``choice0`` .. ``choice3`` and ``label`` (one of the letters
            ``A``-``D`` for the answer's position among the choices).

        Raises:
            ValueError: If a question's answer is not among its choices.
            IndexError: If the answer sits at position four or later.
        """
        choice_keys = ('choice0', 'choice1', 'choice2', 'choice3')

        with open(path, 'r', encoding='utf-8') as fp:
            entries = json.load(fp)

        records = []
        for entry in entries:
            passage = ''.join(''.join(paragraph) for paragraph in entry[0])

            for item in entry[1]:
                options = item['choice']
                # Map the answer's position to its letter before padding.
                letter = 'ABCD'[options.index(item['answer'])]

                # A non-positive multiplier yields an empty list, so
                # questions that already have four choices are unchanged.
                options = options + ['[NULL]'] * (4 - len(options))

                record = {
                    'content': passage,
                    'question': item['question'],
                }
                # zip stops after four entries, matching choice0..choice3.
                record.update(zip(choice_keys, options))
                record['label'] = letter
                records.append(record)

        return Dataset.from_list(records)