|
import json |
|
|
|
from datasets import Dataset, load_dataset |
|
|
|
from opencompass.registry import LOAD_DATASET |
|
|
|
from .base import BaseDataset |
|
|
|
|
|
@LOAD_DATASET.register_module()
class CluewscDataset(BaseDataset):
    """CLUEWSC loader that adds a pronoun-substituted sentence per example."""

    @staticmethod
    def load(**kwargs):
        """Load the dataset and derive the extra per-example fields.

        Args:
            **kwargs: Forwarded unchanged to ``load_dataset``.

        Returns:
            The result of mapping the loaded dataset so that every example
            gains ``new_text``, ``answer``, ``span1`` and ``span2`` and no
            longer carries ``target``.
        """

        def _rewrite(example):
            # Split the sentence into individual items so that one position
            # can be overwritten in place.
            chars = list(example['text'])

            target = example['target']
            antecedent = target['span1_text']
            # Only the single slot at ``span2_index`` is replaced; the
            # inserted ``span1_text`` may itself be longer than one item.
            # NOTE(review): presumably span2 is a one-character pronoun;
            # confirm against the data, since a longer span2 would leave
            # its remaining characters in ``new_text``.
            chars[target['span2_index']] = antecedent
            example['new_text'] = ''.join(chars)

            # The string label 'true' becomes 1; any other value becomes 0.
            example['answer'] = int(example['label'] == 'true')

            # Flatten the span texts to top-level keys, then drop the
            # nested ``target`` entry.
            example['span1'] = antecedent
            example['span2'] = target['span2_text']
            del example['target']
            return example

        return load_dataset(**kwargs).map(_rewrite)
|
|
|
|
|
@LOAD_DATASET.register_module()
class CluewscDataset_V2(BaseDataset):
    """CLUEWSC loader that reads a JSON-lines file and emits letter labels."""

    @staticmethod
    def load(path):
        """Build a ``Dataset`` from a file holding one JSON object per line.

        Args:
            path: Location of the file, opened as UTF-8 text.

        Returns:
            ``Dataset.from_list`` of rows with the keys ``span1``, ``span2``,
            ``text`` and ``label``, where ``label`` is ``'A'`` for ``'true'``
            and ``'B'`` for ``'false'``.

        Raises:
            KeyError: If a record's ``label`` is neither ``'true'`` nor
                ``'false'``, or an expected key is missing.
        """
        letter_for = {'true': 'A', 'false': 'B'}

        def _to_row(record):
            # Flatten the nested ``target`` spans and translate the label.
            spans = record['target']
            return {
                'span1': spans['span1_text'],
                'span2': spans['span2_text'],
                'text': record['text'],
                'label': letter_for[record['label']],
            }

        with open(path, 'r', encoding='utf-8') as handle:
            rows = [_to_row(json.loads(raw)) for raw in handle]
        return Dataset.from_list(rows)
|
|