File size: 1,080 Bytes
30e605a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 |
import argparse
import jsonlines
import json
from tqdm import tqdm
import uuid
def convert_records(records):
    """Convert Alpaca-style records to deduplicated ShareGPT-style entries.

    Each input record must be a mapping with the string keys
    ``"instruction"``, ``"input"`` and ``"output"``. Only the first record
    seen for a given (instruction, input) pair is kept; later duplicates
    are skipped. Input order is preserved.

    Args:
        records: Iterable of record mappings.

    Returns:
        A list of dicts, each with a fresh random ``"id"``, a fixed
        ``"bot"`` name, the instruction under ``"training"``, and a
        two-turn ``"conversations"`` list (human input, gpt output).

    Raises:
        KeyError: If a record lacks one of the required keys.
    """
    seen = set()
    converted = []
    for record in records:
        # Use a tuple key rather than string concatenation so that distinct
        # pairs such as ("ab", "c") and ("a", "bc") are not treated as
        # duplicates of each other.
        key = (record["instruction"], record["input"])
        if key in seen:
            continue
        seen.add(key)
        converted.append(
            {
                "id": f"{uuid.uuid4()}",
                "bot": "dolphin",
                "training": record["instruction"],
                "conversations": [
                    {"from": "human", "value": record["input"]},
                    {"from": "gpt", "value": record["output"]},
                ],
            }
        )
    return converted


def main():
    """Parse CLI arguments, convert the input JSONL, and write one JSON array."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--in-file", type=str, default="flan1m-alpaca-uncensored.jsonl")
    parser.add_argument("--out-file", type=str, default="flan1m-sharegpt-deduped.json")
    args = parser.parse_args()

    with jsonlines.open(args.in_file) as reader:
        out = convert_records(tqdm(reader))

    # Open the output only after the input was read successfully, so a
    # missing or malformed input file does not truncate an existing output.
    with open(args.out_file, "w", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False)


if __name__ == "__main__":
    main()
|