"""Rewrite a HuggingFace-style tokenizer JSON so every vocab token string is
replaced by the string form of its id, writing the result to a sibling
``.mock.json`` file (a "mock" tokenizer useful for testing pipelines without
real token text)."""
import json


def mock_vocab(vocab):
    """Return a NEW vocab dict mapping ``str(id) -> id`` for every entry.

    Built as a fresh dict instead of mutating the original while iterating:
    the mutate-in-place version could lose entries when a token's text
    happens to equal another entry's id string (e.g. a literal token "5"
    while some other token has id 5 — popping the original key "5" would
    delete the freshly inserted mock key).

    Args:
        vocab: mapping of token string -> integer id.

    Returns:
        dict with one ``str(id): id`` entry per id in ``vocab``.
    """
    return {str(token_id): token_id for token_id in vocab.values()}


def main():
    input_path = "20B_tokenizer_chinese.json"

    # Use a context manager so the input file is closed promptly
    # (the original `json.load(open(...))` leaked the handle to the GC).
    with open(input_path, "r", encoding="utf-8") as f_in:
        tokenizer = json.load(f_in)

    tokenizer["model"]["vocab"] = mock_vocab(tokenizer["model"]["vocab"])

    out_path = input_path.replace(".json", ".mock.json")
    with open(out_path, "w", encoding="utf-8") as f_out:
        # ensure_ascii=False keeps non-ASCII (e.g. Chinese) text readable.
        json.dump(tokenizer, f_out, ensure_ascii=False, indent=2)


if __name__ == "__main__":
    main()