# tokenizer-arena/stats/sample.py
import os
import glob
import json


def extract():
    """Copy the first 10 records of every stats file into compression_rate/."""
    # Ensure the output directory exists before writing samples into it.
    os.makedirs("compression_rate", exist_ok=True)
    for file_path in glob.glob("compression_rate_all/*.json"):
        file_name = os.path.basename(file_path)
        print(file_name)
        with open(file_path, "r", encoding="utf-8") as f_in:
            data = json.load(f_in)
        with open(os.path.join("compression_rate", file_name), "w", encoding="utf-8") as f_out:
            f_out.write(json.dumps(data[:10], ensure_ascii=False, indent=2))


# def continue_extract():
#     """Report sampled files that still contain more than 10 records."""
#     for file_path in glob.glob("compression_rate/*.json"):
#         file_name = os.path.basename(file_path)
#         with open(file_path, "r", encoding="utf-8") as f_in:
#             data = json.load(f_in)
#         if len(data) > 10:
#             print(file_name, len(data))
#             # with open(file_name, "w", encoding="utf-8") as f_out:
#             #     f_out.write(json.dumps(data[:10], ensure_ascii=False, indent=2))
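

# Expected directory layout (an assumption inferred from the paths above):
#   compression_rate_all/   full per-tokenizer stats, one JSON list per file
#   compression_rate/       10-record samples written by extract()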

if __name__ == "__main__":
    extract()