ir_chinese_medqa / utility /rankings /split_by_offset.py
欧卫
'add_app_files'
58627fa
raw
history blame
1.33 kB
"""
Split the ranked lists after retrieval with a merged query set.
"""
import os
import random
from argparse import ArgumentParser
def main(args):
    """Split a merged ranking file into one ranking file per query split.

    Every line of ``args.ranking`` is tab-separated: ``qid``, ``pid``,
    ``rank``, followed by any number of extra columns. The merged ``qid``
    encodes which split it belongs to: ``qid // args.gap - 1`` selects the
    entry of ``args.names`` and ``qid % args.gap`` is the qid written out.
    All other columns are copied through unchanged.

    Args:
        args: Namespace providing ``ranking`` (path of the merged ranking
            file), ``names`` (split suffixes, in offset order) and ``gap``
            (integer offset separating the splits).

    Raises:
        AssertionError: If any of the ``<ranking>.<name>`` output paths
            already exists; nothing is written in that case.
    """
    output_paths = ['{}.{}'.format(args.ranking, split) for split in args.names]

    # Raised explicitly (instead of via `assert`) so the overwrite guard is
    # still enforced when Python runs with -O, which strips assert statements.
    if any(os.path.exists(path) for path in output_paths):
        raise AssertionError(output_paths)

    output_files = []

    try:
        # Opened one by one inside the try so that a failure half-way through
        # (or while processing the ranking) still closes the earlier handles.
        for path in output_paths:
            output_files.append(open(path, 'w'))

        with open(args.ranking) as f:
            for line in f:
                qid, pid, rank, *other = line.strip().split('\t')
                qid = int(qid)

                # NOTE(review): a qid below `gap` gives index -1, which Python
                # resolves to the *last* split -- confirm merged qids always
                # start at 1 * gap.
                split_output_file = output_files[qid // args.gap - 1]
                qid = qid % args.gap

                columns = [qid, pid, rank, *other]
                split_output_file.write('\t'.join(str(x) for x in columns) + '\n')

        print(f.name)
    finally:
        for output_file in output_files:
            output_file.close()

    print("#> Done!")
if __name__ == "__main__":
    # Seed kept from the original script; nothing in main() draws random
    # numbers, so it has no effect on the output files.
    random.seed(12345)

    # The previous description ('Subsample the dev set.') belonged to a
    # different utility and misdescribed this script in --help.
    parser = ArgumentParser(description='Split a ranking file produced with a merged query set '
                                        'into one ranking file per split.')

    parser.add_argument('--ranking', dest='ranking', required=True,
                        help='path of the merged ranking file (tab-separated: qid, pid, rank, ...)')

    # Order matters: the i-th name receives the lines whose qid // gap - 1 == i.
    parser.add_argument('--names', dest='names', required=False, default=['train', 'dev', 'test'],
                        type=str, nargs='+',
                        help='split names, in offset order; outputs are written to <ranking>.<name>')

    # Must be larger than any individual query set so split offsets cannot collide.
    parser.add_argument('--gap', dest='gap', required=False, default=1_000_000_000, type=int,
                        help='qid offset separating consecutive splits in the merged query set')

    args = parser.parse_args()

    main(args)