Spaces:
Running
Running
File size: 6,944 Bytes
650c5f6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 |
import shutil
import os, sys
from subprocess import check_call, check_output
import glob
import argparse
import shutil
import pathlib
import itertools
def call_output(cmd):
print(f"Executing: {cmd}")
ret = check_output(cmd, shell=True)
print(ret)
return ret
def call(cmd):
print(cmd)
check_call(cmd, shell=True)
WORKDIR_ROOT = os.environ.get('WORKDIR_ROOT', None)
if WORKDIR_ROOT is None or not WORKDIR_ROOT.strip():
print('please specify your working directory root in OS environment variable WORKDIR_ROOT. Exitting..."')
sys.exit(-1)
SPM_PATH = os.environ.get('SPM_PATH', None)
if SPM_PATH is None or not SPM_PATH.strip():
print("Please install sentence piecence from https://github.com/google/sentencepiece and set SPM_PATH pointing to the installed spm_encode.py. Exitting...")
sys.exit(-1)
SPM_MODEL = f'{WORKDIR_ROOT}/sentence.bpe.model'
SPM_VOCAB = f'{WORKDIR_ROOT}/dict_250k.txt'
SPM_ENCODE = f'{SPM_PATH}'
if not os.path.exists(SPM_MODEL):
call(f"wget https://dl.fbaipublicfiles.com/fairseq/models/mbart50/sentence.bpe.model -O {SPM_MODEL}")
if not os.path.exists(SPM_VOCAB):
call(f"wget https://dl.fbaipublicfiles.com/fairseq/models/mbart50/dict_250k.txt -O {SPM_VOCAB}")
def get_data_size(raw):
cmd = f'wc -l {raw}'
ret = call_output(cmd)
return int(ret.split()[0])
def encode_spm(model, direction, prefix='', splits=['train', 'test', 'valid'], pairs_per_shard=None):
src, tgt = direction.split('-')
for split in splits:
src_raw, tgt_raw = f'{RAW_DIR}/{split}{prefix}.{direction}.{src}', f'{RAW_DIR}/{split}{prefix}.{direction}.{tgt}'
if os.path.exists(src_raw) and os.path.exists(tgt_raw):
cmd = f"""python {SPM_ENCODE} \
--model {model}\
--output_format=piece \
--inputs {src_raw} {tgt_raw} \
--outputs {BPE_DIR}/{direction}{prefix}/{split}.bpe.{src} {BPE_DIR}/{direction}{prefix}/{split}.bpe.{tgt} """
print(cmd)
call(cmd)
def binarize_(
bpe_dir,
databin_dir,
direction, spm_vocab=SPM_VOCAB,
splits=['train', 'test', 'valid'],
):
src, tgt = direction.split('-')
try:
shutil.rmtree(f'{databin_dir}', ignore_errors=True)
os.mkdir(f'{databin_dir}')
except OSError as error:
print(error)
cmds = [
"fairseq-preprocess",
f"--source-lang {src} --target-lang {tgt}",
f"--destdir {databin_dir}/",
f"--workers 8",
]
if isinstance(spm_vocab, tuple):
src_vocab, tgt_vocab = spm_vocab
cmds.extend(
[
f"--srcdict {src_vocab}",
f"--tgtdict {tgt_vocab}",
]
)
else:
cmds.extend(
[
f"--joined-dictionary",
f"--srcdict {spm_vocab}",
]
)
input_options = []
if 'train' in splits and glob.glob(f"{bpe_dir}/train.bpe*"):
input_options.append(
f"--trainpref {bpe_dir}/train.bpe",
)
if 'valid' in splits and glob.glob(f"{bpe_dir}/valid.bpe*"):
input_options.append(f"--validpref {bpe_dir}/valid.bpe")
if 'test' in splits and glob.glob(f"{bpe_dir}/test.bpe*"):
input_options.append(f"--testpref {bpe_dir}/test.bpe")
if len(input_options) > 0:
cmd = " ".join(cmds + input_options)
print(cmd)
call(cmd)
def binarize(
databin_dir,
direction, spm_vocab=SPM_VOCAB, prefix='',
splits=['train', 'test', 'valid'],
pairs_per_shard=None,
):
def move_databin_files(from_folder, to_folder):
for bin_file in glob.glob(f"{from_folder}/*.bin") \
+ glob.glob(f"{from_folder}/*.idx") \
+ glob.glob(f"{from_folder}/dict*"):
try:
shutil.move(bin_file, to_folder)
except OSError as error:
print(error)
bpe_databin_dir = f"{BPE_DIR}/{direction}{prefix}_databin"
bpe_dir = f"{BPE_DIR}/{direction}{prefix}"
if pairs_per_shard is None:
binarize_(bpe_dir, bpe_databin_dir, direction, spm_vocab=spm_vocab, splits=splits)
move_databin_files(bpe_databin_dir, databin_dir)
else:
# binarize valid and test which will not be sharded
binarize_(
bpe_dir, bpe_databin_dir, direction,
spm_vocab=spm_vocab, splits=[s for s in splits if s != "train"])
for shard_bpe_dir in glob.glob(f"{bpe_dir}/shard*"):
path_strs = os.path.split(shard_bpe_dir)
shard_str = path_strs[-1]
shard_folder = f"{bpe_databin_dir}/{shard_str}"
databin_shard_folder = f"{databin_dir}/{shard_str}"
print(f'working from {shard_folder} to {databin_shard_folder}')
os.makedirs(databin_shard_folder, exist_ok=True)
binarize_(
shard_bpe_dir, shard_folder, direction,
spm_vocab=spm_vocab, splits=["train"])
for test_data in glob.glob(f"{bpe_databin_dir}/valid.*") + glob.glob(f"{bpe_databin_dir}/test.*"):
filename = os.path.split(test_data)[-1]
try:
os.symlink(test_data, f"{databin_shard_folder}/{filename}")
except OSError as error:
print(error)
move_databin_files(shard_folder, databin_shard_folder)
def load_langs(path):
with open(path) as fr:
langs = [l.strip() for l in fr]
return langs
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("--data_root", default=f"{WORKDIR_ROOT}/ML50")
parser.add_argument("--raw-folder", default='raw')
parser.add_argument("--bpe-folder", default='bpe')
parser.add_argument("--databin-folder", default='databin')
args = parser.parse_args()
DATA_PATH = args.data_root #'/private/home/yuqtang/public_data/ML50'
RAW_DIR = f'{DATA_PATH}/{args.raw_folder}'
BPE_DIR = f'{DATA_PATH}/{args.bpe_folder}'
DATABIN_DIR = f'{DATA_PATH}/{args.databin_folder}'
os.makedirs(BPE_DIR, exist_ok=True)
raw_files = itertools.chain(
glob.glob(f'{RAW_DIR}/train*'),
glob.glob(f'{RAW_DIR}/valid*'),
glob.glob(f'{RAW_DIR}/test*'),
)
directions = [os.path.split(file_path)[-1].split('.')[1] for file_path in raw_files]
for direction in directions:
prefix = ""
splits = ['train', 'valid', 'test']
try:
shutil.rmtree(f'{BPE_DIR}/{direction}{prefix}', ignore_errors=True)
os.mkdir(f'{BPE_DIR}/{direction}{prefix}')
os.makedirs(DATABIN_DIR, exist_ok=True)
except OSError as error:
print(error)
spm_model, spm_vocab = SPM_MODEL, SPM_VOCAB
encode_spm(spm_model, direction=direction, splits=splits)
binarize(DATABIN_DIR, direction, spm_vocab=spm_vocab, splits=splits)
|