#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# set -x -e
# WORKDIR_ROOT is mandatory: every path below is derived from it.
# Quote the expansion so values containing spaces do not break the test, and
# exit non-zero so callers can tell the script did not run.
if [ -z "${WORKDIR_ROOT:-}" ]; then
  echo "please specify your working directory root in environment variable WORKDIR_ROOT. Exitting..." >&2
  exit 1
fi

# put intermediate files
TMP_DIR="$WORKDIR_ROOT/temp/af_xhv2"
# output {train,valid,test} files to dest
DEST="${WORKDIR_ROOT}/ML50/raw"
ROOT="${WORKDIR_ROOT}"

UTILS="$PWD/utils"
TMX2CORPUS="${UTILS}/tmx2corpus"
# Command line (interpreter + script) used to convert a .tmx file; it is
# expanded unquoted at the call site so that it splits into two words.
TMX_TOOL="python ${TMX2CORPUS}/tmx2corpus.py"

mkdir -p "$TMP_DIR"
mkdir -p "$DEST"
mkdir -p "$UTILS"
#######################################
# Download one OPUS TMX archive, convert it, and leave the two plain-text
# sides in the current directory.
# Globals:
#   TMX_TOOL (read) - command line used to convert the .tmx file
#   url (read)      - fallback download URL, used only when $4 is empty
# Arguments:
#   $1 - source language code
#   $2 - target language code
#   $3 - subset name
#   $4 - URL of the .tmx.gz archive
# Outputs:
#   <subset>.<src>-<tgt>.<src> and <subset>.<src>-<tgt>.<tgt> in the
#   directory the function was called from.
# Returns:
#   0 on success, non-zero if the URL is missing or any step fails.
#######################################
function download_opus(){
  local src=$1
  local tgt=$2
  local subset=$3
  # The fourth argument used to be stored under a misspelled name and then
  # ignored; honour it now, and fall back to the caller's $url so call sites
  # that pass an empty value keep working.
  local archive_url=${4:-${url:-}}
  local stem="$subset.$src-$tgt"
  local workdir="extract_$stem"
  local status=0

  if [ -z "$archive_url" ]; then
    echo "download_opus: no download URL for $stem" >&2
    return 1
  fi

  # -p so that re-running the script does not fail on an existing directory.
  mkdir -p "$workdir" || return 1
  pushd "$workdir" || return 1
  if [ ! -f "$stem.tmx.gz" ]; then
    # gzip -f: overwrite a .tmx left behind by a previous run instead of
    # refusing to decompress.
    # TMX_TOOL holds "interpreter script", so it must be word-split.
    # shellcheck disable=SC2086
    wget "$archive_url" -O "$stem.tmx.gz" \
      && gzip -d -f "$stem.tmx.gz" \
      && $TMX_TOOL "$stem.tmx" \
      && mv "bitext.$src" "../$stem.$src" \
      && mv "bitext.$tgt" "../$stem.$tgt" \
      || status=$?
  fi
  popd || return 1
  return "$status"
}
#######################################
# Concatenate per-subset bitext files into raw_train.<src>-<tgt>.{src,tgt}.
# Arguments:
#   $1    - source language code
#   $2    - target language code
#   $3... - subset names, given either as one whitespace-separated string
#           or as separate arguments (or any mix of the two)
# Outputs:
#   raw_train.<src>-<tgt>.<src> and raw_train.<src>-<tgt>.<tgt> in the
#   current directory; both are truncated before being filled.
#######################################
function concat_subsets(){
  local src=$1
  local tgt=$2
  shift 2

  # Only the third argument used to be read, so a caller expanding an array
  # into several arguments lost every subset but the first. Collect all
  # remaining arguments, still splitting each on whitespace as before.
  local -a subsets=()
  local arg
  for arg in "$@"; do
    # shellcheck disable=SC2206
    subsets+=( $arg )
  done

  local src_train="raw_train.$src-$tgt.$src"
  local tgt_train="raw_train.$src-$tgt.$tgt"
  : > "$src_train"
  : > "$tgt_train"

  local subset
  for subset in "${subsets[@]}"; do
    cat "$subset.$src-$tgt.$src" >> "$src_train"
    cat "$subset.$src-$tgt.$tgt" >> "$tgt_train"
  done
}
#######################################
# Write an endless, reproducible byte stream to stdout, for use as a
# --random-source. The stream is /dev/zero encrypted with AES-256-CTR under
# a key derived from the seed, so the same seed always yields the same bytes.
# Arguments:
#   $1 - seed (used as the openssl passphrase)
# Outputs:
#   Unbounded binary data on stdout; openssl's stderr is discarded.
#######################################
function get_seeded_random()
{
  local seed="$1"
  openssl enc -aes-256-ctr -pass pass:"$seed" -nosalt \
    </dev/zero 2>/dev/null
}
#######################################
# Shuffle raw_train.<src>-<tgt>.{src,tgt} and split them into valid/train.
# Both sides are shuffled with the same seeded random source, so they get the
# same permutation and stay line-aligned.
# Arguments:
#   $1 - source language code
#   $2 - target language code
#   $3 - number of lines to hold out for validation (optional, default 1500)
# Outputs:
#   shuffled.*, valid.* and train.* files for both sides, in the current
#   directory.
#######################################
function split_train_valid(){
  local src=$1
  local tgt=$2
  local valid_size=${3:-1500}
  local pair="$src-$tgt"
  local side

  for side in "$src" "$tgt"; do
    shuf --random-source=<(get_seeded_random 43) "raw_train.$pair.$side" \
      > "shuffled.$pair.$side"
    head -n "$valid_size" "shuffled.$pair.$side" > "valid.$pair.$side"
    # "tail -n +K" is the portable spelling; the obsolete "tail +K" form can
    # be taken for a file name by POSIX-conforming tail implementations.
    tail -n "+$((valid_size + 1))" "shuffled.$pair.$side" > "train.$pair.$side"
  done
}
#######################################
# Copy the valid/train files of the current directory into $DEST, renaming
# them from two-letter language codes to the long codes passed in.
# Globals:
#   DEST (read) - destination directory
# Arguments:
#   $1 - long source code (e.g. xh_ZA); its first two characters select the
#        local files
#   $2 - long target code (e.g. en_XX); same convention
# Outputs:
#   $DEST/{valid,train}.<lsrc>-<ltgt>.{<lsrc>,<ltgt>}
#######################################
function copy2dst(){
  local lsrc=$1
  local ltgt=$2
  # Local files are named after the two-letter prefix of each long code.
  local src=${lsrc:0:2}
  local tgt=${ltgt:0:2}
  local split

  for split in valid train; do
    cp "$split.$src-$tgt.$src" "$DEST/$split.$lsrc-$ltgt.$lsrc"
    cp "$split.$src-$tgt.$tgt" "$DEST/$split.$lsrc-$ltgt.$ltgt"
  done
}
# for xh-en: OPUS subset name -> TMX archive URL
declare -A xh_en_urls
xh_en_urls=(
  [Tatoeba]=https://object.pouta.csc.fi/OPUS-Tatoeba/v20190709/tmx/en-xh.tmx.gz
  [wikimedia]=https://object.pouta.csc.fi/OPUS-wikimedia/v20190628/tmx/en-xh.tmx.gz
  [memat]=https://object.pouta.csc.fi/OPUS-memat/v1/tmx/en-xh.tmx.gz
  [uedin]=https://object.pouta.csc.fi/OPUS-bible-uedin/v1/tmx/en-xh.tmx.gz
  [GNOME]=https://object.pouta.csc.fi/OPUS-GNOME/v1/tmx/en-xh.tmx.gz
  [XhosaNavy]=https://object.pouta.csc.fi/OPUS-XhosaNavy/v1/tmx/en-xh.tmx.gz
  [KDE4]=https://object.pouta.csc.fi/OPUS-KDE4/v2/tmx/en-xh.tmx.gz
  [Ubuntu]=https://object.pouta.csc.fi/OPUS-Ubuntu/v14.10/tmx/en-xh.tmx.gz
)

# -p: do not fail when the directory survives from an earlier run.
mkdir -p "$TMP_DIR/xh-en"
pushd "$TMP_DIR/xh-en" || exit 1
for k in "${!xh_en_urls[@]}"; do
  name=$k
  url=${xh_en_urls[$k]}
  echo "$name: $url"
  # Pass the URL itself; the misspelled "$ulr" was always empty.
  download_opus xh en "$name" "$url"
done
# Join the subset names into ONE space-separated word: a quoted [@] expansion
# spreads them over several arguments, and only the first one would land in
# the third parameter.
concat_subsets xh en "${!xh_en_urls[*]}"
split_train_valid xh en
copy2dst xh_ZA en_XX
popd || exit 1
##
# for af-en: OPUS subset name -> TMX archive URL
declare -A af_en_urls
af_en_urls=(
  [Tatoeba]=https://object.pouta.csc.fi/OPUS-Tatoeba/v20190709/tmx/af-en.tmx.gz
  [uedin]=https://object.pouta.csc.fi/OPUS-bible-uedin/v1/tmx/af-en.tmx.gz
  [GNOME]=https://object.pouta.csc.fi/OPUS-GNOME/v1/tmx/af-en.tmx.gz
  [QED]=https://object.pouta.csc.fi/OPUS-QED/v2.0a/tmx/af-en.tmx.gz
  [KDE4]=https://object.pouta.csc.fi/OPUS-KDE4/v2/tmx/af-en.tmx.gz
  [OpenSubtitles]=https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/tmx/af-en.tmx.gz
  [SPC]=https://object.pouta.csc.fi/OPUS-SPC/v1/tmx/af-en.tmx.gz
  [Ubuntu]=https://object.pouta.csc.fi/OPUS-Ubuntu/v14.10/tmx/af-en.tmx.gz
)

# -p: do not fail when the directory survives from an earlier run.
mkdir -p "$TMP_DIR/af-en"
pushd "$TMP_DIR/af-en" || exit 1
for k in "${!af_en_urls[@]}"; do
  name=$k
  url=${af_en_urls[$k]}
  echo "$name: $url"
  # Pass the URL itself; the misspelled "$ulr" was always empty.
  download_opus af en "$name" "$url"
done
# Join the subset names into ONE space-separated word: a quoted [@] expansion
# spreads them over several arguments, and only the first one would land in
# the third parameter.
concat_subsets af en "${!af_en_urls[*]}"
split_train_valid af en
copy2dst af_ZA en_XX
popd || exit 1