#!/usr/bin/env bash
# nohup sh run.sh --stage 0 --stop_stage 1 --system_version centos &
# sh run.sh --stage 0 --stop_stage 1 --system_version windows
# sh run.sh --stage 0 --stop_stage 0 --system_version centos
# sh run.sh --stage 2 --stop_stage 2 --system_version centos --checkpoint_name final
# sh run.sh --stage -1 --stop_stage 1
# bitsandbytes needs the CUDA runtime libraries on LD_LIBRARY_PATH
export LD_LIBRARY_PATH="/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
# params
system_version="windows";
verbose=true;
stage=0 # start from 0 if you need to start from data preparation
stop_stage=5
pretrained_model_name=gpt2-chinese-cluecorpussmall
train_subset=train.jsonl
valid_subset=valid.jsonl
final_model_name=gpt2_chinese_h_novel
checkpoint_name=final
# parse command line options of the form --name value (overrides the defaults above)
while true; do
[ -z "${1:-}" ] && break; # break if there are no arguments
case "$1" in
--*) name=$(echo "$1" | sed s/^--// | sed s/-/_/g);
eval '[ -z "${'"$name"'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
old_value="$(eval echo \$$name)";
if [ "${old_value}" == "true" ] || [ "${old_value}" == "false" ]; then
was_bool=true;
else
was_bool=false;
fi
# Set the variable to the right value-- the escaped quotes make it work if
# the option had spaces, like --cmd "queue.pl -sync y"
eval "${name}=\"$2\"";
# Check that Boolean-valued arguments are really Boolean.
if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
exit 1;
fi
shift 2;
;;
*) break;
esac
done
$verbose && echo "system_version: ${system_version}"
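# directory layout: intermediate files under file_dir, training checkpoints under serialization_dir, the exported model under final_model_dir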
work_dir="$(pwd)"
file_dir="$(pwd)/file_dir"
pretrained_models_dir="${work_dir}/../../../pretrained_models";
serialization_dir="${file_dir}/serialization_dir"
final_model_dir="${work_dir}/../../../trained_models/${final_model_name}";
mkdir -p "${file_dir}"
mkdir -p "${pretrained_models_dir}"
mkdir -p "${serialization_dir}"
mkdir -p "${final_model_dir}"
export PYTHONPATH="${work_dir}/../../.."
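# pick the Python interpreter for the target system.
# note: in a non-interactive bash script, alias expansion only takes effect with 'shopt -s expand_aliases'.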
if [ "${system_version}" == "windows" ]; then
alias python3='C:/Users/tianx/PycharmProjects/virtualenv/Transformers/Scripts/python.exe'
elif [ "${system_version}" == "centos" ]; then
# conda activate Transformers
alias python3='/usr/local/miniconda3/envs/Transformers/bin/python3'
elif [ "${system_version}" == "ubuntu" ]; then
# conda activate Transformers
alias python3='/usr/local/miniconda3/envs/Transformers/bin/python3'
fi
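# supported pretrained models and their Hugging Face repositories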
declare -A pretrained_model_dict
pretrained_model_dict=(
["gpt2-chinese-cluecorpussmall"]="https://huggingface.co/uer/gpt2-chinese-cluecorpussmall"
["gpt2"]="https://huggingface.co/gpt2"
["japanese-gpt2-medium"]="https://huggingface.co/rinna/japanese-gpt2-medium"
)
pretrained_model_dir="${pretrained_models_dir}/${pretrained_model_name}"
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
$verbose && echo "stage -1: download pretrained model"
cd "${file_dir}" || exit 1;
if [ ! -d "${pretrained_model_dir}" ]; then
cd "${pretrained_models_dir}" || exit 1;
repository_url="${pretrained_model_dict[${pretrained_model_name}]}"
git clone "${repository_url}"
cd "${pretrained_model_dir}" || exit 1;
rm -f flax_model.msgpack pytorch_model.bin tf_model.h5
wget "${repository_url}/resolve/main/pytorch_model.bin"
fi
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
$verbose && echo "stage 0: prepare data"
cd "${work_dir}" || exit 1;
python3 1.prepare_data.py \
--train_subset "${file_dir}/${train_subset}" \
--valid_subset "${file_dir}/${valid_subset}"
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
$verbose && echo "stage 1: train model"
cd "${work_dir}" || exit 1;
python3 2.train_model.py \
--train_subset "${file_dir}/${train_subset}" \
--valid_subset "${file_dir}/${valid_subset}" \
--pretrained_model_name_or_path "${pretrained_models_dir}/${pretrained_model_name}" \
--output_dir "${serialization_dir}"
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
$verbose && echo "stage 2: collect files"
cd "${work_dir}" || exit 1;
cp "${serialization_dir}/${checkpoint_name}/pytorch_model.bin" "${final_model_dir}/pytorch_model.bin"
cp "${pretrained_models_dir}/${pretrained_model_name}/config.json" "${final_model_dir}/config.json"
cp "${pretrained_models_dir}/${pretrained_model_name}/special_tokens_map.json" "${final_model_dir}/special_tokens_map.json"
cp "${pretrained_models_dir}/${pretrained_model_name}/tokenizer_config.json" "${final_model_dir}/tokenizer_config.json"
cp "${pretrained_models_dir}/${pretrained_model_name}/vocab.txt" "${final_model_dir}/vocab.txt"
fi