Spaces:

fclong
/

summary

Runtime error

summary / fengshen /tokenizer /sentencepiece /pretrain_google_sp.sh

Upload 396 files

8ebda9e about 2 years ago

1.23 kB

	#!/bin/bash
	#SBATCH --job-name=google_sp
	#SBATCH --nodes=1
	#SBATCH --cpus-per-task=100
	#SBATCH --ntasks-per-node=1
	#SBATCH -o %x-%j.log

	# Train a SentencePiece model with spm_train as a single-node Slurm job.
	#
	# The trained model/vocab files are written using the prefix
	# ${TYPE}_v${VOCAB_SIZE}_s${SEED}_cov${COV}_max${MAX_LENGTH}.
	# All paths and hyper-parameters are configured in the variables below.

	# -x: trace commands into the job log, -e: abort on the first failing
	# command, -u: treat a reference to an unset variable as an error.
	set -x -e -u

	echo "START TIME: $(date)"

	BIN_PATH=/cognitive_comp/gaoxinyu/sentencepiece/sentencepiece/bin/usr/local/bin/spm_train
	# Append the SentencePiece library directory. The ${VAR:+...} form avoids a
	# leading ':' when LD_LIBRARY_PATH is unset or empty; an empty entry would
	# make the dynamic loader search the current working directory.
	export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}/cognitive_comp/gaoxinyu/sentencepiece/sentencepiece/bin/usr/local/lib"
	INPUT_FILE=/cognitive_comp/gaoxinyu/github/Fengshenbang-LM/fengshen/tokenizer/sentencepiece/shuffle_corpus_59132213.txt
	# Smaller corpus; not used by default, point INPUT_FILE at it for quick runs.
	INPUT_FILE_SMALL=/cognitive_comp/gaoxinyu/github/Fengshenbang-LM/fengshen/tokenizer/sentencepiece/shuffle_corpus_1000000.txt


	VOCAB_SIZE=40000
	COV=0.9995
	MAX_LENGTH=6
	TYPE=bpe
	SEED=42
	# NOTE(review): despite its name this value is passed to
	# --input_sentence_size, not to a length option - confirm that is intended.
	MAX_INPUT_LENGTH=100000
	# Value passed to --max_sentence_length.
	MAX_SENTENCE_LENGTH=100000
	# Follow the Slurm CPU allocation so the thread count cannot drift from
	# "#SBATCH --cpus-per-task"; fall back to 100 when run outside Slurm.
	NUM_THREADS=${SLURM_CPUS_PER_TASK:-100}

	# Fail fast with a clear message instead of an obscure spm_train error.
	if [[ ! -x "${BIN_PATH}" ]]; then
	  echo "spm_train not found or not executable: ${BIN_PATH}" >&2
	  exit 1
	fi
	if [[ ! -r "${INPUT_FILE}" ]]; then
	  echo "input corpus not readable: ${INPUT_FILE}" >&2
	  exit 1
	fi

	# Build the argument list as an array: each element reaches spm_train as
	# exactly one argument, with no eval and no re-parsing by the shell.
	OPTIONS=(
	  --input="${INPUT_FILE}"
	  --vocab_size="${VOCAB_SIZE}"
	  --character_coverage="${COV}"
	  --max_sentencepiece_length="${MAX_LENGTH}"
	  --model_type="${TYPE}"
	  --model_prefix="${TYPE}_v${VOCAB_SIZE}_s${SEED}_cov${COV}_max${MAX_LENGTH}"
	  --random_seed="${SEED}"
	  --max_sentence_length="${MAX_SENTENCE_LENGTH}"
	  --shuffle_input_sentence=true
	  --input_sentence_size="${MAX_INPUT_LENGTH}"
	  --minloglevel 1
	  --num_threads="${NUM_THREADS}"
	  --train_extremely_large_corpus=true
	)

	"${BIN_PATH}" "${OPTIONS[@]}"