diff --git a/gpt-fin/training/hostfile.txt b/gpt-fin/training/hostfile.txt
new file mode 100644
index 0000000000000000000000000000000000000000..0f78118c9b0c47f9034e1a77da95182a08d816b8
--- /dev/null
+++ b/gpt-fin/training/hostfile.txt
@@ -0,0 +1,4 @@
+g1102 slots=4
+g4102 slots=4
+g6101 slots=4
+g6102 slots=4
diff --git a/gpt-fin/training/node_init.sh b/gpt-fin/training/node_init.sh
new file mode 100755
index 0000000000000000000000000000000000000000..ecffc841b1fbe0093eeff83a689f5127dee9602c
--- /dev/null
+++ b/gpt-fin/training/node_init.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+# Initialize compute node environment.
+
+# When using deepspeed and pdsh, source this script in
+# deepspeed/launcher/multinode_runner.py before calling
+# deepspeed.launcher.launch.
+CSC_ENV_INIT='/appl/profile/zz-csc-env.sh'
+
+if [ -f "$CSC_ENV_INIT" ]; then
+    echo "$0: sourcing $CSC_ENV_INIT" >&2
+    source "$CSC_ENV_INIT"
+else
+    echo "$0: no $CSC_ENV_INIT, exiting" >&2
+    exit 1
+fi
+
+module purge
+export SING_IMAGE=/scratch/project_2004600/containers/latest.sif
diff --git a/gpt-fin/training/singularity_trainer.bash b/gpt-fin/training/singularity_trainer.bash
deleted file mode 100644
index 219eff61a0253c6a0a08589c704b7d8fe08681b5..0000000000000000000000000000000000000000
--- a/gpt-fin/training/singularity_trainer.bash
+++ /dev/null
@@ -1,94 +0,0 @@
-#!/bin/bash
-#SBATCH --job-name=large_
-#SBATCH --account=project_2004600
-##SBATCH --time=72:00:00
-#SBATCH --time=00:15:00
-#SBATCH --partition=gputest
-#SBATCH --nodes=2
-#SBATCH --mem=200G
-#SBATCH --cpus-per-task=40
-#SBATCH --gres=gpu:v100:4,nvme:200
-#SBATCH -o logs/%j.out
-#SBATCH -e logs/%j.err
-
-rm -f logs/latest.out logs/latest.err
-
-ln -s $SLURM_JOBID.out logs/latest.out
-ln -s $SLURM_JOBID.err logs/latest.err
-
-export TORCH_EXTENSIONS_DIR=/projappl/project_2004600/risto/testing/singularity_trainer/ds_env/torch_ext/
-
-GPUS=$(echo $SLURM_JOB_GPUS | tr -s ', ' '[\n*]' | wc -l)
-module load pytorch
-module load gcc/9.1.0
-module load cuda/11.1.0
-module load pdsh/2.34
-
-### CREATES HOSTFILE ###
-
-rm -f hostfile.txt
-while IFS=',' read -ra ADDR; do
-    for i in "${ADDR[@]}"; do
-        echo $i slots=$GPUS >>hostfile.txt
-    done
-done <<< $SLURM_JOB_NODELIST
-
-### /CREATES HOSTFILE ###
-
-cat hostfile.txt
-
-export SINGULARITYENV_APPEND_PATH="/users/rluukkon/.local/bin"
-export CPATH=/appl/spack/install-tree/gcc-9.1.0/python-3.6.8-ecovls/include/python3.6m:$CPATH
-
-#export TMPDIR=/scratch/project_2004600/risto/
-export HF_DATASETS_CACHE=$TMPDIR/"dataset_cache/"
-
-echo "Using TEMP-dir " $HF_DATASETS_CACHE
-#export PATH=$HOME/local/bin:$HOME/.local/bin:$PATH
-
-echo $PATH
-#export PATH="$HOME/local/bin:$PATH"
-export LD_LIBRARY_PATH=$HOME/.local/lib/python3.8/site-packages/:$LD_LIBRARY_PATH
-
-#export HF_DATASETS_IN_MEMORY_MAX_SIZE=0
-#echo "Copying data to local storage"
-#cp -r /scratch/project_2004600/risto/batched-data $TMPDIR
-#cp -r /scratch/project_2004600/FinBERT-data/gpt2-eval $TMPDIR
-#echo "done"
-
-
-#MODEL_OUT=/scratch/project_2004600/risto/model_out_TEST/
-MODEL_OUT=/scratch/project_2004600/risto/gpt2_large_all_data/
-#MODEL_CHECKPOINT=/scratch/project_2004600/risto/model_long_2_1x4/
-MODEL_CHECKPOINT='no'
-#BATCH_SIZE=16 # for medium if blocks are 500
-BATCH_SIZE=16 # blocks are 250
-EPOCHS=5
-DS_CONFIG=/projappl/project_2004600/risto/model3multi/training/ds_config.json
-#DS_CONFIG=/projappl/project_2004600/risto/testing/singularity_trainer/zero_stage_3.json
-TRAIN_SAMPLE_SIZE=10
-GRAD_ACC=1
-MODEL='gpt2'
-#TRAIN_DATA=$TMPDIR/batched-data/
-#EVAL_DATA=$TMPDIR/gpt2-eval/
-#EVAL_DATA=/scratch/project_2004600/FinBERT-data/gpt2-eval/
-#TRAIN_DATA=/scratch/project_2004600/risto/batched-data/
-TRAIN_DATA=/scratch/project_2004600/FinBERT-data/train/
-EVAL_DATA=/scratch/project_2004600/FinBERT-data/eval/
-
-
-echo "start running trainer script"
-srun deepspeed_singularity --hostfile=hostfile.txt /projappl/project_2004600/risto/model3multi/training/trainer.py \
-    --train_data $TRAIN_DATA \
-    --train_sample_size $TRAIN_SAMPLE_SIZE \
-    --eval_data $EVAL_DATA \
-    --model_output_dir $MODEL_OUT \
-    --from_checkpoint $MODEL_CHECKPOINT \
-    --batch_size $BATCH_SIZE \
-    --epochs $EPOCHS \
-    --grad_acc $GRAD_ACC \
-    --model $MODEL \
-    --cache_dir $HF_DATASETS_CACHE \
-    --deepspeed --deepspeed_config $DS_CONFIG
-
-seff $SLURM_JOBId
diff --git a/gpt-fin/training/trainer.bash b/gpt-fin/training/trainer.bash
index 4bb9a2840757482f2bed2177a3baf4f3364ec837..06ccebbb8a56966156f1e1844bebc49a3b02e4cf 100644
--- a/gpt-fin/training/trainer.bash
+++ b/gpt-fin/training/trainer.bash
@@ -29,14 +29,15 @@ export TMPDIR=/scratch/project_2004600/risto
 export HF_DATASETS_CACHE=$TMPDIR/"dataset_cache/"
 echo "Using TEMP-dir " $HF_DATASETS_CACHE
 export SING_IMAGE=/scratch/project_2004600/containers/latest.sif
-export SING_FLAGS="$SING_FLAGS -B /appl/spack/v014/install-tree/gcc-4.8.5/pdsh-2.31-cdzt5w/bin/:/usr/local/sbin,"$pwd"/node_init.sh:/data/ --nv"
+export SING_FLAGS="$SING_FLAGS -B /appl/spack/v014/install-tree/gcc-4.8.5/pdsh-2.31-cdzt5w/bin/:/usr/local/sbin,$(pwd)/node_init.sh:/data/ --nv"
+echo $SING_FLAGS
 #export NCCL_DEBUG=INFO
 MODEL_OUTPUTDIR=/scratch/project_2004600/experiments/270921-mahti-4x4-gptsmall-epochs5-finbert10-1024/
 echo "start running trainer script"
-singularity_wrapper exec deepspeed --hostfile=hostfile.txt --master_addr=$MASTER_NODE $pwd/trainer.py \
+singularity_wrapper exec deepspeed --hostfile=hostfile.txt --master_addr=$MASTER_NODE $(pwd)/trainer.py \
     --train_data /scratch/project_2004600/FinBERT-data/batched-data-bal/train/ \
     --eval_data /scratch/project_2004600/FinBERT-data/batched-data-bal/eval/ \
-    --lr 1e-4 \
+    --lr 5e-5 \
     --block_size 1024 \
     --tokenizer /projappl/project_2004600/risto/tokenizer_openai/ \
     --train_sample_size 2 \
@@ -47,7 +48,7 @@ singularity_wrapper exec deepspeed --hostfile=hostfile.txt --master_addr=$MASTER
     --grad_acc 1 \
     --model 'gpt2' \
     --cache_dir $HF_DATASETS_CACHE \
-    --deepspeed --deepspeed_config $pwd
+    --deepspeed --deepspeed_config $(pwd)/ds_config.json
 
 seff $SLURM_JOBID
 seff $SLURM_JOBID > $MODEL_OUTPUTDIR/seff.txt
diff --git a/gpt-fin/training/trainer.py b/gpt-fin/training/trainer.py
index d15a0b729e52d6b6e3a2500af1a643c1b28b3061..80ac8bc05c2cad44321a5376a68e18c38b8be051 100644
--- a/gpt-fin/training/trainer.py
+++ b/gpt-fin/training/trainer.py
@@ -17,7 +17,7 @@ from transformers import (
     logging,
     WEIGHTS_NAME,
     CONFIG_NAME,
-    GPT2Tokenizer,
+    GPT2TokenizerFast,
     GPT2LMHeadModel,
     GPT2Config,
     Trainer,
@@ -77,6 +77,13 @@ if __name__ == '__main__':
                         help='number of training epochs')
     parser.add_argument('--model', type=str,
                         help='model configuration name')
+    parser.add_argument('--tokenizer', type=str,
+                        help='tokenizer path')
+    parser.add_argument('--lr', type=float,
+                        help='learning rate')
+    parser.add_argument('--block_size', type=int,
+                        help='data block size')
+
     parser.add_argument('--grad_acc', type=int,
                         help='number of gradient accumulation steps')
 
@@ -94,13 +101,13 @@ if __name__ == '__main__':
 
     num_workers = 40
 
-    block_size = 300
+    block_size = cmd_args.block_size
     overwrite_cache=False
     keep_in_memory=True
 
-    logging.set_verbosity_debug()
-    tokenizer_path = '/projappl/project_2004600/risto/tokenizer/'
-
+    #logging.set_verbosity_debug()
+    tokenizer_path = cmd_args.tokenizer #'/projappl/project_2004600/risto/tokenizer/'
+    LEARNING_RATE = cmd_args.lr
     TRAIN_FILE_PATH = cmd_args.train_data
     TRAIN_SAMPLE_SIZE = cmd_args.train_sample_size
     EVAL_FILE_PATH = cmd_args.eval_data
@@ -130,25 +137,9 @@ if __name__ == '__main__':
     print("Using checkpoint dir",last_checkpoint)
 
     print(f'Load tokenizer from path: {tokenizer_path}')
-    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)
+    tokenizer = GPT2TokenizerFast.from_pretrained(tokenizer_path)
     print('done')
-    tokenizer.add_special_tokens({
-        "eos_token": "</s>",
-        "bos_token": "<s>",
-        "unk_token": "<unk>",
-        "pad_token": "<pad>",
-        "mask_token": "<mask>"
-    })
-    tokenizer.add_prefix_space=True
-
-
-
-#    config=GPT2Config(
-#        name_or_path=MODEL,
-#        vocab_size=tokenizer.vocab_size,
-#        bos_token_id=tokenizer.bos_token_id,
-#        eos_token_id=tokenizer.eos_token_id
 
     config = AutoConfig.from_pretrained(MODEL) # Doesn't load weights
     config.vocab_size=tokenizer.vocab_size
 
@@ -171,6 +162,7 @@ if __name__ == '__main__':
     if TRAIN_SAMPLE_SIZE != -1:
         np.random.seed(1992)
         train_paths = np.random.choice(train_paths,TRAIN_SAMPLE_SIZE,replace=False).tolist()
+        print("files:",train_paths)
 
     eval_dir = EVAL_FILE_PATH
     eval_paths = [str(x) for x in Path(eval_dir).glob("**/*.txt")]
@@ -179,7 +171,7 @@ if __name__ == '__main__':
     print(f"Eval files count: {len(eval_paths)}, evaluation files: ", eval_paths)
     start_time = perf_counter()
     dataset = load_dataset(
-#        cache_dir = cmd_args.cache_dir,
+        cache_dir = cmd_args.cache_dir,
         path = 'text', # path to loading script. 'text' is default
         data_files = {'train': train_paths, 'eval': eval_paths},
 #        keep_in_memory=keep_in_memory
@@ -249,14 +241,16 @@ if __name__ == '__main__':
         num_train_epochs=EPOCHS, # number of training epochs
         per_device_train_batch_size=BATCH_SIZE, # batch size for training
         per_device_eval_batch_size=BATCH_SIZE, # batch size for evaluation
+        log_level = 'warning',
         evaluation_strategy='steps',
         save_strategy='steps',
-        save_steps=10000,
-#        learning_rate=0.001, produces na early on
+        save_steps=5000,
+        learning_rate=LEARNING_RATE, # set from the --lr argument
         gradient_accumulation_steps=GRAD_ACC_STEPS,
-        eval_steps = 10000, # Number of update steps between two evaluations.
+        eval_steps = 5000, # Number of update steps between two evaluations.
+        weight_decay = 0.01,
 
-        warmup_steps= 100,# number of warmup steps for learning rate scheduler
+        warmup_steps= 500,# number of warmup steps for learning rate scheduler
         fp16=True, #whether to use floating point 16 for training TESTAA TODO
         deepspeed=DS_CONF,
     )
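
Note: ds_config.json, which trainer.bash now passes via --deepspeed_config $(pwd)/ds_config.json, is not included in this diff, so its actual contents are unknown. As a rough guide only, the sketch below writes a minimal config that assumes ZeRO stage 2 and leans on the Hugging Face Trainer's DeepSpeed integration, which replaces "auto" placeholders with the matching TrainingArguments values (batch size, gradient accumulation steps, fp16) when the Trainer initializes.

# Hypothetical sketch only: the real ds_config.json is not shown in this
# diff, and the ZeRO stage is an assumption. "auto" values are resolved by
# the transformers DeepSpeed integration, so a config like this will not
# work with a bare `deepspeed` launch that bypasses the HF Trainer.
cat > ds_config.json <<'EOF'
{
  "train_micro_batch_size_per_gpu": "auto",
  "gradient_accumulation_steps": "auto",
  "fp16": {
    "enabled": "auto"
  },
  "zero_optimization": {
    "stage": 2
  }
}
EOF

Leaving the optimizer and scheduler sections out of the config would mean the learning rate, weight decay, and warmup come from the TrainingArguments in trainer.py (--lr, weight_decay, warmup_steps) rather than being duplicated in the DeepSpeed config.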