diff --git a/model3multi/training/trainer.py b/model3multi/training/trainer.py
index 7714cb5b614113f59b17c3b5562b8c331b93e32b..80ac8bc05c2cad44321a5376a68e18c38b8be051 100644
--- a/model3multi/training/trainer.py
+++ b/model3multi/training/trainer.py
@@ -17,7 +17,7 @@ from transformers import (
     logging,
     WEIGHTS_NAME,
     CONFIG_NAME,
-    GPT2Tokenizer,
+    GPT2TokenizerFast,
     GPT2LMHeadModel,
     GPT2Config,
     Trainer,
@@ -79,7 +79,11 @@ if __name__ == '__main__':
                         help='model configuration name')
     parser.add_argument('--tokenizer', type=str,
                         help='tokenizer path')
-
+    parser.add_argument('--lr', type=float,
+                        help='learning rate')
+    parser.add_argument('--block_size',type=int,
+                        help='data block size')
+
     parser.add_argument('--grad_acc', type=int,
                         help='number of gradient accumulation steps')
 
@@ -97,13 +101,13 @@ if __name__ == '__main__':
     num_workers = 40
 
-    block_size = 1024
+    block_size = cmd_args.block_size
     overwrite_cache=False
     keep_in_memory=True
 
-    logging.set_verbosity_debug()
+    #logging.set_verbosity_debug()
 
     tokenizer_path = cmd_args.tokenizer #'/projappl/project_2004600/risto/tokenizer/'
-
+    LEARNING_RATE = cmd_args.lr
     TRAIN_FILE_PATH = cmd_args.train_data
     TRAIN_SAMPLE_SIZE = cmd_args.train_sample_size
     EVAL_FILE_PATH = cmd_args.eval_data
@@ -133,25 +137,9 @@ if __name__ == '__main__':
         print("Using checkpoint dir",last_checkpoint)
 
     print(f'Load tokenizer from path: {tokenizer_path}')
-    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)
+    tokenizer = GPT2TokenizerFast.from_pretrained(tokenizer_path)
     print('done')
-    tokenizer.add_special_tokens({
-        "eos_token": "</s>",
-        "bos_token": "<s>",
-        "unk_token": "<unk>",
-        "pad_token": "<pad>",
-        "mask_token": "<mask>"
-    })
-    tokenizer.add_prefix_space=True
-
-
-
-#    config=GPT2Config(
-#        name_or_path=MODEL,
-#        vocab_size=tokenizer.vocab_size,
-#        bos_token_id=tokenizer.bos_token_id,
-#        eos_token_id=tokenizer.eos_token_id
 
 
     config = AutoConfig.from_pretrained(MODEL) # Doesn't load weights
     config.vocab_size=tokenizer.vocab_size
@@ -174,6 +162,7 @@ if __name__ == '__main__':
     if TRAIN_SAMPLE_SIZE != -1:
         np.random.seed(1992)
         train_paths = np.random.choice(train_paths,TRAIN_SAMPLE_SIZE,replace=False).tolist()
+    print("files:",train_paths)
 
     eval_dir = EVAL_FILE_PATH
     eval_paths = [str(x) for x in Path(eval_dir).glob("**/*.txt")]
@@ -252,14 +241,16 @@ if __name__ == '__main__':
         num_train_epochs=EPOCHS, # number of training epochs
         per_device_train_batch_size=BATCH_SIZE, # batch size for training
         per_device_eval_batch_size=BATCH_SIZE, # batch size for evaluation
+        log_level = 'warning',
         evaluation_strategy='steps',
         save_strategy='steps',
         save_steps=5000,
-#        learning_rate=0.001, produces na early on
+        learning_rate=LEARNING_RATE, # set lr in ds_config.json
         gradient_accumulation_steps=GRAD_ACC_STEPS,
         eval_steps = 5000, # Number of update steps between two evaluations.
+        weight_decay = 0.01,
-        warmup_steps= 100,# number of warmup steps for learning rate scheduler
+        warmup_steps= 500,# number of warmup steps for learning rate scheduler
         fp16=True, #whether to use floating point 16 for training TESTAA TODO
         deepspeed=DS_CONF,
     )
 
diff --git a/model3multi/training/trainer.sh b/model3multi/training/trainer.sh
new file mode 100644
index 0000000000000000000000000000000000000000..b10415a5eb8c2cbe667f7bc1a99403d55331ab83
--- /dev/null
+++ b/model3multi/training/trainer.sh
@@ -0,0 +1,67 @@
+#!/bin/bash
+#SBATCH --job-name=4x4-1024
+#SBATCH --account=project_2004407
+##SBATCH --account=project_2004600
+#SBATCH --time=10:00:00
+##SBATCH --time=00:15:00
+#SBATCH --partition=gpumedium
+#SBATCH --nodes=4
+#SBATCH --mem=100G
+#SBATCH --cpus-per-task=10
+#SBATCH --gres=gpu:a100:4
+#SBATCH -o logs/%j.out
+#SBATCH -e logs/%j.err
+
+rm -f logs/latest.out logs/latest.err
+
+ln -s $SLURM_JOBID.out logs/latest.out
+ln -s $SLURM_JOBID.err logs/latest.err
+
+export TORCH_EXTENSIONS_DIR=/projappl/project_2004600/risto/model3multi/training/torch_ext_dir/
+
+GPUS=$(echo $SLURM_JOB_GPUS | tr -s ', ' '[\n*]' | wc -l)
+
+module load pdsh/2.31
+
+### CREATES HOSTFILE ###
+rm -f hostfile.txt
+# Create deepspeed hostfile.
+scontrol show hostnames "$SLURM_JOB_NODELIST" \
+    | perl -pe 's/$/ slots=4/' \
+    > "hostfile.txt"
+MASTER_NODE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
+
+cat hostfile.txt
+
+export TMPDIR=/scratch/project_2004600/risto
+export HF_DATASETS_CACHE=$TMPDIR/"dataset_cache/"
+echo "Using TEMP-dir " $HF_DATASETS_CACHE
+
+
+export SING_IMAGE=/scratch/project_2004600/containers/latest.sif
+export SING_FLAGS="$SING_FLAGS -B /appl/spack/v014/install-tree/gcc-4.8.5/pdsh-2.31-cdzt5w/bin/:/usr/local/sbin,/projappl/project_2004600/risto/model3multi/training/node_init.sh:/data/ --nv"
+#export NCCL_DEBUG=INFO
+
+MODEL_OUTPUTDIR=/scratch/project_2004600/experiments/270921-mahti-4x4-gptsmall-epochs5-finbert10-1024/
+
+echo "start running trainer script"
+singularity_wrapper exec deepspeed --hostfile=hostfile.txt --master_addr=$MASTER_NODE /projappl/project_2004600/risto/model3multi/training/trainer.py \
+    --train_data /scratch/project_2004600/FinBERT-data/batched-data-bal/train/ \
+    --eval_data /scratch/project_2004600/FinBERT-data/batched-data-bal/eval/ \
+    --lr 1e-4 \
+    --block_size 1024 \
+    --tokenizer /projappl/project_2004600/risto/tokenizer_openai/ \
+    --train_sample_size 2 \
+    --model_output_dir $MODEL_OUTPUTDIR \
+    --from_checkpoint 'no' \
+    --batch_size 15 \
+    --epochs 5 \
+    --grad_acc 1 \
+    --model 'gpt2' \
+    --cache_dir $HF_DATASETS_CACHE \
+    --deepspeed --deepspeed_config /projappl/project_2004600/risto/model3multi/training/ds_config.json
+
+seff $SLURM_JOBID
+seff $SLURM_JOBID > $MODEL_OUTPUTDIR/seff.txt
+cp logs/$SLURM_JOBID* ds_config.json $MODEL_OUTPUTDIR