diff --git a/model3multi/training/own_singularity_trainer.sh b/model3multi/training/own_singularity_trainer.sh
index 0627edc41fb3c19c167ec0aef27c43321b7f46d8..fb4bf1ccc4be8ab15dd7ac711da7c4282c72063b 100644
--- a/model3multi/training/own_singularity_trainer.sh
+++ b/model3multi/training/own_singularity_trainer.sh
@@ -1,13 +1,13 @@
 #!/bin/bash
-#SBATCH --job-name=own_test
+#SBATCH --job-name=2x4test
 #SBATCH --account=project_2004600
 ##SBATCH --time=10:00:00
-#SBATCH --time=00:15:00
+#SBATCH --time=01:00:00
 #SBATCH --partition=gpumedium
 #SBATCH --nodes=2
 #SBATCH --mem=100G
 #SBATCH --cpus-per-task=10
-#SBATCH --gres=gpu:a100:4,nvme:200
+#SBATCH --gres=gpu:a100:4
 #SBATCH -o logs/%j.out
 #SBATCH -e logs/%j.err
 
@@ -21,74 +21,40 @@ GPUS=$(echo $SLURM_JOB_GPUS | tr -s ', ' '[\n*]' | wc -l)
 module load pdsh/2.31
-export PDSH_SSH_ARGS_APPEND="-v -o StrictHostKeyChecking=no"
-
 ### CREATES HOSTFILE ###
-
 rm -f hostfile.txt
 # Create deepspeed hostfile.
-
 scontrol show hostnames "$SLURM_JOB_NODELIST" \
     | perl -pe 's/$/ slots=4/' \
     > "hostfile.txt"
-
-# `scontrol show hostnames` turns condenced nodelist
-# (e.g. "g[1102,1201]") into list of host names (e.g. "g1102\ng1102")
 MASTER_NODE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
 cat hostfile.txt
 
-#export TMPDIR=/scratch/project_2004600/risto/
+export TMPDIR=/scratch/project_2004600/risto
 export HF_DATASETS_CACHE=$TMPDIR/"dataset_cache/"
-
 echo "Using TEMP-dir " $HF_DATASETS_CACHE
 
-#### MODEL ARGS ###
-
-MODEL_OUT=/scratch/project_2004600/risto/model_out_TEST/
-#MODEL_OUT=/scratch/project_2004600/risto/gpt2_large_grad_acc_100/
-#MODEL_CHECKPOINT=/scratch/project_2004600/risto/gpt-
-MODEL_CHECKPOINT='no'
-#BATCH_SIZE=16 # for medium if blocks are 500
-BATCH_SIZE=11 # blocks are 512
-EPOCHS=1
-DS_CONFIG=/projappl/project_2004600/risto/model3multi/training/ds_config.json
-#DS_CONFIG=/projappl/project_2004600/risto/testing/singularity_trainer/zero_stage_3.json
-TRAIN_SAMPLE_SIZE=50
-GRAD_ACC=100
-MODEL='gpt2'
-#TRAIN_DATA=$TMPDIR/gpt2-train/
-#EVAL_DATA=$TMPDIR/gpt2-eval/
-#EVAL_DATA=/scratch/project_2004600/FinBERT-data/batched-data/gpt2-eval/
-#TRAIN_DATA=/scratch/project_2004600/FinBERT-dataset-bal/prepared-dataset_t2/
-TRAIN_DATA=/scratch/project_2004600/FinBERT-data/train/
-EVAL_DATA=/scratch/project_2004600/FinBERT-data/batched-data/gpt2-eval/
-LR=0.0001
-
-#/###/MODEL ARGS ####
 
 export SING_IMAGE=/scratch/project_2004600/containers/deepspeed.sif
-
 export SING_FLAGS="$SING_FLAGS -B /appl/spack/v014/install-tree/gcc-4.8.5/pdsh-2.31-cdzt5w/bin/:/usr/local/sbin,/projappl/project_2004600/risto/model3multi/training/node_init.sh:/data/ --nv"
-#export SING_FLAGS=$SING_FLAGS"--nv"
-export NCCL_DEBUG=INFO
-
+#export NCCL_DEBUG=INFO
 
-singularity_wrapper exec which deepspeed
 echo "start running trainer script"
 
 singularity_wrapper exec deepspeed --hostfile=hostfile.txt --master_addr=$MASTER_NODE /projappl/project_2004600/risto/model3multi/training/trainer.py \
-    --train_data $TRAIN_DATA \
-    --eval_data $EVAL_DATA \
-    --lr $LR \
-    --train_sample_size $TRAIN_SAMPLE_SIZE \
-    --model_output_dir $MODEL_OUT \
-    --from_checkpoint $MODEL_CHECKPOINT \
-    --batch_size $BATCH_SIZE \
-    --epochs $EPOCHS \
-    --grad_acc $GRAD_ACC \
-    --model $MODEL \
+    --train_data /scratch/project_2004600/FinBERT-data/batched-data-bal/train/ \
+    --eval_data /scratch/project_2004600/FinBERT-data/batched-data-bal/eval/ \
+    --lr 5e-5 \
+    --tokenizer /projappl/project_2004600/risto/tokenizer_100/ \
+    --train_sample_size 2 \
+    --model_output_dir /scratch/project_2004600/risto/220921-mahti-gptsmall-epochs5-finbert10-bl1024/ \
+    --from_checkpoint 'no' \
+    --batch_size 11 \
+    --epochs 5 \
+    --grad_acc 1 \
+    --model 'gpt2' \
     --cache_dir $HF_DATASETS_CACHE \
-    --deepspeed --deepspeed_config $DS_CONFIG
+    --deepspeed --deepspeed_config /projappl/project_2004600/risto/model3multi/training/ds_config.json
 
-seff $SLURM_JOBId
+seff $SLURM_JOBID
diff --git a/model3multi/training/trainer.py b/model3multi/training/trainer.py
index 2db3b664c4f5814b0d688cbab798485ad391c516..7714cb5b614113f59b17c3b5562b8c331b93e32b 100644
--- a/model3multi/training/trainer.py
+++ b/model3multi/training/trainer.py
@@ -77,6 +77,9 @@ if __name__ == '__main__':
                         help='number of training epochs')
     parser.add_argument('--model', type=str,
                         help='model configuration name')
+    parser.add_argument('--tokenizer', type=str,
+                        help='tokenizer path')
+
 
     parser.add_argument('--grad_acc', type=int,
                         help='number of gradient accumulation steps')
@@ -94,12 +97,12 @@ if __name__ == '__main__':
 
     num_workers = 40
-    block_size = 512
+    block_size = 1024
     overwrite_cache=False
     keep_in_memory=True
 
     logging.set_verbosity_debug()
 
-    tokenizer_path = '/projappl/project_2004600/risto/tokenizer/'
+    tokenizer_path = cmd_args.tokenizer #'/projappl/project_2004600/risto/tokenizer/'
 
     TRAIN_FILE_PATH = cmd_args.train_data
     TRAIN_SAMPLE_SIZE = cmd_args.train_sample_size
@@ -179,7 +182,7 @@ if __name__ == '__main__':
     print(f"Eval files count: {len(eval_paths)}, evaluation files: ", eval_paths)
     start_time = perf_counter()
     dataset = load_dataset(
-#        cache_dir = cmd_args.cache_dir,
+        cache_dir = cmd_args.cache_dir,
         path = 'text', # path to loading script. 'text' is default
         data_files = {'train': train_paths, 'eval': eval_paths},
 #        keep_in_memory=keep_in_memory
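For context, a hedged sketch of how the new --tokenizer flag and the re-enabled cache_dir plausibly fit together in trainer.py; it is not the repository's actual code. The paths, the placeholder data_files lists, and the choice of AutoTokenizer are illustrative assumptions only; the load_dataset('text', ...) call and cmd_args names are taken from the diff above.

# Illustrative sketch only -- assumed wiring of the new command-line options.
from datasets import load_dataset
from transformers import AutoTokenizer

# Values that the job script passes on the command line (assumed here).
tokenizer_path = '/projappl/project_2004600/risto/tokenizer_100/'   # --tokenizer
cache_dir = '/scratch/project_2004600/risto/dataset_cache/'         # --cache_dir

# Tokenizer is now loaded from the command-line path instead of a hard-coded one.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

# The 'text' loading script reads plain-text files; with cache_dir no longer
# commented out, the Arrow cache lands under HF_DATASETS_CACHE from the job script.
dataset = load_dataset(
    'text',
    data_files={'train': ['train.txt'], 'eval': ['eval.txt']},  # placeholder file lists
    cache_dir=cache_dir,
)

block_size = 1024  # raised from 512 in the diff; must stay within the model's context length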