Commit 38c2135b authored by Risto Luukkonen
refactor

parent 962297dd
#!/bin/bash
# Active configuration (Mahti: 4 nodes x 4 A100s, gpumedium partition).
# SBATCH directives must precede the first command, so they are grouped here:
#SBATCH --job-name=4x4-1024
#SBATCH --account=project_2004407
#SBATCH --time=10:00:00
#SBATCH --partition=gpumedium
#SBATCH --nodes=4
#SBATCH --mem=100G
#SBATCH --cpus-per-task=10
#SBATCH --gres=gpu:a100:4
#SBATCH -o logs/%j.out
#SBATCH -e logs/%j.err

# Disabled alternatives (earlier Puhti run and short test-queue settings):
##SBATCH --job-name=gpt2-large-all_data
##SBATCH --account=project_2004600
##SBATCH --time=72:00:00
##SBATCH --time=00:15:00
##SBATCH --partition=gpu
##SBATCH --partition=gputest
##SBATCH --mem=0
##SBATCH --cpus-per-task=40
##SBATCH --gres=gpu:v100:4 #,nvme:120
##SBATCH --output=%A_%x.txt

# Old Puhti-era environment setup, superseded by the container setup and the
# TORCH_EXTENSIONS_DIR/TMPDIR exports below:
#export TORCH_EXTENSIONS_DIR=/projappl/project_2004600/risto/testing/ds_env/torch_ext/
#module load gcc/9.1.0
#module load cuda/11.1.0
#export CPATH=/appl/spack/install-tree/gcc-9.1.0/python-3.6.8-ecovls/include/python3.6m:$CPATH
#export TMPDIR=/scratch/project_2004600/risto/
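# Submit with "sbatch <this script>". Note that logs/ must already exist:
# SLURM opens logs/%j.out and logs/%j.err before the script body runs.
# The symlinks below always point at the most recent job's logs.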
rm -f logs/latest.out logs/latest.err
ln -s $SLURM_JOBID.out logs/latest.out
ln -s $SLURM_JOBID.err logs/latest.err
export TORCH_EXTENSIONS_DIR=$PWD/torch_ext_dir/  # where DeepSpeed JIT-builds its ops
# Count GPUs on this node: SLURM_JOB_GPUS is a comma-separated index list.
GPUS=$(echo "$SLURM_JOB_GPUS" | tr ',' '\n' | wc -l)
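# e.g. SLURM_JOB_GPUS="0,1,2,3" gives GPUS=4 (GPUS is informational only here).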
module load pdsh/2.31
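# DeepSpeed's default multi-node launcher shells out to pdsh; the container
# flags below bind the host's pdsh binary into the image.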
### CREATE DEEPSPEED HOSTFILE ###
rm -f hostfile.txt
# One "<host> slots=<gpus>" line per allocated node; slots=4 matches --gres=gpu:a100:4.
scontrol show hostnames "$SLURM_JOB_NODELIST" \
    | perl -pe 's/$/ slots=4/' \
    > "hostfile.txt"
MASTER_NODE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
cat hostfile.txt
export TMPDIR=/scratch/project_2004600/risto
export HF_DATASETS_CACHE=$TMPDIR/dataset_cache/
echo "Using dataset cache $HF_DATASETS_CACHE"
#export HF_DATASETS_IN_MEMORY_MAX_SIZE=40G
source /projappl/project_2004600/testing/venv_trainer/venv/bin/activate
### Training configuration ###
MODEL_OUT=/scratch/project_2004600/risto/gpt2-large_FULL_DATA_250blocks/
#MODEL_OUT=/scratch/project_2004600/risto/model_output_TEST/
MODEL_CHECKPOINT='no'
#MODEL_CHECKPOINT=/scratch/project_2004600/risto/model_long_4_1x4_grad_acc_1/
TRAIN_DATA=/scratch/project_2004600/risto/batched-data/
EVAL_DATA=/scratch/project_2004600/FinBERT-data/gpt2-eval/
BATCH_SIZE=20  # for gpt2-large
#BATCH_SIZE=70 # for gpt2-medium
EPOCHS=10
DS_CONFIG=/projappl/project_2004600/risto/testing/venv_trainer/ds_config.json
TRAIN_SAMPLE_SIZE=-1
GRAD_ACC=1
MODEL='gpt2-large'
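# ds_config.json lives outside this repo; a minimal DeepSpeed config might
# look like this (a sketch; the field values are assumptions, not the actual file):
# {
#   "train_micro_batch_size_per_gpu": 20,
#   "gradient_accumulation_steps": 1,
#   "fp16": { "enabled": true },
#   "zero_optimization": { "stage": 1 }
# }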
export SING_IMAGE=/scratch/project_2004600/containers/latest.sif
export SING_FLAGS="$SING_FLAGS -B /appl/spack/v014/install-tree/gcc-4.8.5/pdsh-2.31-cdzt5w/bin/:/usr/local/sbin,$PWD/node_init.sh:/data/ --nv"
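# -B bind-mounts host paths into the container (pdsh -> /usr/local/sbin,
# node_init.sh -> /data/); --nv exposes the host's NVIDIA driver and GPUs.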
#export NCCL_DEBUG=INFO
MODEL_OUTPUTDIR=/scratch/project_2004600/experiments/270921-mahti-4x4-gptsmall-epochs5-finbert10-1024/
echo "start running trainer script"
# Earlier Puhti launcher (srun + venv), kept for reference; superseded by the
# Singularity/pdsh launch below:
#srun deepspeed /projappl/project_2004600/risto/testing/venv_trainer/trainer.py \
#    --train_data $TRAIN_DATA \
#    --train_sample_size $TRAIN_SAMPLE_SIZE \
#    --eval_data $EVAL_DATA \
#    --model_output_dir $MODEL_OUT \
#    --from_checkpoint $MODEL_CHECKPOINT \
#    --batch_size $BATCH_SIZE \
#    --epochs $EPOCHS \
#    --model $MODEL \
#    --grad_acc $GRAD_ACC
singularity_wrapper exec deepspeed --hostfile=hostfile.txt --master_addr=$MASTER_NODE $PWD/trainer.py \
    --train_data /scratch/project_2004600/FinBERT-data/batched-data-bal/train/ \
    --eval_data /scratch/project_2004600/FinBERT-data/batched-data-bal/eval/ \
    --lr 1e-4 \
    --block_size 1024 \
    --tokenizer /projappl/project_2004600/risto/tokenizer_openai/ \
    --train_sample_size 2 \
    --model_output_dir $MODEL_OUTPUTDIR \
    --from_checkpoint 'no' \
    --batch_size 15 \
    --epochs 5 \
    --grad_acc 1 \
    --model 'gpt2' \
    --cache_dir $HF_DATASETS_CACHE \
    --deepspeed --deepspeed_config $DS_CONFIG
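# Note: --train_sample_size 2 with --model 'gpt2' looks like a quick functional
# test of the multi-node launch; the configuration block above encodes the
# full-data gpt2-large settings.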
# Record resource-usage stats and archive this run's logs and DeepSpeed config.
seff $SLURM_JOBID | tee "$MODEL_OUTPUTDIR"/seff.txt
cp logs/$SLURM_JOBID* "$DS_CONFIG" "$MODEL_OUTPUTDIR"