Commit fc82462c authored by Risto Luukkonen

cont'd refactor

parent 38c2135b
g1102 slots=4
g4102 slots=4
g6101 slots=4
g6102 slots=4
#!/bin/bash
# Initialize compute node environment.
# When using deepspeed and pdsh, source this script in
# deepspeed/launcher/multinode_runner.py before calling
# deepspeed.launcher.launch.
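#
# Illustrative sketch only (assumed invocation, not part of this commit): when
# pdsh fans the launch command out to the worker nodes, each node would
# effectively run something like
#   pdsh -w g1102,g4102 "source /path/to/node_init.sh && python -m deepspeed.launcher.launch ..."
# so the CSC modules and SING_IMAGE set below are in place before DeepSpeed's
# per-node launcher starts.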
CSC_ENV_INIT='/appl/profile/zz-csc-env.sh'
if [ -f "$CSC_ENV_INIT" ]; then
echo "$0: sourcing $CSC_ENV_INIT" >&2
source "$CSC_ENV_INIT"
else
echo "$0: no $CSC_ENV_INIT, exiting"
exit 1
fi
module purge
export SING_IMAGE=/scratch/project_2004600/containers/latest.sif
#!/bin/bash
#SBATCH --job-name=large_
#SBATCH --account=project_2004600
##SBATCH --time=72:00:00
#SBATCH --time=00:15:00
#SBATCH --partition=gputest
#SBATCH --nodes=2
#SBATCH --mem=200G
#SBATCH --cpus-per-task=40
#SBATCH --gres=gpu:v100:4,nvme:200
#SBATCH -o logs/%j.out
#SBATCH -e logs/%j.err
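# Point logs/latest.{out,err} at this job's log files for easy tailing.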
rm -f logs/latest.out logs/latest.err
ln -s $SLURM_JOBID.out logs/latest.out
ln -s $SLURM_JOBID.err logs/latest.err
export TORCH_EXTENSIONS_DIR=/projappl/project_2004600/risto/testing/singularity_trainer/ds_env/torch_ext/
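# GPUs per node: count the entries in SLURM_JOB_GPUS (a comma-separated list of GPU indices, e.g. "0,1,2,3").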
GPUS=$(echo $SLURM_JOB_GPUS | tr -s ', ' '[\n*]' | wc -l)
module load pytorch
module load gcc/9.1.0
module load cuda/11.1.0
module load pdsh/2.34
### CREATES HOSTFILE ###
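# Write a DeepSpeed hostfile: one "hostname slots=<gpus>" line per node.
# Note: the loop below assumes $SLURM_JOB_NODELIST is a plain comma-separated
# host list; a compressed range such as g[1101-1102] would first need expanding
# with `scontrol show hostnames "$SLURM_JOB_NODELIST"`.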
rm -f hostfile.txt
while IFS=',' read -ra ADDR; do
for i in "${ADDR[@]}"; do
echo $i slots=$GPUS >>hostfile.txt
done
done <<< $SLURM_JOB_NODELIST
### /CREATES HOSTFILE ###
cat hostfile.txt
export SINGULARITYENV_APPEND_PATH="/users/rluukkon/.local/bin"
export CPATH=/appl/spack/install-tree/gcc-9.1.0/python-3.6.8-ecovls/include/python3.6m:$CPATH
#export TMPDIR=/scratch/project_2004600/risto/
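# $TMPDIR points at node-local scratch (set up by the nvme part of --gres above),
# so the datasets cache stays off the shared filesystem.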
export HF_DATASETS_CACHE=$TMPDIR/"dataset_cache/"
echo "Using TEMP-dir " $HF_DATASETS_CACHE
#export PATH=$HOME/local/bin:$HOME/.local/bin:$PATH
echo $PATH
#export PATH="$HOME/local/bin:$PATH"
export LD_LIBRARY_PATH=$HOME/.local/lib/python3.8/site-packages/:$LD_LIBRARY_PATH
#export HF_DATASETS_IN_MEMORY_MAX_SIZE=0
#echo "Copying data to local storage"
#cp -r /scratch/project_2004600/risto/batched-data $TMPDIR
#cp -r /scratch/project_2004600/FinBERT-data/gpt2-eval $TMPDIR
#echo "done"
#MODEL_OUT=/scratch/project_2004600/risto/model_out_TEST/
MODEL_OUT=/scratch/project_2004600/risto/gpt2_large_all_data/
#MODEL_CHECKPOINT=/scratch/project_2004600/risto/model_long_2_1x4/
MODEL_CHECKPOINT='no'
#BATCH_SIZE=16 # for medium if blocks are 500
BATCH_SIZE=16 # blocks are 250
EPOCHS=5
DS_CONFIG=/projappl/project_2004600/risto/model3multi/training/ds_config.json
#DS_CONFIG=/projappl/project_2004600/risto/testing/singularity_trainer/zero_stage_3.json
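# For reference, a minimal DeepSpeed config of the kind $DS_CONFIG points to
# might look roughly like this (illustrative values only, not the file used here):
#   {
#     "train_micro_batch_size_per_gpu": 16,
#     "gradient_accumulation_steps": 1,
#     "fp16": { "enabled": true },
#     "zero_optimization": { "stage": 1 }
#   }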
TRAIN_SAMPLE_SIZE=10
GRAD_ACC=1
MODEL='gpt2'
#TRAIN_DATA=$TMPDIR/batched-data/
#EVAL_DATA=$TMPDIR/gpt2-eval/
#EVAL_DATA=/scratch/project_2004600/FinBERT-data/gpt2-eval/
#TRAIN_DATA=/scratch/project_2004600/risto/batched-data/
TRAIN_DATA=/scratch/project_2004600/FinBERT-data/train/
EVAL_DATA=/scratch/project_2004600/FinBERT-data/eval/
echo "start running trainer script"
srun deepspeed_singularity --hostfile=hostfile.txt /projappl/project_2004600/risto/model3multi/training/trainer.py \
--train_data $TRAIN_DATA \
--train_sample_size $TRAIN_SAMPLE_SIZE \
--eval_data $EVAL_DATA \
--model_output_dir $MODEL_OUT \
--from_checkpoint $MODEL_CHECKPOINT \
--batch_size $BATCH_SIZE \
--epochs $EPOCHS \
--grad_acc $GRAD_ACC \
--model $MODEL \
--cache_dir $HF_DATASETS_CACHE \
--deepspeed --deepspeed_config $DS_CONFIG
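# Print the job's resource-usage efficiency summary once training has finished.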
seff $SLURM_JOBID
@@ -29,14 +29,15 @@ export TMPDIR=/scratch/project_2004600/risto
export HF_DATASETS_CACHE=$TMPDIR/"dataset_cache/"
echo "Using TEMP-dir " $HF_DATASETS_CACHE
export SING_IMAGE=/scratch/project_2004600/containers/latest.sif
export SING_FLAGS="$SING_FLAGS -B /appl/spack/v014/install-tree/gcc-4.8.5/pdsh-2.31-cdzt5w/bin/:/usr/local/sbin,"$pwd"/node_init.sh:/data/ --nv"
export SING_FLAGS="$SING_FLAGS -B /appl/spack/v014/install-tree/gcc-4.8.5/pdsh-2.31-cdzt5w/bin/:/usr/local/sbin,$(pwd)/node_init.sh:/data/ --nv"
echo $SING_FLAGS
#export NCCL_DEBUG=INFO
MODEL_OUTPUTDIR=/scratch/project_2004600/experiments/270921-mahti-4x4-gptsmall-epochs5-finbert10-1024/
echo "start running trainer script"
singularity_wrapper exec deepspeed --hostfile=hostfile.txt --master_addr=$MASTER_NODE $pwd/trainer.py \
singularity_wrapper exec deepspeed --hostfile=hostfile.txt --master_addr=$MASTER_NODE $(pwd)/trainer.py \
--train_data /scratch/project_2004600/FinBERT-data/batched-data-bal/train/ \
--eval_data /scratch/project_2004600/FinBERT-data/batched-data-bal/eval/ \
--lr 1e-4 \
--lr 5e-5 \
--block_size 1024 \
--tokenizer /projappl/project_2004600/risto/tokenizer_openai/ \
--train_sample_size 2 \
@@ -47,7 +48,7 @@ singularity_wrapper exec deepspeed --hostfile=hostfile.txt --master_addr=$MASTER
--grad_acc 1 \
--model 'gpt2' \
--cache_dir $HF_DATASETS_CACHE \
--deepspeed --deepspeed_config $pwd
--deepspeed --deepspeed_config $(pwd)/ds_config.json
seff $SLURM_JOBID
seff $SLURM_JOBID > $MODEL_OUTPUTDIR/seff.txt
@@ -17,7 +17,7 @@ from transformers import (
logging,
WEIGHTS_NAME,
CONFIG_NAME,
GPT2Tokenizer,
GPT2TokenizerFast,
GPT2LMHeadModel,
GPT2Config,
Trainer,
@@ -77,6 +77,13 @@ if __name__ == '__main__':
help='number of training epochs')
parser.add_argument('--model', type=str,
help='model configuration name')
parser.add_argument('--tokenizer', type=str,
help='tokenizer path')
parser.add_argument('--lr', type=float,
help='learning rate')
parser.add_argument('--block_size', type=int,
help='data block size')
parser.add_argument('--grad_acc', type=int,
help='number of gradient accumulation steps')
@@ -94,13 +101,13 @@ if __name__ == '__main__':
num_workers = 40
block_size = 300
block_size = cmd_args.block_size
overwrite_cache=False
keep_in_memory=True
logging.set_verbosity_debug()
tokenizer_path = '/projappl/project_2004600/risto/tokenizer/'
#logging.set_verbosity_debug()
tokenizer_path = cmd_args.tokenizer #'/projappl/project_2004600/risto/tokenizer/'
LEARNING_RATE = cmd_args.lr
TRAIN_FILE_PATH = cmd_args.train_data
TRAIN_SAMPLE_SIZE = cmd_args.train_sample_size
EVAL_FILE_PATH = cmd_args.eval_data
@@ -130,25 +137,9 @@ if __name__ == '__main__':
print("Using checkpoint dir",last_checkpoint)
print(f'Load tokenizer from path: {tokenizer_path}')
tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)
tokenizer = GPT2TokenizerFast.from_pretrained(tokenizer_path)
print('done')
tokenizer.add_special_tokens({
"eos_token": "</s>",
"bos_token": "<s>",
"unk_token": "<unk>",
"pad_token": "<pad>",
"mask_token": "<mask>"
})
tokenizer.add_prefix_space=True
# config=GPT2Config(
# name_or_path=MODEL,
# vocab_size=tokenizer.vocab_size,
# bos_token_id=tokenizer.bos_token_id,
# eos_token_id=tokenizer.eos_token_id
config = AutoConfig.from_pretrained(MODEL) # Doesn't load weights
config.vocab_size=tokenizer.vocab_size
@@ -171,6 +162,7 @@ if __name__ == '__main__':
if TRAIN_SAMPLE_SIZE != -1:
np.random.seed(1992)
train_paths = np.random.choice(train_paths,TRAIN_SAMPLE_SIZE,replace=False).tolist()
print("files:",train_paths)
eval_dir = EVAL_FILE_PATH
eval_paths = [str(x) for x in Path(eval_dir).glob("**/*.txt")]
@@ -179,7 +171,7 @@ if __name__ == '__main__':
print(f"Eval files count: {len(eval_paths)}, evaluation files: ", eval_paths)
start_time = perf_counter()
dataset = load_dataset(
# cache_dir = cmd_args.cache_dir,
cache_dir = cmd_args.cache_dir,
path = 'text', # path to loading script. 'text' is default
data_files = {'train': train_paths, 'eval': eval_paths},
# keep_in_memory=keep_in_memory
@@ -249,14 +241,16 @@ if __name__ == '__main__':
num_train_epochs=EPOCHS, # number of training epochs
per_device_train_batch_size=BATCH_SIZE, # batch size for training
per_device_eval_batch_size=BATCH_SIZE, # batch size for evaluation
log_level = 'warning',
evaluation_strategy='steps',
save_strategy='steps',
save_steps=10000,
# learning_rate=0.001, produces NaN early on
save_steps=5000,
learning_rate=LEARNING_RATE, # set lr in ds_config.json
gradient_accumulation_steps=GRAD_ACC_STEPS,
eval_steps = 10000, # Number of update steps between two evaluations.
eval_steps = 5000, # Number of update steps between two evaluations.
weight_decay = 0.01,
warmup_steps= 100,# number of warmup steps for learning rate scheduler
warmup_steps= 500,# number of warmup steps for learning rate scheduler
fp16=True, # whether to use 16-bit floating point for training; TEST THIS (TODO)
deepspeed=DS_CONF,
)