From 05d7e338ad8c223fba0a94fa2f73f1863641ec93 Mon Sep 17 00:00:00 2001
From: Adam Moody
Date: Thu, 19 Aug 2021 15:40:16 -0700
Subject: [PATCH 1/3] add oscar slurm script for preprocess_data_dist

---
 data/oscar/oscar-jsonl-to-meg-gpt2-dist.slurm | 62 +++++++++++++++++++
 1 file changed, 62 insertions(+)
 create mode 100644 data/oscar/oscar-jsonl-to-meg-gpt2-dist.slurm

diff --git a/data/oscar/oscar-jsonl-to-meg-gpt2-dist.slurm b/data/oscar/oscar-jsonl-to-meg-gpt2-dist.slurm
new file mode 100644
index 00000000..085acb8f
--- /dev/null
+++ b/data/oscar/oscar-jsonl-to-meg-gpt2-dist.slurm
@@ -0,0 +1,62 @@
+#!/bin/bash
+#SBATCH --job-name=oscar-jsonl-to-meg-gpt2-dist # job name
+#SBATCH --ntasks=320 # number of MP tasks
+#SBATCH --nodes=8
+#SBATCH --cpus-per-task=1 # number of cores per task
+#SBATCH --hint=nomultithread # we get physical cores not logical
+#SBATCH --time=8:00:00 # maximum execution time (HH:MM:SS)
+#SBATCH --output=%x-%j.out # output file name
+#SBATCH --account=six@cpu
+#SBATCH --partition=cpu_p1
+
+set -x -e
+
+source $six_ALL_CCFRWORK/start-prod
+
+input=$six_ALL_CCFRSCRATCH/datasets/oscar-small/oscar-en-shuffled-p1.jsonl
+output=$six_ALL_CCFRSCRATCH/datasets/oscar-small/meg-gpt2-dist
+
+##########################
+# Setup to run preprocess_data_dist as an mpi4py job
+##########################
+
+# Note: if mpi4py works with the system MPI,
+# and if srun is configured to launch system MPI jobs,
+# then I think the following should be sufficient:
+#
+# cd $six_ALL_CCFRWORK/code/megatron-lm
+#
+# srun -n 320 -N 8 $CMD --mpi4py
+
+##########################
+# Setup to run preprocess_data_dist as a torch.distributed job
+##########################
+
+# I use a line like the following. Perhaps easier to reas if this works for you?
+#MASTER_ADDR=`scontrol show hostname ${SLURM_NODELIST} | head -n1`
+MASTER_ADDR=`perl -le '$_=$ENV{"SLURM_JOB_NODELIST"}; s/,.*//; s/-.*//; s/\[//; print'`
+MASTER_PORT=6000
+
+NNODES=$SLURM_NNODES
+
+export LAUNCHER="python -u -m torch.distributed.launch \
+    --nproc_per_node 40 \
+    --nnodes $NNODES \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT \
+    "
+
+cd $six_ALL_CCFRWORK/code/megatron-lm
+CMD="tools/preprocess_data_dist.py \
+    --input $input \
+    --output-prefix $output \
+    --vocab data/gpt2-vocab.json \
+    --merge-file data/gpt2-merges.txt \
+    --dataset-impl mmap \
+    --tokenizer-type GPT2BPETokenizer \
+    --append-eod
+    "
+
+srun --jobid $SLURM_JOBID bash -c '$LAUNCHER --node_rank $SLURM_PROCID $CMD'
+
+#echo "now copy the results to $six_ALL_CCFRWORK/datasets-custom/oscar/ from $six_ALL_CCFRSCRATCH/datasets/oscar-small/meg-gpt2"

From 5472a3bd623f5fa05f036d8916206c836015697c Mon Sep 17 00:00:00 2001
From: Adam Moody
Date: Thu, 19 Aug 2021 15:48:42 -0700
Subject: [PATCH 2/3] fix a couple typos

---
 data/oscar/oscar-jsonl-to-meg-gpt2-dist.slurm | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/data/oscar/oscar-jsonl-to-meg-gpt2-dist.slurm b/data/oscar/oscar-jsonl-to-meg-gpt2-dist.slurm
index 085acb8f..0fc09547 100644
--- a/data/oscar/oscar-jsonl-to-meg-gpt2-dist.slurm
+++ b/data/oscar/oscar-jsonl-to-meg-gpt2-dist.slurm
@@ -26,13 +26,13 @@ output=$six_ALL_CCFRSCRATCH/datasets/oscar-small/meg-gpt2-dist
 #
 # cd $six_ALL_CCFRWORK/code/megatron-lm
 #
-# srun -n 320 -N 8 $CMD --mpi4py
+# srun -n 320 -N 8 python $CMD --mpi4py
 
 ##########################
 # Setup to run preprocess_data_dist as a torch.distributed job
 ##########################
 
-# I use a line like the following. Perhaps easier to reas if this works for you?
+# I use a line like the following. Perhaps easier to read if this works for you?
 #MASTER_ADDR=`scontrol show hostname ${SLURM_NODELIST} | head -n1`
 MASTER_ADDR=`perl -le '$_=$ENV{"SLURM_JOB_NODELIST"}; s/,.*//; s/-.*//; s/\[//; print'`
 MASTER_PORT=6000

From 7677577aa5dd06857614866f50cb21ffbd056730 Mon Sep 17 00:00:00 2001
From: Adam Moody
Date: Fri, 20 Aug 2021 09:54:52 -0700
Subject: [PATCH 3/3] simplify scontrol line to get first host

---
 data/oscar/oscar-jsonl-to-meg-gpt2-dist.slurm | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/data/oscar/oscar-jsonl-to-meg-gpt2-dist.slurm b/data/oscar/oscar-jsonl-to-meg-gpt2-dist.slurm
index 0fc09547..92bd5bb6 100644
--- a/data/oscar/oscar-jsonl-to-meg-gpt2-dist.slurm
+++ b/data/oscar/oscar-jsonl-to-meg-gpt2-dist.slurm
@@ -33,7 +33,7 @@ output=$six_ALL_CCFRSCRATCH/datasets/oscar-small/meg-gpt2-dist
 ##########################
 
 # I use a line like the following. Perhaps easier to read if this works for you?
-#MASTER_ADDR=`scontrol show hostname ${SLURM_NODELIST} | head -n1`
+#MASTER_ADDR=`scontrol show hostname | head -1`
 MASTER_ADDR=`perl -le '$_=$ENV{"SLURM_JOB_NODELIST"}; s/,.*//; s/-.*//; s/\[//; print'`
 MASTER_PORT=6000
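
Side note (a hedged sketch, not part of the patches above): the two launch assumptions in the script can be checked interactively from a shell inside an allocation for this job, where SLURM_JOB_NODELIST is set. The nodelist value in the comment is only an illustrative example; the scontrol and perl commands are the same ones the script itself uses.

    # Both lines should print the first host of the allocation,
    # e.g. a nodelist of r7i5n[1-8] yields r7i5n1 from either one.
    scontrol show hostname | head -1
    perl -le '$_=$ENV{"SLURM_JOB_NODELIST"}; s/,.*//; s/-.*//; s/\[//; print'

    # If mpi4py sits on top of the system MPI, each task launched by srun
    # should report a distinct rank along with its hostname.
    srun -n 320 -N 8 python -c 'from mpi4py import MPI; print(MPI.COMM_WORLD.Get_rank(), MPI.Get_processor_name())'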