From 05d7e338ad8c223fba0a94fa2f73f1863641ec93 Mon Sep 17 00:00:00 2001
From: Adam Moody
Date: Thu, 19 Aug 2021 15:40:16 -0700
Subject: [PATCH 1/3] add oscar slurm script for preprocess_data_dist

---
 data/oscar/oscar-jsonl-to-meg-gpt2-dist.slurm | 62 +++++++++++++++++++
 1 file changed, 62 insertions(+)
 create mode 100644 data/oscar/oscar-jsonl-to-meg-gpt2-dist.slurm

diff --git a/data/oscar/oscar-jsonl-to-meg-gpt2-dist.slurm b/data/oscar/oscar-jsonl-to-meg-gpt2-dist.slurm
new file mode 100644
index 00000000..085acb8f
--- /dev/null
+++ b/data/oscar/oscar-jsonl-to-meg-gpt2-dist.slurm
@@ -0,0 +1,62 @@
+#!/bin/bash
+#SBATCH --job-name=oscar-jsonl-to-meg-gpt2-dist # job name
+#SBATCH --ntasks=320 # number of MP tasks
+#SBATCH --nodes=8
+#SBATCH --cpus-per-task=1 # number of cores per task
+#SBATCH --hint=nomultithread # we get physical cores not logical
+#SBATCH --time=8:00:00 # maximum execution time (HH:MM:SS)
+#SBATCH --output=%x-%j.out # output file name
+#SBATCH --account=six@cpu
+#SBATCH --partition=cpu_p1
+
+set -x -e
+
+source $six_ALL_CCFRWORK/start-prod
+
+input=$six_ALL_CCFRSCRATCH/datasets/oscar-small/oscar-en-shuffled-p1.jsonl
+output=$six_ALL_CCFRSCRATCH/datasets/oscar-small/meg-gpt2-dist
+
+##########################
+# Setup to run preprocess_data_dist as an mpi4py job
+##########################
+
+# Note: if mpi4py works with the system MPI,
+# and if srun is configured to launch system MPI jobs,
+# then I think the following should be sufficient:
+#
+# cd $six_ALL_CCFRWORK/code/megatron-lm
+#
+# srun -n 320 -N 8 $CMD --mpi4py
+
+##########################
+# Setup to run preprocess_data_dist as a torch.distributed job
+##########################
+
+# I use a line like the following. Perhaps easier to reas if this works for you?
+#MASTER_ADDR=`scontrol show hostname ${SLURM_NODELIST} | head -n1`
+MASTER_ADDR=`perl -le '$_=$ENV{"SLURM_JOB_NODELIST"}; s/,.*//; s/-.*//; s/\[//; print'`
+MASTER_PORT=6000
+
+NNODES=$SLURM_NNODES
+
+export LAUNCHER="python -u -m torch.distributed.launch \
+    --nproc_per_node 40 \
+    --nnodes $NNODES \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT \
+    "
+
+cd $six_ALL_CCFRWORK/code/megatron-lm
+CMD="tools/preprocess_data_dist.py \
+    --input $input \
+    --output-prefix $output \
+    --vocab data/gpt2-vocab.json \
+    --merge-file data/gpt2-merges.txt \
+    --dataset-impl mmap \
+    --tokenizer-type GPT2BPETokenizer \
+    --append-eod
+    "
+
+srun --jobid $SLURM_JOBID bash -c '$LAUNCHER --node_rank $SLURM_PROCID $CMD'
+
+#echo "now copy the results to $six_ALL_CCFRWORK/datasets-custom/oscar/ from $six_ALL_CCFRSCRATCH/datasets/oscar-small/meg-gpt2"

From 5472a3bd623f5fa05f036d8916206c836015697c Mon Sep 17 00:00:00 2001
From: Adam Moody
Date: Thu, 19 Aug 2021 15:48:42 -0700
Subject: [PATCH 2/3] fix a couple typos

---
 data/oscar/oscar-jsonl-to-meg-gpt2-dist.slurm | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/data/oscar/oscar-jsonl-to-meg-gpt2-dist.slurm b/data/oscar/oscar-jsonl-to-meg-gpt2-dist.slurm
index 085acb8f..0fc09547 100644
--- a/data/oscar/oscar-jsonl-to-meg-gpt2-dist.slurm
+++ b/data/oscar/oscar-jsonl-to-meg-gpt2-dist.slurm
@@ -26,13 +26,13 @@ output=$six_ALL_CCFRSCRATCH/datasets/oscar-small/meg-gpt2-dist
 #
 # cd $six_ALL_CCFRWORK/code/megatron-lm
 #
-# srun -n 320 -N 8 $CMD --mpi4py
+# srun -n 320 -N 8 python $CMD --mpi4py
 
 ##########################
 # Setup to run preprocess_data_dist as a torch.distributed job
 ##########################
 
-# I use a line like the following. Perhaps easier to reas if this works for you?
+# I use a line like the following. Perhaps easier to read if this works for you?
 #MASTER_ADDR=`scontrol show hostname ${SLURM_NODELIST} | head -n1`
 MASTER_ADDR=`perl -le '$_=$ENV{"SLURM_JOB_NODELIST"}; s/,.*//; s/-.*//; s/\[//; print'`
 MASTER_PORT=6000

From 7677577aa5dd06857614866f50cb21ffbd056730 Mon Sep 17 00:00:00 2001
From: Adam Moody
Date: Fri, 20 Aug 2021 09:54:52 -0700
Subject: [PATCH 3/3] simplify scontrol line to get first host

---
 data/oscar/oscar-jsonl-to-meg-gpt2-dist.slurm | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/data/oscar/oscar-jsonl-to-meg-gpt2-dist.slurm b/data/oscar/oscar-jsonl-to-meg-gpt2-dist.slurm
index 0fc09547..92bd5bb6 100644
--- a/data/oscar/oscar-jsonl-to-meg-gpt2-dist.slurm
+++ b/data/oscar/oscar-jsonl-to-meg-gpt2-dist.slurm
@@ -33,7 +33,7 @@ output=$six_ALL_CCFRSCRATCH/datasets/oscar-small/meg-gpt2-dist
 ##########################
 
 # I use a line like the following. Perhaps easier to read if this works for you?
-#MASTER_ADDR=`scontrol show hostname ${SLURM_NODELIST} | head -n1`
+#MASTER_ADDR=`scontrol show hostname | head -1`
 MASTER_ADDR=`perl -le '$_=$ENV{"SLURM_JOB_NODELIST"}; s/,.*//; s/-.*//; s/\[//; print'`
 MASTER_PORT=6000
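
Side note (a hedged sketch, not part of the patches above): the two launch assumptions in the script can be checked interactively from a shell inside an allocation for this job, where SLURM_JOB_NODELIST is set. The nodelist value in the comment is only an illustrative example; the scontrol and perl commands are the same ones the script itself uses.

    # Both lines should print the first host of the allocation,
    # e.g. a nodelist of r7i5n[1-8] yields r7i5n1 from either one.
    scontrol show hostname | head -1
    perl -le '$_=$ENV{"SLURM_JOB_NODELIST"}; s/,.*//; s/-.*//; s/\[//; print'

    # If mpi4py sits on top of the system MPI, each task launched by srun
    # should report a distinct rank along with its hostname.
    srun -n 320 -N 8 python -c 'from mpi4py import MPI; print(MPI.COMM_WORLD.Get_rank(), MPI.Get_processor_name())'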