From ea0c3c762b2c1fef35898d8952a6129a31f1645d Mon Sep 17 00:00:00 2001
From: Huihuo Zheng <zhenghh04@gmail.com>
Date: Fri, 30 Aug 2024 15:09:11 -0500
Subject: [PATCH 01/31] fixed dftracer compatibility

---
 megatron/utils.py | 73 ++++++++++++++++++++++++++++-------------------
 1 file changed, 44 insertions(+), 29 deletions(-)

diff --git a/megatron/utils.py b/megatron/utils.py
index 7a1d60bab92..19eef703998 100644
--- a/megatron/utils.py
+++ b/megatron/utils.py
@@ -36,12 +36,22 @@
 
 
 _DLIO_PROFILER_EXIST = True
+_DFTRACER_EXIST=True
+
+try:
+    import dftracer
+except:
+    _DFTRACER_EXIST=True
+    
 try:
     import dlio_profiler
 except:
     _DLIO_PROFILER_EXIST = False
 
-if _DLIO_PROFILER_EXIST:
+    
+if _DFTRACER_EXIST:
+    from dftracer.logger import dftracer as PerfTrace, dft_fn as Profile, DFTRACER_ENABLE as DFTRACER_ENABLE
+elif _DLIO_PROFILER_EXIST:
     from dlio_profiler.logger import fn_interceptor as Profile
     from dlio_profiler.logger import dlio_logger as PerfTrace
 else:
@@ -50,40 +60,45 @@
     # Profile: ContextManager = nullcontext
     #
     # class Profile(nullable_schema)
-
-    class Profile:
-        def __init__(self, type="PROFILER"):
-            self._start = time.perf_counter()
-            self.type = type
-
-        def log(self, func):
+    class Profile(object):
+        def __init__(self,  cat, name=None, epoch=None, step=None, image_idx=None, image_size=None):
+            return 
+        def log(self,  func):
+            return func
+        def log_init(self,  func):
+            return func
+        def iter(self,  func, iter_name="step"):
             return func
-
-        def iter(self, a):
-            return a
-
         def __enter__(self):
-            self._start = time.perf_counter()
-
-        def __exit__(self, *args, **kwargs):
-            dt = time.perf_counter() - self._start
-            log.info(f"{self.type} took: {dt:.6f}s")
-
-
-    class dlio_logger:
-        def __init__(
-            self,
-        ):
+            return
+        def __exit__(self, type, value, traceback):
+            return
+        def update(self, epoch=None, step=None, image_idx=None, image_size=None, args={}):
+            return
+        def flush(self):
+            return
+        def reset(self):
+            return
+        def log_static(self, func):
+            return
+    class dftracer(object):
+        def __init__(self,):
             self.type = None
-
         def initialize_log(self, logfile=None, data_dir=None, process_id=-1):
             return
+        def get_time(self):
+            return
+        def enter_event(self):
+            return
+        def exit_event(self):
+            return
+        def log_event(self, name, cat, start_time, duration, string_args=None):
+            return
+        def finalize(self):
+            return
 
-        def iter(self, a):
-            return a
-
-    PerfTrace = dlio_logger()
-
+    PerfTrace = dftracer()
+    DFTRACER_ENABLE = False
 
 def get_logger(
         name: str,

From a0ac7503d99560ba2f968c816812fabf81b3608a Mon Sep 17 00:00:00 2001
From: Huihuo Zheng <zhenghh04@gmail.com>
Date: Tue, 3 Sep 2024 14:55:24 -0500
Subject: [PATCH 02/31] added requirements.txt

---
 requirements.txt | 3 +++
 1 file changed, 3 insertions(+)
 create mode 100644 requirements.txt

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 00000000000..4f758bed7d9
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+pydftracer
+wandb
+git+https://github.com/saforem2/ezpz@main

From de7f22f35361e0eff628cb1d61d26ae94234876b Mon Sep 17 00:00:00 2001
From: Huihuo Zheng <zhenghh04@gmail.com>
Date: Wed, 4 Sep 2024 10:00:00 -0500
Subject: [PATCH 03/31] Return func from fallback Profile.log_static in utils.py

---
 megatron/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/megatron/utils.py b/megatron/utils.py
index 19eef703998..67dc366d50c 100644
--- a/megatron/utils.py
+++ b/megatron/utils.py
@@ -80,7 +80,7 @@ def flush(self):
         def reset(self):
             return
         def log_static(self, func):
-            return
+            return func
     class dftracer(object):
         def __init__(self,):
             self.type = None

From 12f6f8e5563e5fa1d21838c2a273e8cd5e096c0c Mon Sep 17 00:00:00 2001
From: Huihuo Zheng <zhenghh04@gmail.com>
Date: Thu, 12 Sep 2024 05:11:58 +0000
Subject: [PATCH 04/31] Fix dftracer import check: set _DFTRACER_EXIST=False on failure

---
 megatron/utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/megatron/utils.py b/megatron/utils.py
index 67dc366d50c..8a9f3e78581 100644
--- a/megatron/utils.py
+++ b/megatron/utils.py
@@ -41,7 +41,7 @@
 try:
     import dftracer
 except:
-    _DFTRACER_EXIST=True
+    _DFTRACER_EXIST=False
     
 try:
     import dlio_profiler
@@ -80,7 +80,7 @@ def flush(self):
         def reset(self):
             return
         def log_static(self, func):
-            return func
+            return func
     class dftracer(object):
         def __init__(self,):
             self.type = None

From 53941565b264a8eb6e8dde5ca13aaad8d426b2f8 Mon Sep 17 00:00:00 2001
From: Huihuo Zheng <zhenghh04@gmail.com>
Date: Sat, 12 Oct 2024 12:41:17 +0000
Subject: [PATCH 05/31] Shuffle concatenated dataset index

---
 ALCF/test_blendable_dataset.py                | 40 ++++++++++++++++++-
 .../pipeline_parallel/p2p_communication.py    | 20 +++++-----
 megatron/data/gpt_dataset.py                  | 14 +++----
 3 files changed, 56 insertions(+), 18 deletions(-)

diff --git a/ALCF/test_blendable_dataset.py b/ALCF/test_blendable_dataset.py
index a3cabddd293..c1198621421 100644
--- a/ALCF/test_blendable_dataset.py
+++ b/ALCF/test_blendable_dataset.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python
 import time
+import json
 start_time = time.time()
 from mpi4py import MPI
 import os
@@ -37,7 +38,7 @@ def print_rank_0(msg):
 
 os.makedirs(args.trace_dir, exist_ok=True)
 
-
+corpus_all = []
 data_file_list = args.data_file_list
 print_rank_0(f"Reading data from {args.data_file_list}")
 files = []
@@ -51,6 +52,9 @@ def print_rank_0(msg):
         files.append(float(w))
         files.append(fname)
         files.append(c)
+        if c not in corpus_all:
+            corpus_all.append(c)
+            
 splits_string="100,0,0"
 
 weights = np.array(weights)
@@ -82,6 +86,40 @@ def print_rank_0(msg):
 print_rank_0(f"Total number of samples: {len(train_ds)}")
 print_rank_0(f"Weights set: {weights[:min(8, num_datasets)]}")
 
+
+def get_sample_info(blendable_dataset, idx):
+    # corpus dataset
+    cd = blendable_dataset.dataset_index[idx]
+    # index within the corpus dataset
+    cds = blendable_dataset.dataset_sample_index[idx]
+    # dataset index within each corpus
+    fcd = blendable_dataset.datasets[cd].dataset_index[cds]
+    # sample index within the dataset
+    fcds = blendable_dataset.datasets[cd].dataset_sample_index[cds]
+    # corresponding data file
+    prefix = blendable_dataset.datasets[cd].dataset_builders[fcd].prefix
+    corpus = blendable_dataset.datasets[cd].dataset_builders[fcd].corpus
+    #v = blendable_dataset[idx]['text']
+    #norm = np.linalg.norm(v)
+    return prefix, corpus, fcds
+
+num_batches =  args.train_iters
+print(f"global_batch_size: {args.global_batch_size}")
+print(f"number of batches: {num_batches}")
+    
+if comm.rank == 0:
+    # Open on rank 0 only: opening with "w" on every rank can truncate the file while it is written.
+    with open("samples_list.jsonl", "w") as fout:
+        for i in range(num_batches):
+            ns_corpus = {c: 0 for c in corpus_all}
+            for j in range(args.global_batch_size):
+                prefix, corpus, idx = get_sample_info(train_ds, i*args.global_batch_size+j)
+                ns_corpus[corpus] +=1
+                fout.write(f"\u007b 'batch': {i}, 'sample': {j}, 'corpus': '{corpus}', 'prefix': '{prefix}', 'dataset_sample_index': {idx} \u007d\n")
+            fout.write(f"\u007b 'batch': {i}, 'histogram': {ns_corpus} \u007d \n")
+comm.Barrier()
+# NOTE(review): exit() makes the data-loader code below unreachable; confirm this is intended.
+exit()
 start_build_dataloader = time.time()
 print_rank_0(f"Starting to build the data loader")
 rank_in_parallel_group = mpu.get_sequence_parallel_rank()
diff --git a/megatron/core/pipeline_parallel/p2p_communication.py b/megatron/core/pipeline_parallel/p2p_communication.py
index b23f6c84b3f..64f75ad0053 100644
--- a/megatron/core/pipeline_parallel/p2p_communication.py
+++ b/megatron/core/pipeline_parallel/p2p_communication.py
@@ -16,7 +16,8 @@
 
 from megatron.core import ModelParallelConfig
 from deepspeed.accelerator import get_accelerator
-
+from megatron.utils import Profile
+Profile("PIPELINE")
 # Types
 Shape = Union[List[int], torch.Size]
 
@@ -329,6 +330,7 @@ def _ring_exchange_wrapper(**kwargs):
     return tensor_recv_prev, tensor_recv_next, reqs
 
 
+@dlp.log
 def recv_forward(tensor_shape: Shape,
                  config: ModelParallelConfig) -> torch.Tensor:
     """ Receive tensor from previous rank in pipeline (forward receive).
@@ -353,7 +355,7 @@ def recv_forward(tensor_shape: Shape,
             config.timers('forward-recv').stop()
     return input_tensor
 
-
+@dlp.log
 def recv_backward(tensor_shape: Shape,
                   config: ModelParallelConfig) -> torch.Tensor:
     """Receive tensor from next rank in pipeline (backward receive).
@@ -376,7 +378,7 @@ def recv_backward(tensor_shape: Shape,
             config.timers('backward-recv').stop()
     return output_tensor_grad
 
-
+@dlp.log
 def send_forward(output_tensor: torch.Tensor,
                  config: ModelParallelConfig) -> None:
     """Send tensor to next rank in pipeline (forward send).
@@ -397,7 +399,7 @@ def send_forward(output_tensor: torch.Tensor,
         if config.timers is not None:
             config.timers('forward-send').stop()
 
-
+@dlp.log
 def send_backward(input_tensor_grad: torch.Tensor,
                   config: ModelParallelConfig) -> None:
     """Send tensor to previous rank in pipeline (backward send).
@@ -417,7 +419,7 @@ def send_backward(input_tensor_grad: torch.Tensor,
         if config.timers is not None:
             config.timers('backward-send').stop()
 
-
+@dlp.log
 def send_forward_recv_backward(output_tensor: torch.Tensor,
                                tensor_shape: Shape,
                                config: ModelParallelConfig) -> torch.Tensor:
@@ -441,7 +443,7 @@ def send_forward_recv_backward(output_tensor: torch.Tensor,
             config.timers('forward-send-backward-recv').stop()
     return output_tensor_grad
 
-
+@dlp.log
 def send_backward_recv_forward(input_tensor_grad: torch.Tensor,
                                tensor_shape: Shape,
                                config: ModelParallelConfig) -> torch.Tensor:
@@ -465,7 +467,7 @@ def send_backward_recv_forward(input_tensor_grad: torch.Tensor,
             config.timers('backward-send-forward-recv').stop()
     return input_tensor
 
-
+@dlp.log
 def send_forward_recv_forward(output_tensor: torch.Tensor,
                               recv_prev: bool,
                               tensor_shape: Shape,
@@ -491,7 +493,7 @@ def send_forward_recv_forward(output_tensor: torch.Tensor,
         return input_tensor, wait_handles
     return input_tensor
 
-
+@dlp.log
 def send_backward_recv_backward(input_tensor_grad: torch.Tensor,
                                 recv_next: bool,
                                 tensor_shape: Shape,
@@ -517,7 +519,7 @@ def send_backward_recv_backward(input_tensor_grad: torch.Tensor,
         return output_tensor_grad, wait_handles
     return output_tensor_grad
 
-
+@dlp.log
 def send_forward_backward_recv_forward_backward(
         output_tensor: torch.Tensor,
         input_tensor_grad: torch.Tensor,
diff --git a/megatron/data/gpt_dataset.py b/megatron/data/gpt_dataset.py
index 0cf97356a41..c801a6a5ae5 100755
--- a/megatron/data/gpt_dataset.py
+++ b/megatron/data/gpt_dataset.py
@@ -114,8 +114,10 @@ def _build_indices():
                     print_rank_0('> elapsed time for building concat dataset indices: '
                                  '{:.2f} (sec)'.format(time.time() - start_time))
                     return dataset_index, dataset_sample_index
-                
+
                 self.dataset_index, self.dataset_sample_index = _build_indices()
+                np_rng = np.random.RandomState(seed=dataset_builders[0].seed)
+                self.shuffle_index=np_rng.shuffle(range(self.num_samples))
                 for i in range(self.num_datasets):
                     self.desc += dataset_builders[i].prefix + ","
 
@@ -125,13 +127,9 @@ def __len__(self):
 
             @dlp.log
             def __getitem__(self, idx):
-                if idx >= self.num_samples:
-                    print_rank_0(f"WARNING: index overflow encountered {idx} > {self.num_samples} for {self.dataset_builders[0].corpus}; will randomly pick one sample")
-                    id = np.random.randint(self.num_samples)
-                else:
-                    id = idx
-                i = self.dataset_index[idx]
-                j = self.dataset_sample_index[idx]
+                id_shuffle = self.shuffle_index[idx]
+                i = self.dataset_index[id_shuffle]
+                j = self.dataset_sample_index[id_shuffle]
                 if self.dataset_builders[i].build:
                     return self.dataset_builders[i].dataset[j]
                 else:

From 573b668ef20f4f5060937511169dfcd2c877a74c Mon Sep 17 00:00:00 2001
From: Huihuo Zheng <zhenghh04@gmail.com>
Date: Sat, 12 Oct 2024 09:27:04 -0500
Subject: [PATCH 06/31] Bind Profile("PIPELINE") to `dlp` in p2p_communication.py

---
 megatron/core/pipeline_parallel/p2p_communication.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/megatron/core/pipeline_parallel/p2p_communication.py b/megatron/core/pipeline_parallel/p2p_communication.py
index 64f75ad0053..78e43e7fed1 100644
--- a/megatron/core/pipeline_parallel/p2p_communication.py
+++ b/megatron/core/pipeline_parallel/p2p_communication.py
@@ -17,7 +17,7 @@
 from megatron.core import ModelParallelConfig
 from deepspeed.accelerator import get_accelerator
 from megatron.utils import Profile
-Profile("PIPELINE")
+dlp = Profile("PIPELINE")
 # Types
 Shape = Union[List[int], torch.Size]
 

From 9de83a9684742081e8dabd82f59ede7c7fdfffb3 Mon Sep 17 00:00:00 2001
From: Sam Foreman <saforem2@gmail.com>
Date: Sat, 12 Oct 2024 15:56:18 -0500
Subject: [PATCH 07/31] Fix `shuffle_index`: shuffle an `np.arange` in place in `megatron/data/gpt_dataset.py`

---
 megatron/data/gpt_dataset.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/megatron/data/gpt_dataset.py b/megatron/data/gpt_dataset.py
index c801a6a5ae5..a8457609db2 100755
--- a/megatron/data/gpt_dataset.py
+++ b/megatron/data/gpt_dataset.py
@@ -117,6 +117,8 @@ def _build_indices():
 
                 self.dataset_index, self.dataset_sample_index = _build_indices()
                 np_rng = np.random.RandomState(seed=dataset_builders[0].seed)
+                self.shuffle_index = np.arange(self.num_samples)
+                np_rng.shuffle(self.shuffle_index)
                 self.shuffle_index=np_rng.shuffle(range(self.num_samples))
                 for i in range(self.num_datasets):
                     self.desc += dataset_builders[i].prefix + ","

From d7a2594a87fd0ffcd8eb3a6f102af809cf397ab2 Mon Sep 17 00:00:00 2001
From: Sam Foreman <saforem2@gmail.com>
Date: Sat, 12 Oct 2024 16:24:02 -0500
Subject: [PATCH 08/31] Fix `shuffle_index`: drop leftover `np_rng.shuffle(range(...))` assignment in `megatron/data/gpt_dataset.py`

---
 megatron/data/gpt_dataset.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/megatron/data/gpt_dataset.py b/megatron/data/gpt_dataset.py
index a8457609db2..8501324752e 100755
--- a/megatron/data/gpt_dataset.py
+++ b/megatron/data/gpt_dataset.py
@@ -119,7 +119,6 @@ def _build_indices():
                 np_rng = np.random.RandomState(seed=dataset_builders[0].seed)
                 self.shuffle_index = np.arange(self.num_samples)
                 np_rng.shuffle(self.shuffle_index)
-                self.shuffle_index=np_rng.shuffle(range(self.num_samples))
                 for i in range(self.num_datasets):
                     self.desc += dataset_builders[i].prefix + ","
 

From 3e33a6a66d0dae2328e02cad1c760b862c0064dc Mon Sep 17 00:00:00 2001
From: Sam Foreman <saforem2@gmail.com>
Date: Sun, 13 Oct 2024 10:15:30 -0500
Subject: [PATCH 09/31] Update `ALCF/helpers.sh`, `train_aGPT_7B.sh`

---
 ALCF/helpers.sh  | 432 +++++++++++++++++++++++++++++++----------------
 train_aGPT_7B.sh |  13 +-
 2 files changed, 297 insertions(+), 148 deletions(-)

diff --git a/ALCF/helpers.sh b/ALCF/helpers.sh
index bc31c322ce7..5df9a2c7a58 100644
--- a/ALCF/helpers.sh
+++ b/ALCF/helpers.sh
@@ -120,14 +120,15 @@ setup() {
     # Create `deepspeed_config.json` from runtime params from ^
     buildDSconfig || exit
     # Specify output directory for {logs, checkpoints, etc.}
+    setup_checkpoint || exit
     setOutput || exit
     # Specify additional `deepspeed` arguments (dependent on _newly created_ variables)
     set_args || exit
     # Ensure executable exists in expected path
     check_executable "${EXEC:-${WORKING_DIR}/pretrain_gpt_alcf.py}"
-    dfl="${DATA_FILE_LIST:-}"
+    dfl="${DATA_FILE_LIST:-"${PBS_O_WORKDIR}/ALCF/data-lists/$(get_machine_name)/dolma.txt"}"
     # Setup data + tokenizer via `DATA_FILE_LIST` and `TOKENIZER_TYPE`
-    tok="${TOKENIZER_TYPE:-Llama2}"
+    tok="${TOKENIZER_TYPE:-Llama2Tokenizer}"
     setup_tokenizer_and_data "${tok}" "${dfl}" || exit
     make_data || exit
     # Print job info
@@ -140,6 +141,7 @@ setup() {
     setup_run_cmd "$@" || exit
 }
 
+
 #####################################################
 # setup_run_cmd
 #
@@ -150,7 +152,8 @@ setup_run_cmd() {
     # take in additional arguments
     # and append them directly to
     # the end of the `run_cmd`
-    custom_args="$@"
+    # custom_args="$@"
+    custom_args=("$@")
     ##############################
     #### Make it easy to track experiments by date ###################
     year="$(date "+%Y")"
@@ -168,78 +171,122 @@ setup_run_cmd() {
     # `export LAUNCH_WITH=deepspeeed && bash train_llama_alcf.sh`
     ##################################################################
     setupLauncher "${LAUNCH_WITH:-MPICH}" || exit
-    TBDIR="${CKPT_DIR}/tensorboard"
-    mkdir -p "${TBDIR}"
     export data_cache_path="${CKPT_DIR}/${DATA_CACHE_PATH}" && mkdir -p "${data_cache_path}"
     printf "\n"
     echo "Using data_cache_path: ${data_cache_path}"
-    export DEFAULTS="\
-        --split 100,0,0 \
-        --log-interval 1 \
-        --no-bias-gelu-fusion \
-        --no-bias-dropout-fusion \
-        --no-masked-softmax-fusion \
-        --no-gradient-accumulation-fusion \
-        --accumulate-allreduce-grads-in-fp32 \
-        --log-timers-to-tensorboard \
-        --log-optimizer-states-to-tensorboard"
-    OVERRIDE_CKPT_OPT_PARAM="${OVERRIDE_CKPT_OPT_PARAM:-}"
+    TRAIN_SPLIT="${TRAIN_SPLIT:-100}"
+    VAL_SPLIT="${VAL_SPLIT:-0}"
+    TEST_SPLIT="${TEST_SPLIT:-0}"
+    LOG_INTERVAL="${LOG_INTERVAL:-1}"
+    DEFAULTS=(
+        "--split ${TRAIN_SPLIT},${VAL_SPLIT},${TEST_SPLIT}"
+        "--log-interval ${LOG_INTERVAL}"
+        "--no-bias-gelu-fusion"
+        "--no-bias-dropout-fusion"
+        "--no-masked-softmax-fusion"
+        "--no-gradient-accumulation-fusion"
+        "--accumulate-allreduce-grads-in-fp32"
+    )
+    # export DEFAULTS="\
+    #     --split ${TRAIN_SPLIT},${VAL_SPLIT},${TEST_SPLIT} \
+    #     --log-interval ${LOG_INTERVAL} \
+    #     --no-bias-gelu-fusion \
+    #     --no-bias-dropout-fusion \
+    #     --no-masked-softmax-fusion \
+    #     --no-gradient-accumulation-fusion \
+    #     --accumulate-allreduce-grads-in-fp32"
+    # OVERRIDE_CKPT_OPT_PARAM="${OVERRIDE_CKPT_OPT_PARAM:-}"
     if [[ -z "${OVERRIDE_CKPT_OPT_PARAM:-}" ]]; then
-        DEFAULTS="${DEFAULTS} --use-checkpoint-opt_param-scheduler"
+        DEFAULTS+=("--use-checkpoint-opt_param-scheduler")
     fi
-    if [[ "${SP}" -ge 2 ]]; then
-        export DEFAULTS="${DEFAULTS} --ds-sequence-parallel-size ${SP} --force-ds-sequence-parallel"
+    if [[ "${SP}" -gt 1 ]]; then
+        DEFAULTS+=(
+        "--ds-sequence-parallel-size ${SP}"
+        "--force-ds-sequence-parallel"
+        )
     fi
     ##################################################################
     # WARN: to disable Llama-type architectures, toggle via:
     # `NO_LLAMA=1 bash train_llama_alcf.sh`
     ##################################################################
-    if [[ -z "${NO_LLAMA:-}" ]]; then
-        llama_flags="${LLAMA_ARGS}\
-            --num-key-value-heads ${NUM_KV_HEAD} \
-            --ffn-hidden-size ${FFN_HIDDEN_SIZE} \
-            "
+    LLAMA_ARGS=""
+    if [[ "${SP}" == 1 ]]; then
+        export LLAMA_ARGS="${LLAMA_ARGS} "
     else
-        echo "!! Running in NO_LLAMA MODE !!"
-        llama_flags=""
+        export LLAMA_ARGS=""
+        echo "NOT USING ROTARY EMBEDDINGS! LLAMA_ARGS=${LLAMA_ARGS}"
     fi
-    export run_cmd="
-        ${LAUNCHER} \
-        --${DTYPE} \
-        ${DEFAULTS} \
-        --optimizer ${OPT} \
-        --adam-beta1=${ADAM_BETA1} \
-        --adam-beta2=${ADAM_BETA2} \
-        --adam-eps=${ADAM_EPS} \
-        --weight-decay=${WEIGHT_DECAY} \
-        --save ${CKPT_DIR} \
-        --load ${CKPT_DIR} \
-        --seq-length ${SEQ} \
-        --num-layers ${NLAYERS} \
-        --hidden-size ${HIDDEN} \
-        --tensorboard-dir ${TBDIR} \
-        --train-iters ${TRAIN_ITERS} \
-        --eval-iters ${EVAL_ITERS} \
-        --distributed-backend ${BE} \
-        --num-attention-heads ${HEADS} \
-        --save-interval ${SAVE_INTERVAL} \
-        --eval-interval ${EVAL_INTERVAL} \
-        --max-position-embeddings ${SEQ} \
-        --micro-batch-size ${MICRO_BATCH} \
-        --tensor-model-parallel-size ${TP} \
-        --global-batch-size ${GLOBAL_BATCH} \
-        --pipeline-model-parallel-size ${PP} \
-        --data-cache-path ${data_cache_path} \
-        ${DATA_FLAGS} \
-        ${LR_ARGS} \
-        ${llama_flags} \
-        ${FLASH_ARG} \
-        ${TIMING_STR} \
-        ${TOKENIZER_FLAGS} \
-        ${ds_args} \
-        ${gpt_args[*]} \
-        ${custom_args}
-        "
+    if [[ -z "${NO_LLAMA:-}" ]]; then
+        llama_flags=(
+			"--swiglu"
+            "--hidden-dropout 0"
+            "--attention-dropout 0"
+			"--normalization rmsnorm"
+			"--disable-bias-linear"
+            "--no-query-key-layer-scaling"
+			"--use-rotary-position-embeddings"
+			"--untie-embeddings-and-output-weights"
+            "--num-key-value-heads ${NUM_KV_HEAD}"
+            "--ffn-hidden-size ${FFN_HIDDEN_SIZE}"
+        )
+    fi
+
+    TENSORBARD_ARGS=()
+    if [[ -z "${USE_TENSORBARD:-}" ]]; then
+        TBDIR="${CKPT_DIR}/tensorboard"
+        mkdir -p "${TBDIR}"
+        # --log-timers-to-tensorboard \
+        # --log-optimizer-states-to-tensorboard"
+        # --tensorboard-dir ${TBDIR} \
+        TENSORBARD_ARGS+=(
+            "--log-timers-to-tensorboard"
+            "--log-optimizer-states-to-tensorboard"
+            "--tensorboard-dir ${TBDIR}"
+        )
+    fi
+    dfl_fallback="${DATA_FILE_LIST:-${PBS_O_WORKDIR}/ALCF/data-lists/$(get_machine_name)/dolma.txt}"
+    export ADAM_BETA1="${ADAM_BETA1:-0.9}"
+    export ADAM_BETA2="${ADAM_BETA2:-0.95}"
+    export ADAM_EPS="${ADAM_EPS:-0.00001}" # 1 * 10^{-5}
+    export run_cmd=(
+        "${LAUNCHER}"
+        "--${DTYPE}"
+        "${DEFAULTS[@]}"
+        "--optimizer ${OPT}"
+        "--save ${CKPT_DIR}"
+        "--load ${CKPT_DIR}"
+        "--seq-length ${SEQ}"
+        "--num-layers ${NLAYERS}"
+        "--hidden-size ${HIDDEN}"
+        "--train-iters ${TRAIN_ITERS}"
+        "--eval-iters ${EVAL_ITERS}"
+        "--distributed-backend ${BE}"
+        "--adam-beta1 ${ADAM_BETA1:-0.9}"
+        "--adam-beta2 ${ADAM_BETA2:-0.95}"
+        "--adam-eps ${ADAM_EPS:-0.00001}"
+        "--clip-grad ${CLIP_GRAD:-1.0}"
+        "--weight-decay ${WEIGHT_DECAY:-0.1}"
+        "--num-attention-heads ${HEADS}"
+        "--save-interval ${SAVE_INTERVAL}"
+        "--eval-interval ${EVAL_INTERVAL}"
+        "--max-position-embeddings ${SEQ}"
+        "--micro-batch-size ${MICRO_BATCH}"
+        "--tensor-model-parallel-size ${TP}"
+        "--global-batch-size ${GLOBAL_BATCH}"
+        "--pipeline-model-parallel-size ${PP}"
+        "--data-cache-path ${data_cache_path}"
+        "--data-file-list ${DATA_FILE_LIST:-${dfl_fallback}}"
+        "${TENSORBARD_ARGS[@]}"
+        "${DATA_FLAGS}"
+        "${LR_ARGS}"
+        "${llama_flags[@]}"
+        "${FLASH_ARG}"
+        "${TIMING_STR}"
+        "${TOKENIZER_FLAGS}"
+        "${ds_args[@]}"
+        "${gpt_args[@]}"
+        "${custom_args[@]}"
+    )
 }
 
 save_dotenv() {
@@ -518,7 +565,6 @@ set_ccl_vars_on_aurora() {
 ##############################################################################
 setParams() {
     FLASH_ARG=""
-    LLAMA_ARGS="--attention-dropout 0 --hidden-dropout 0"
     # ---- [Parallelism Settings] -------------------------------------------+
     # ------ [Aurora] -------||------ [SunSpot] -------------
     # if [[ $(hostname) == x4* || $(hostname) == x1* ]]; then
@@ -600,9 +646,9 @@ setParams() {
     export FLASH_ARG="${FLASH_ARG}"
     export DTYPE="${DTYPE:-bf16}"
     export OPT="${OPT:-adamw}"
-    export ADAM_BETA1="${ADAM_BETA1:-0.9}"
-    export ADAM_BETA2="${ADAM_BETA2:-0.95}"
-    export ADAM_EPS="${ADAM_EPS:-0.00001}" # 1 * 10^{-5}
+    # export ADAM_BETA1="${ADAM_BETA1:-0.9}"
+    # export ADAM_BETA2="${ADAM_BETA2:-0.95}"
+    # export ADAM_EPS="${ADAM_EPS:-0.00001}" # 1 * 10^{-5}
     export WEIGHT_DECAY="${WEIGHT_DECAY:-0.1}"
     export HOSTFILE="${HOSTFILE:-${PBS_NODEFILE}}"
     NHOSTS=$(wc -l <"${HOSTFILE}")
@@ -648,24 +694,19 @@ setParams() {
     #
     #   For this reason, we only use the default LLAMA_ARGS when SP=0.
     ##########################################################################
-    if [[ "${SP}" == 1 ]]; then
-        export LLAMA_ARGS="${LLAMA_ARGS} --no-query-key-layer-scaling --use-rotary-position-embeddings --untie-embeddings-and-output-weights --swiglu --normalization rmsnorm --disable-bias-linear"
-    else
-        export LLAMA_ARGS=""
-        echo "NOT USING ROTARY EMBEDDINGS! LLAMA_ARGS=${LLAMA_ARGS}"
-    fi
     # -----[Learning Rate Settings]--------------------------------------------
     export LR=${LR:-0.0003}                       # LEARNING_RATE
     export LR_WARMUP_FRAC=${LR_WARMUP_FRAC:-0.05} # LEARNING RATE WARMUP
     export LR_DECAY_ITERS=${LR_DECAY_ITERS:-}     # LR DECAY ITERS
     set_lr_args
     # -----[Learning Rate Settings]--------------------------------------------
-    if [[ "${TIMING_LOG_LEVEL}" -ge 1 ]]; then
+    # if [[ "${TIMING_LOG_LEVEL:-1}" -gt 1 ]]; then
+    if [[ "${TIMING_LOG_LEVEL:-1}" -gt 1 ]]; then
         TIMING_STR="\
-            --timing-log-level ${TIMING_LOG_LEVEL} \
-            --log-timers-to-tensorboard \
-            --log-optimizer-states-to-tensorboard \
-        "
+            --timing-log-level ${TIMING_LOG_LEVEL}"
+            # --log-timers-to-tensorboard \
+            # --log-optimizer-states-to-tensorboard \
+        # "
     else
         TIMING_STR=""
     fi
@@ -679,19 +720,31 @@ setParams() {
 ##############################################
 set_args() {
     # ---- Set DeepSpeed arguments --------------------------------
-    ds_args=" "
-    ds_args=" --deepspeed ${ds_args}"
-    if [[ $PP == 1 ]]; then
-        ds_args=" --no-pipeline-parallel ${ds_args}"
+    ds_args=(
+        "--deepspeed"
+    )
+    if [[ "${PP:-1}" == 1 ]]; then
+        ds_args+=("--no-pipeline-parallel")
     fi
-    ds_args=" --deepspeed_config=$DS_CONFIG ${ds_args}"
-    ds_args=" --zero-stage=$ZERO_STAGE ${ds_args}"
+    ds_args+=("--deepspeed_config=${DS_CONFIG}")
+    ds_args+=("--zero-stage=$ZERO_STAGE")
     if [[ "${ZERO_STAGE}" == 3 ]]; then
-        ds_args="--use-mics ${ds_args}"
+        ds_args+=("--use-mics")
     fi
+    # ds_args=" "
+    # ds_args=" --deepspeed ${ds_args}"
+    # if [[ $PP == 1 ]]; then
+    #     ds_args=" --no-pipeline-parallel ${ds_args}"
+    # fi
+    # ds_args=" --deepspeed_config=$DS_CONFIG ${ds_args}"
+    # ds_args="--zero-stage=$ZERO_STAGE ${ds_args}"
+    # if [[ "${ZERO_STAGE}" == 3 ]]; then
+    #     ds_args="--use-mics ${ds_args}"
+    # fi
     if [[ "$USE_ACTIVATION_CHECKPOINTING" == 1 ]]; then
         echo "!! Caught USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING} !!"
-        ds_args=" --deepspeed-activation-checkpointing ${ds_args}"
+        ds_args+=("--deepspeed-activation-checkpointing")
+        # ds_args=" --deepspeed-activation-checkpointing ${ds_args}"
         # --checkpoint-activations \
         # --deepspeed-activation-checkpointing
     fi
@@ -804,7 +857,8 @@ get_output_prefix() {
     pre="${pre}_sp${SP}_pp${PP}_tp${TP}_${DTYPE}_opt${OPT}"
     pre="${pre}_lr${LR}_lwf${LR_WARMUP_FRAC}"
     if [[ -n "${TOKENIZER_TYPE:-}" ]]; then
-        pre="${pre}_tok${TOKENIZER_TYPE}"
+        _tok=$(echo "${TOKENIZER_TYPE}" | sed 's/Tokenizer//g')  # noqa
+        pre="${pre}_tok${_tok}"
     fi
     if [[ -n "${LR_DECAY_ITERS}" ]]; then
         pre="${pre}_ldi${LR_DECAY_ITERS}"
@@ -822,9 +876,21 @@ setOutput() {
     OUTPUT_DIR="logs/${OUTPUT_PREFIX}/$(date +%Y%m%d-%H%M%S)_${WORLD_SIZE}_${HOSTNAME}"
     export OUTPUT_DIR="${OUTPUT_DIR}" && mkdir -p "${OUTPUT_DIR}"
     export OUTPUT_LOG="${OUTPUT_DIR}/output.log"
-    export CKPT_DIR="checkpoints/${OUTPUT_PREFIX}"
     echo "${OUTPUT_LOG}" >>"logs/latest"
     printf "\n Please see logs at: %s\n" "$(printGreen "${OUTPUT_DIR}")"
+}
+
+get_checkpoint_dir() {
+    if [[ -n "${CKPT_DIR:-}" ]]; then
+        echo "${CKPT_DIR}"
+    else
+        echo "checkpoints/$(get_output_prefix)"
+    fi
+}
+
+setup_checkpoint() {
+    ckpt_dir=$(get_checkpoint_dir)
+    export CKPT_DIR="${ckpt_dir}"
     printf "Checkpoints will be saved to: %s\n" "$(printYellow "${CKPT_DIR}")"
 }
 
@@ -832,7 +898,7 @@ setOutput() {
 # Build DeepSpeed config and write to .json
 #############################################
 buildDSconfig() {
-    export CPU_OPTIMIZER="${CPU_OPTIMIZER:-0}"
+    # export CPU_OPTIMIZER="${CPU_OPTIMIZER:-0}"
     export DS_CONFIG="${WORKING_DIR}/ds-configs/ds_stage${ZERO_STAGE}_mb${MICRO_BATCH}_gb${GLOBAL_BATCH}_pp${PP}_${DTYPE}.json"
     mkdir -p "$(dirname "${DS_CONFIG}")"
     echo "DS_CONFIG: ${DS_CONFIG}"
@@ -893,31 +959,6 @@ install_dependencies() {
     fi
 }
 
-######################################################################
-# install_deepspeed_for_xpu
-#
-# Install microsoft/DeepSpeed on PVC
-#
-# This will:
-# 1. Clone rep
-# 2. Checkout appropriate branch
-# 3. Install into virtual environment
-######################################################################
-install_deepspeed_for_xpu() {
-    # python3 -m pip install "torch==2.1.0.post2" torchvision==0.16.0.post2 torchaudio==2.1.0.post2 intel-extension-for-pytorch==2.1.30.post0 oneccl_bind_pt==2.1.300+xpu --extra-index-url "https://pytorch-extension.intel.com/release-whl/stable/xpu/us/"
-    echo "Building + Installing DeepSpeed on $(hostname)"
-    outdir="${WORKING_DIR}/deps/DeepSpeed"
-    mkdir -p "${outdir}"
-    git clone https://github.com/microsoft/DeepSpeed.git "${outdir}"
-    cd "${outdir}" || exit
-    echo "[install_deepspeed_for_xpu] !! pwd: $(pwd)"
-    python3 -m pip install --require-virtualenv -r requirements/requirements.txt 1>/dev/null
-    python3 -m pip install xgboost "numpy<2" --force-reinstall --upgrade --require-virtualenv 1>/dev/null
-    python setup.py develop 1>/dev/null
-    cd "${WORKING_DIR}"
-    echo "[install_deepspeed_for_xpu] !! pwd: $(pwd)"
-}
-
 #################################################
 # Fix for distributed key value store on Aurora
 #################################################
@@ -1003,9 +1044,11 @@ setup_tokenizer_and_data() {
     fi
     echo "Setting up tokenizer with ${tok}"
     echo "Using data_file_list: ${dfl}"
+    _data_flags=()
+    _tokenizer_flags=()
     if [[ ${tok} == gpt* || ${tok} == GPT* ]]; then
         export TOKENIZER_TYPE="GPT2"
-        export TOKENIZER_FLAGS="--tokenizer-type GPT2BPETokenizer"
+        _tokenizer_flags+=("--tokenizer-type GPT2BPETokenizer")
         machine=$(get_machine_name)
         if [[ ${machine} == "polaris" ]]; then
             export DATA_PARENT="${DATA_PARENT:-/eagle/argonne_tpc/foremans/projects/argonne-lcf/Megatron-DeepSpeed/dataset}"
@@ -1019,18 +1062,25 @@ setup_tokenizer_and_data() {
         export VOCAB_FILE="${DATA_PARENT}/gpt2-vocab.json"
         export MERGE_FILE="${DATA_PARENT}/gpt2-merges.txt"
         export DATA_PATH="${DATA_PARENT}/BookCorpusDataset_text_document"
-        export DATA_FLAGS="--data-path ${DATA_PATH} --vocab-file ${VOCAB_FILE} --merge-file ${MERGE_FILE}"
+        _data_flags+=(
+            "--data-path ${DATA_PATH}"
+            "--vocab-file ${VOCAB_FILE}"
+            "--merge-file ${MERGE_FILE}"
+        )
     else
-        export DATA_FLAGS=""
-        export TOKENIZER_TYPE="Llama2"
+        export TOKENIZER_TYPE="${TOKENIZER_TYPE:-Llama2Tokenizer}"
         tm="${WORKING_DIR}/ALCF/tokenizer.model"           # fallback: Megatron-DeepSpeed/ALCF/tokenizer.model
         export TOKENIZER_MODEL="${TOKENIZER_MODEL:-${tm}}" # USE TOKENIZER_MODEL from env, else fallback from ^
-        export TOKENIZER_FLAGS="--tokenizer-type Llama2Tokenizer --tokenizer-model ${TOKENIZER_MODEL}"
-        if [[ "${TOKENIZER_TYPE}" != "GPT2" ]]; then
-            echo "Using tokenizer: ${TOKENIZER_TYPE}. Setting up data with ${DATA_FILE_LIST-}"
-            setData "${dfl}" || exit
-        fi
+        _tokenizer_flags+=(
+            "--tokenizer-type ${TOKENIZER_TYPE}"
+            "--tokenizer-model ${TOKENIZER_MODEL}"
+        )
+        # if [[ "${TOKENIZER_TYPE}" != "GPT2" ]]; then
+        echo "Using tokenizer: ${TOKENIZER_TYPE}. Setting up data with ${DATA_FILE_LIST:-}"
+        setData "${dfl}" || exit
     fi
+    export DATA_FLAGS="${_data_flags[*]}"
+    export TOKENIZER_FLAGS="${_tokenizer_flags[*]}"
     printf "[setData] DATA_FLAGS: %s\n" "$(printGreen "${DATA_FLAGS}")"
     printf "[setData] TOKENIZER_FLAGS: %s\n" "$(printMagenta "${TOKENIZER_FLAGS}")"
 }
@@ -1059,7 +1109,7 @@ setData() { # ------------------------[dfl: abbrv. for DATA_FILE_LIST]
     export WEIGHT_SUM="${ws}"
     export DFL_STEM="${dfl_stem}"
     export DATA_CACHE_PATH="${dcp}"
-    export DATA_FLAGS="${DATA_FLAGS} --data-file-list ${DATA_FILE_LIST}" #  --data-cache-path ${DATA_CACHE_PATH}"
+    # export DATA_FLAGS="${DATA_FLAGS} --data-file-list ${DATA_FILE_LIST}"   #  --data-cache-path ${DATA_CACHE_PATH}"
     echo "--------------------"
     echo "Updated environment:"
     printf "DATA_FILE_LIST: %s\n" "${DATA_FILE_LIST}"
@@ -1071,6 +1121,30 @@ setData() { # ------------------------[dfl: abbrv. for DATA_FILE_LIST]
     echo "--------------------"
 }
 
+# generateDSconfig_new
+# Write a minimal DeepSpeed config (ZeRO stage, bf16, fp32 gradient accumulation)
+# to ${CONFIG_JSON}. CONFIG_JSON, GLOBAL_BATCH, MICRO_BATCH and ZERO_STAGE must be
+# set: an unset/empty one aborts here instead of producing an invalid JSON file.
+generateDSconfig_new() {
+    cat <<EOT > "${CONFIG_JSON:?CONFIG_JSON must be set}"
+    {
+    "train_batch_size" : ${GLOBAL_BATCH:?GLOBAL_BATCH must be set},
+    "train_micro_batch_size_per_gpu": ${MICRO_BATCH:?MICRO_BATCH must be set},
+    "steps_per_print": 1,
+    "zero_optimization": {
+        "stage": ${ZERO_STAGE:?ZERO_STAGE must be set}
+    },
+    "bf16": {
+        "enabled": true
+    },
+    "data_types": {
+            "grad_accum_dtype": "fp32"
+    },
+    "wall_clock_breakdown" : false
+    }
+EOT
+}
+
 ################################################################################
 # generateDSconfig
 #
@@ -1089,16 +1163,6 @@ generateDSconfig() {
             exit 1
         fi
     done
-    # \"optimizer\": {
-    #   \"type\": \"AdamW\",
-    #   \"params\": {
-    #     \"lr\": ${LR},
-    #     \"beta1\": 0.9,
-    #     \"beta2\": 0.95,
-    #     \"eps\": 1e-5,
-    #     \"weight_decay\": 1e-1
-    #   }
-    # },
     # \"scheduler\": {
     #   \"type\": \"WarmupLR\",
     #   \"params\": {
@@ -1113,13 +1177,17 @@ generateDSconfig() {
         \"train_micro_batch_size_per_gpu\": $MICRO_BATCH,
         \"steps_per_print\": 1,
         \"gradient_accumulation_steps\": $GRAD_ACC_STEPS,
+        \"zero_force_ds_cpu_optimizer\": false,
         \"zero_allow_untested_optimizer\": true,
         \"gradient_clipping\": 1.0,
-        \"activation_checkpointing\": {
-          \"partition_activations\": true,
-          \"contiguous_memory_optimization\": true
-        },
         \"wall_clock_breakdown\": false,"
+    if [[ "${USE_ACTIVATION_CHECKPOINTING}" == 1 ]]; then
+        activation_checkpointing="\
+            \"activation_checkpointing\": {
+            \"partition_activations\": true,
+            \"contiguous_memory_optimization\": true
+            },"
+    fi
     flops_profiler="\
         \"flops_profiler\": {
           \"enabled\": true,
@@ -1160,6 +1228,20 @@ generateDSconfig() {
     else
         dtype="\"communication_data_type\": \"fp32\","
     fi
+    if [[ "${OPT:-adamw}" == "ds.adamw" ]]; then # emit an "optimizer" section only for OPT=ds.adamw
+        optimizer="\
+            \"optimizer\": {
+                \"type\": \"AdamW\",
+                \"params\": {
+                    \"lr\": ${LR},
+                    \"beta1\": 0.9, \"beta2\": 0.95,
+                    \"eps\": 1e-5,
+                    \"weight_decay\": 1e-1
+                }
+            },"
+    else
+        optimizer=""
+    fi
     if [[ "${ZERO_STAGE}" == 3 ]]; then
         # \"mics_shard_size\": 2,
         zero="\
@@ -1185,8 +1267,7 @@ generateDSconfig() {
             },"
     # elif [[ $ZERO_STAGE == 2 ]]; then
     elif [[ "${ZERO_STAGE}" == 2 || "${ZERO_STAGE}" == 1 ]]; then
-        # if [[ -n "${CPU_OPTIMIZER}" ]]; then
-        if [[ "${CPU_OPTIMIZER:-0}" != 0 ]]; then
+        if [[ "${CPU_OPTIMIZER:-0}" != 0 ]]; then # CPU-optimizer branch only when CPU_OPTIMIZER is set and not 0
             echo "!!!! CAUGHT CPU_OPTIMIZER !!!!"
             zero="\
                 \"zero_optimization\": {
@@ -1215,9 +1296,8 @@ generateDSconfig() {
         else
             extra="\
                 \"comms_logger\": {
-                \"enabled\": true,
+                \"enabled\": ${COMMS_LOGGER:-false},
                 \"verbose\": false,
-                \"prof_all\": true,
                 \"debug\": false
               },"
         fi
@@ -1227,6 +1307,7 @@ generateDSconfig() {
     cat <<EOT >"$1"
 {
 $common
+$optimizer
 $zero
 $dtype
 $extra
@@ -1304,6 +1385,73 @@ printWhite() {
     printf "\e[1;37m%s\e[0m\n" "$@"
 }
 
+reset_env() { # Unset the run-configuration variables listed below in the current shell
+    local custom_vars=( # local: do not leak the list itself into the environment being reset
+        NO_FLASH_ATTN
+        TP
+        PP
+        SP
+        FLASH_ARG
+        OPT
+        ADAM_BETA1
+        ADAM_BETA2
+        ADAM_EPS
+        WEIGHT_DECAY
+        HEADS
+        NLAYERS
+        HIDDEN
+        NUM_KV_HEAD
+        FFN_HIDDEN_SIZE
+        SEQ
+        ZERO_STAGE
+        MICRO_BATCH
+        EVAL_ITERS
+        EVAL_INTERVAL
+        TIMING_LOG_LEVEL
+        ACT_CKPT_NUM_LAYERS
+        USE_ACTIVATION_CHECKPOINTING
+        GLOBAL_BATCH_MAX
+        GLOBAL_BATCH
+        TRAIN_TOKENS
+        TRAIN_ITERS
+        MODEL_TYPE
+        LLAMA_ARGS
+        LR
+        LR_WARMUP_FRAC
+        LR_DECAY_ITERS
+        LR_ARGS
+        CPU_OPTIMIZER
+        DS_CONFIG
+        OUTPUT_DIR
+        OUTPUT_LOG
+        CKPT_DIR
+        ds_args
+        EXEC
+        EXEC_STEM
+        DATA_FLAGS
+        TOKENIZER_TYPE
+        TOKENIZER_MODEL
+        TOKENIZER_FLAGS
+        DATA_FILE_LIST
+        NUM_DOCS
+        WEIGHT_SUM
+        DFL_STEM
+        DATA_CACHE_PATH
+        DOTENV_FILE
+        YEAR
+        MONTH
+        DAY
+        TODAY
+        STARTED_AT
+        LAUNCHER
+        data_cache_path
+        DEFAULTS
+    )
+    printf "Unsetting custom vars: %s\n" "${custom_vars[*]}"
+    unset -v "${custom_vars[@]}" # -v: only ever remove variables, never a same-named function
+}
+
+
 ###########################
 # call helpers_main()
 ###########################
diff --git a/train_aGPT_7B.sh b/train_aGPT_7B.sh
index a6a2db72abb..286740fc89f 100644
--- a/train_aGPT_7B.sh
+++ b/train_aGPT_7B.sh
@@ -16,19 +16,20 @@ source "${HERE}/ALCF/helpers.sh" || exit
 
 # 3. call `setup` from `./ALCF/helpers.sh`
 setup "$@" || exit
-export run_cmd="${run_cmd}"
-echo "${run_cmd}" | tee -a "${OUTPUT_LOG}"
+# export run_cmd="${run_cmd}"
+echo "${run_cmd[@]}" | tee -a "${OUTPUT_LOG}"
 
 # 4. Tell user where to find output
 printf "[!! %s] View output at:\n %s\n" "$(printBlue "NOTE")" "$(printYellow "${OUTPUT_LOG}")" | tee -a "${OUTPUT_LOG}"
 
-# 5. Ignore the following strings on Intel XPU devices
-#    (otherwise they'll clutter up logs)
-XPU_IGNORE_STRING="CCL_WARN|\ -\ INFO\ \-\ |real_accelerator\.py|numexpr\.utils|async_io|libaio"
+# # 5. Ignore the following strings on Intel XPU devices
+# #    (otherwise they'll clutter up logs)
+# XPU_IGNORE_STRING="CCL_WARN|\ -\ INFO\ \-\ |real_accelerator\.py|numexpr\.utils|async_io|libaio"
 
 # if [[ $(ezpz_get_machine_name) == "aurora" ]]; then
 #     module unload mpich && module load mpich
 # fi
 #
 # 6. Evaluate ${run_cmd} and append outputs to ${OUTPUT_LOG}
-eval "${run_cmd}" |& grep -E -v "${XPU_IGNORE_STRING}" |& tee -a "${OUTPUT_LOG}"
+# eval "${run_cmd[@]}" |& tee -a "${OUTPUT_LOG}"
+eval "${run_cmd[*]}" |& tee -a "${OUTPUT_LOG}"

From 43cde2b8f10171735dce68b899bd4a43f3158bfc Mon Sep 17 00:00:00 2001
From: Sam Foreman <saforem2@gmail.com>
Date: Sun, 13 Oct 2024 10:15:49 -0500
Subject: [PATCH 10/31] Update `pretrain_gpt_alcf.py`

---
 pretrain_gpt_alcf.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pretrain_gpt_alcf.py b/pretrain_gpt_alcf.py
index 12a05c52998..3686c6ceeb7 100644
--- a/pretrain_gpt_alcf.py
+++ b/pretrain_gpt_alcf.py
@@ -1,6 +1,7 @@
 # Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
 
 """Pretrain GPT"""
+
 import time
 from typing import Callable
 from mpi4py import MPI
@@ -103,7 +104,7 @@ def model_provider(pre_process=True, post_process=True):
     with deepspeed_zero_init(
         data_parallel_group=dpg,
         remote_device=(None if args.remote_device == "none" else args.remote_device),
-        config_dict_or_path=args.deepspeed_config_dict,
+        config_dict_or_path=args.deepspeed_config,  # _dict,
         enabled=args.zero_stage == 3,
         mpu=mpu,
     ):

From 9f097339de3f9dacd856b4649fb696c5d0591010 Mon Sep 17 00:00:00 2001
From: Sam Foreman <saforem2@gmail.com>
Date: Sun, 13 Oct 2024 10:16:22 -0500
Subject: [PATCH 11/31] Update
 `megatron/data/{blendable,gpt,indexed}_dataset.py`

---
 megatron/data/blendable_dataset.py |  24 +-
 megatron/data/gpt_dataset.py       | 885 +++++++++++++++++++----------
 megatron/data/indexed_dataset.py   | 217 ++++---
 3 files changed, 713 insertions(+), 413 deletions(-)

diff --git a/megatron/data/blendable_dataset.py b/megatron/data/blendable_dataset.py
index ba2e00b1efe..590a379971d 100755
--- a/megatron/data/blendable_dataset.py
+++ b/megatron/data/blendable_dataset.py
@@ -6,14 +6,20 @@
 import os
 import time
 
+import logging
 import numpy as np
 import torch
 
 from deepspeed.accelerator import get_accelerator
-from megatron import print_rank_0
+# from megatron import print_rank_0
 from megatron.core import mpu
 from megatron.utils import Profile, PerfTrace
 from mpi4py import MPI
+
+from megatron.utils import get_logger
+
+log = get_logger(__name__, rank_zero_only=True)
+
 dlp = Profile("DATASET")
 class BlendableDataset(torch.utils.data.Dataset):
     @dlp.log
@@ -43,7 +49,7 @@ def _build_indices():
             helpers.build_blending_indices(dataset_index, dataset_sample_index,
                                            weights, num_datasets, self.size,
                                            torch.distributed.get_rank() == 0)
-            print_rank_0('> elapsed time for building blendable dataset indices: '
+            log.info('> elapsed time for building blendable dataset indices: '
                          '{:.2f} (sec)'.format(time.time() - start_time))
             return dataset_index, dataset_sample_index
 
@@ -68,7 +74,7 @@ def _build_indices():
                       ' dataset, building indices on rank 0 ...', flush=True)
                 dataset_index, dataset_sample_index = _build_indices()
                 try:
-                    print_rank_0(" > saving index map files")
+                    log.info(" > saving index map files")
                     start_time = time.time()
                     os.makedirs(os.path.dirname(index_path), exist_ok=True)
                     with open(desc_path, 'wt') as fd:
@@ -76,7 +82,7 @@ def _build_indices():
                         np.save(index_path, dataset_index, allow_pickle=True)
                         np.save(sample_index_path, dataset_sample_index,
                                 allow_pickle=True)
-                    print_rank_0(f" > finished saving index map files in {time.time() - start_time} seconds")
+                    log.info(f" > finished saving index map files in {time.time() - start_time} seconds")
                 except OSError:
                     print(f'There was an error trying to create the data cache directory ({data_cache_path})')
                     print('or a file in it. This is set with the --data-cache-path argument. Please')
@@ -93,7 +99,7 @@ def _build_indices():
                     torch.distributed.get_world_size() //
                     torch.distributed.get_world_size(group=mpu.get_tensor_model_parallel_group()) //
                     torch.distributed.get_world_size(group=mpu.get_sequence_parallel_group())):
-                print_rank_0("Data index creation unsuccessful, exiting.")
+                log.info("Data index creation unsuccessful, exiting.")
                 exit()
             '''
             torch.distributed.barrier(group=mpu.get_data_parallel_group())
@@ -101,13 +107,13 @@ def _build_indices():
             torch.distributed.barrier(group=mpu.get_data_parallel_group())
             
             start_time = time.time()
-            print_rank_0(f'> loading blendable dataset index: {index_path}')
+            log.info(f'> loading blendable dataset index: {index_path}')
             self.dataset_index = np.load(index_path, allow_pickle=True, mmap_mode='r')
             assert self.dataset_index.size == self.size
-            print_rank_0(f'> loading blendable dataset sample index: {sample_index_path}')
+            log.info(f'> loading blendable dataset sample index: {sample_index_path}')
             self.dataset_sample_index = np.load(sample_index_path, allow_pickle=True, mmap_mode='r')
             assert self.dataset_sample_index.size == self.size
-            print_rank_0(f'> finished loading in {time.time() - start_time} seconds')            
+            log.info(f'> finished loading in {time.time() - start_time} seconds')            
         else:
             self.dataset_index, self.dataset_sample_index = _build_indices()
 
@@ -119,7 +125,7 @@ def _build_indices():
             raise RuntimeError('BlendedDataset size is improperly bounded')
         except IndexError:
             pass
-        print_rank_0('> size of blendable dataset: '
+        log.info('> size of blendable dataset: '
                      '{} samples'.format(self.size))
 
 
diff --git a/megatron/data/gpt_dataset.py b/megatron/data/gpt_dataset.py
index 8501324752e..0a01ea31caa 100755
--- a/megatron/data/gpt_dataset.py
+++ b/megatron/data/gpt_dataset.py
@@ -9,67 +9,96 @@
 import numpy as np
 import torch
 from deepspeed.accelerator import get_accelerator
-from megatron import print_rank_0, is_rank_0, get_args
+from megatron import is_rank_0, get_args
 from megatron.core import mpu
-from megatron.data import helpers
+from megatron.data import helpers  # type:ignore
 from megatron.data.blendable_dataset import BlendableDataset
-from megatron.data.dataset_utils import get_datasets_weights_and_num_samples, get_datasets_corpuses_weights_and_num_samples
+from megatron.data.dataset_utils import (
+    get_datasets_weights_and_num_samples,
+    get_datasets_corpuses_weights_and_num_samples,
+)
 from megatron.data.dataset_utils import get_train_valid_test_split_
 from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset
 
-from megatron.utils import PerfTrace, Profile
+from megatron.utils import PerfTrace, Profile, get_logger
 from mpi4py import MPI
 
 dlp = Profile("DATASET")
 
+log = get_logger(__name__, rank_zero_only=True)
+
+
 @dlp.log
-def build_train_valid_test_datasets(data_prefix, data_impl, splits_string,
-                                    train_valid_test_num_samples,
-                                    seq_length, seed, skip_warmup,
-                                    train_data_prefix=None,
-                                    valid_data_prefix=None,
-                                    test_data_prefix=None,
-                                    return_doc_ids=False, *,
-                                    data_cache_path=None):
+def build_train_valid_test_datasets(
+    data_prefix,
+    data_impl,
+    splits_string,
+    train_valid_test_num_samples,
+    seq_length,
+    seed,
+    skip_warmup,
+    train_data_prefix=None,
+    valid_data_prefix=None,
+    test_data_prefix=None,
+    return_doc_ids=False,
+    *,
+    data_cache_path=None,
+):
     """Build train, valid, and test datasets."""
 
     if data_prefix:
-        print_rank_0("Single data path provided for train, valid & test")
+        log.debug("Single data path provided for train, valid & test")
 
         # Single dataset.
         if len(data_prefix) == 1:
-            return _build_train_valid_test_datasets(data_prefix[0],
-                                                    data_impl, splits_string,
-                                                    train_valid_test_num_samples,
-                                                    seq_length, seed, skip_warmup,
-                                                    data_cache_path=data_cache_path)
+            return _build_train_valid_test_datasets(
+                data_prefix[0],
+                data_impl,
+                splits_string,
+                train_valid_test_num_samples,
+                seq_length,
+                seed,
+                skip_warmup,
+                data_cache_path=data_cache_path,
+            )
 
         # Blending dataset.
         # Parse the values.
-        output = get_datasets_corpuses_weights_and_num_samples(data_prefix,
-                                                      train_valid_test_num_samples)
+        output = get_datasets_corpuses_weights_and_num_samples(
+            data_prefix, train_valid_test_num_samples
+        )
         prefixes, corpuses, weights, datasets_train_valid_test_num_samples = output
         corpus_list = sorted(set(corpuses))
         train_num_samples, valid_num_samples, test_num_samples = map(
-            sum,
-            zip(*datasets_train_valid_test_num_samples)
+            sum, zip(*datasets_train_valid_test_num_samples)
         )
 
         class DatasetBuilder:
-            ''' 
+            """
             This is for building individual dataset from each dataset file
-            '''
+            """
+
             @dlp.log
-            def __init__(self, prefix, corpus, data_impl, splits_string,
-                         num_samples, seq_length, seed, skip_warmup,
-                         return_doc_ids,
-                         data_cache_path=data_cache_path, name='train'):
+            def __init__(
+                self,
+                prefix,
+                corpus,
+                data_impl,
+                splits_string,
+                num_samples,
+                seq_length,
+                seed,
+                skip_warmup,
+                return_doc_ids,
+                data_cache_path=data_cache_path,
+                name="train",
+            ):
                 self.prefix = prefix
                 self.data_impl = data_impl
                 self.splits_string = splits_string
-                if name == 'train':
+                if name == "train":
                     self.num_samples = num_samples[0]
-                elif name == 'valid':
+                elif name == "valid":
                     self.num_samples = num_samples[1]
                 else:
                     self.num_samples = num_samples[2]
@@ -84,11 +113,21 @@ def __init__(self, prefix, corpus, data_impl, splits_string,
                 self.desc = prefix + f"{self.num_samples}" + f"{seq_length}" + f"{seed}"
                 self.build = False
                 self.corpus = corpus
+
             @dlp.log
             def Build(self):
-                self.dataset = _build_train_valid_test_datasets_single(self.prefix, self.data_impl, self.splits_string,
-                    self.num_samples_train_valid_test, self.seq_length, self.seed, self.skip_warmup, self.name, self.return_doc_ids, 
-                    data_cache_path=self.data_cache_path)
+                self.dataset = _build_train_valid_test_datasets_single(
+                    self.prefix,
+                    self.data_impl,
+                    self.splits_string,
+                    self.num_samples_train_valid_test,
+                    self.seq_length,
+                    self.seed,
+                    self.skip_warmup,
+                    self.name,
+                    self.return_doc_ids,
+                    data_cache_path=self.data_cache_path,
+                )
                 self.build = True
                 return self.dataset
 
@@ -98,21 +137,27 @@ def __init__(self, dataset_builders):
                 self.dataset_builders = dataset_builders
                 self.num_datasets = len(dataset_builders)
                 self.num_samples = np.sum([d.num_samples for d in dataset_builders])
-                self.indices=np.zeros((self.num_samples, 2), dtype=np.uint64)
-                self.desc="ConcatDataset:"
-                m = 0
+                self.indices = np.zeros((self.num_samples, 2), dtype=np.uint64)
+                self.desc = "ConcatDataset:"
+                # m = 0
                 num_samples_list = np.array([d.num_samples for d in dataset_builders])
                 self.num_samples = np.sum(num_samples_list)
+
                 def _build_indices():
                     start_time = time.time()
                     dataset_index = np.zeros(self.num_samples, dtype=np.int64)
                     dataset_sample_index = np.zeros(self.num_samples, dtype=np.int64)
-                    helpers.build_concat_indices(dataset_index, dataset_sample_index,
-                                                 num_samples_list, 
-                                                 self.num_datasets, 
-                                                 torch.distributed.get_rank()==0)
-                    print_rank_0('> elapsed time for building concat dataset indices: '
-                                 '{:.2f} (sec)'.format(time.time() - start_time))
+                    helpers.build_concat_indices(
+                        dataset_index,
+                        dataset_sample_index,
+                        num_samples_list,
+                        self.num_datasets,
+                        torch.distributed.get_rank() == 0,
+                    )
+                    log.debug(
+                        "> elapsed time for building concat dataset indices: "
+                        "{:.2f} (sec)".format(time.time() - start_time)
+                    )
                     return dataset_index, dataset_sample_index
 
                 self.dataset_index, self.dataset_sample_index = _build_indices()
@@ -122,7 +167,12 @@ def _build_indices():
                 for i in range(self.num_datasets):
                     self.desc += dataset_builders[i].prefix + ","
 
-                self.desc += f"-{self.num_samples}" + f"-{dataset_builders[0].seq_length}" + f"{dataset_builders[0].seed}"
+                self.desc += (
+                    f"-{self.num_samples}"
+                    + f"-{dataset_builders[0].seq_length}"
+                    + f"{dataset_builders[0].seed}"
+                )
+
             def __len__(self):
                 return self.num_samples
 
@@ -135,227 +185,340 @@ def __getitem__(self, idx):
                     return self.dataset_builders[i].dataset[j]
                 else:
                     return self.dataset_builders[i].Build()[j]
-            
 
-        # Predetermine whether need to build the specific dataset or not. 
+        # Predetermine whether need to build the specific dataset or not.
         start_time = time.time()
-        print_rank_0(" >>> Started building datasets in distributed way ... ")
+        log.debug(" >>> Started building datasets in distributed way ... ")
 
         a, b, c = [int(d) for d in splits_string.split(",")]
-        
+
         train_datasets = []
         valid_datasets = []
         test_datasets = []
         # Build individual datasets.
 
         @dlp.log
-        def build_corpus_datasets(dataset_type='train'):
+        def build_corpus_datasets(dataset_type="train"):
             start_time = time.time()
-            print_rank_0(f" >>> Building {dataset_type} corpus datasets ...")
+            log.debug(f" >>> Building {dataset_type} corpus datasets ...")
             datasets = []
             corpus_builders = {}
             corpus_weights = {}
             for c in corpus_list:
                 corpus_builders[c] = []
                 corpus_weights[c] = 0.0
-            dataset_builders = [DatasetBuilder(prefixes[i], corpuses[i], data_impl, splits_string,
-                                               datasets_train_valid_test_num_samples[i],
-                                               seq_length, seed, skip_warmup,
-                                               return_doc_ids,data_cache_path, dataset_type) for i in  range(len(weights))]
-            for i in range(torch.distributed.get_rank()//mpu.get_tensor_model_parallel_world_size(), len(weights),  torch.distributed.get_world_size()//mpu.get_tensor_model_parallel_world_size()):
+            dataset_builders = [
+                DatasetBuilder(
+                    prefixes[i],
+                    corpuses[i],
+                    data_impl,
+                    splits_string,
+                    datasets_train_valid_test_num_samples[i],
+                    seq_length,
+                    seed,
+                    skip_warmup,
+                    return_doc_ids,
+                    data_cache_path,
+                    dataset_type,
+                )
+                for i in range(len(weights))
+            ]
+            for i in range(
+                torch.distributed.get_rank()
+                // mpu.get_tensor_model_parallel_world_size(),
+                len(weights),
+                torch.distributed.get_world_size()
+                // mpu.get_tensor_model_parallel_world_size(),
+            ):
                 dataset_builders[i].Build()
-            print_rank_0(f" >>> Finished building individual datasets in {time.time() - start_time} seconds")
+            log.debug(
+                f" >>> Finished building individual datasets in {time.time() - start_time} seconds"
+            )
             start_concating_time = time.time()
             for i, d in zip(range(len(weights)), dataset_builders):
                 corpus_builders[d.corpus].append(d)
                 corpus_weights[d.corpus] += weights[i]
             total = 0
-            print_rank_0(" > number of samples for each corpus ")
-            corpus_weights_achieved={}
+            log.debug(" > number of samples for each corpus ")
+            corpus_weights_achieved = {}
             for c in corpus_list:
                 datasets.append(BuildConcatDataset(corpus_builders[c]))
                 total += datasets[-1].num_samples
-                corpus_weights_achieved[c] =  float(datasets[-1].num_samples)/train_num_samples                
-                print_rank_0(f"    {c}: {datasets[-1].num_samples} w={corpus_weights_achieved[c]} (expected: {corpus_weights[c]})")
-            
-            print_rank_0(f" > total number of samples: {total}")
-            print_rank_0(f" >>> Finished concatenating datasets in {time.time() - start_concating_time} seconds")
-            print_rank_0(f" >>> Finished building {dataset_type} corpus datasets in {time.time() - start_time} seconds")
+                corpus_weights_achieved[c] = (
+                    float(datasets[-1].num_samples) / train_num_samples
+                )
+                log.debug(
+                    f"    {c}: {datasets[-1].num_samples} w={corpus_weights_achieved[c]} (expected: {corpus_weights[c]})"
+                )
+
+            log.debug(f" > total number of samples: {total}")
+            log.debug(
+                f" >>> Finished concatenating datasets in {time.time() - start_concating_time} seconds"
+            )
+            log.debug(
+                f" >>> Finished building {dataset_type} corpus datasets in {time.time() - start_time} seconds"
+            )
             return datasets, [corpus_weights_achieved[c] for c in corpus_list]
 
+        train_weights = None
         if a > 0:
-            train_datasets, train_weights = build_corpus_datasets('train')
-
+            train_datasets, train_weights = build_corpus_datasets("train")
+        valid_weights = None
         if b > 0:
-            valid_datasets, valid_weights = build_corpus_datasets('valid')
-            
-        if c > 0:            
-            test_datasets, test_weights = build_corpus_datasets('test')
+            valid_datasets, valid_weights = build_corpus_datasets("valid")
+        test_weights = None
+        if c > 0:
+            test_datasets, test_weights = build_corpus_datasets("test")
 
         # This barrier is critical to make sure that all the datasets are built once
         # and the metadata were written to the cache folder before other ranks touch them
-        print_rank_0(f" >>> Rank 0 - finished building datasets in {time.time() - start_time} seconds")
+        log.debug(
+            f" >>> Rank 0 - finished building datasets in {time.time() - start_time} seconds"
+        )
         torch.distributed.barrier(group=mpu.get_data_parallel_group())
         torch.distributed.barrier(group=mpu.get_pipeline_model_parallel_group())
         torch.distributed.barrier(group=mpu.get_data_parallel_group())
-        print_rank_0(f" >>> Finished building datasets (all ranks) in distributed way in {time.time() - start_time} seconds")
-        print_rank_0(f" >>> Starting to build BlendableDataset")
+        log.debug(
+            f" >>> Finished building datasets (all ranks) in distributed way in {time.time() - start_time} seconds"
+        )
+        log.debug(" >>> Starting to build BlendableDataset")
         # Blend.
         start_time = time.time()
         blending_train_dataset = None
-        if train_datasets:
-            blending_train_dataset = BlendableDataset(train_datasets, train_weights, train_num_samples,
-                                                      data_cache_path=data_cache_path)
+        if train_datasets and train_weights:
+            blending_train_dataset = BlendableDataset(
+                train_datasets,
+                train_weights,
+                train_num_samples,
+                data_cache_path=data_cache_path,
+            )
         blending_valid_dataset = None
-        if valid_datasets:
-            blending_valid_dataset = BlendableDataset(valid_datasets, valid_weights, valid_num_samples,
-                                                      data_cache_path=data_cache_path)
+        if valid_datasets and valid_weights:
+            blending_valid_dataset = BlendableDataset(
+                valid_datasets,
+                valid_weights,
+                valid_num_samples,
+                data_cache_path=data_cache_path,
+            )
         blending_test_dataset = None
-        if test_datasets:
-            blending_test_dataset = BlendableDataset(test_datasets, test_weights, test_num_samples,
-                                                     data_cache_path=data_cache_path)
+        if test_datasets and test_weights:
+            blending_test_dataset = BlendableDataset(
+                test_datasets,
+                test_weights,
+                test_num_samples,
+                data_cache_path=data_cache_path,
+            )
         end_time = time.time()
-        print_rank_0(f" >>> Finished building BlendableDataset in {end_time - start_time} seconds")
-        return (blending_train_dataset, blending_valid_dataset,
-                blending_test_dataset)
+        log.debug(
+            f" >>> Finished building BlendableDataset in {end_time - start_time} seconds"
+        )
+        return (blending_train_dataset, blending_valid_dataset, blending_test_dataset)
 
     else:
-        print_rank_0("Separate data paths provided for train, valid & test. Split string will be ignored.")
+        log.debug(
+            "Separate data paths provided for train, valid & test. Split string will be ignored."
+        )
 
         train_dataset, valid_dataset, test_dataset = None, None, None
         # Single dataset.
         if train_data_prefix is not None:
-            train_dataset = build_dataset("train", train_data_prefix, data_impl,
-                                          splits_string,
-                                          train_valid_test_num_samples[0],
-                                          seq_length, seed, skip_warmup,
-                                          data_cache_path=data_cache_path)
+            train_dataset = build_dataset(
+                "train",
+                train_data_prefix,
+                data_impl,
+                splits_string,
+                train_valid_test_num_samples[0],
+                seq_length,
+                seed,
+                skip_warmup,
+                data_cache_path=data_cache_path,
+            )
 
         if valid_data_prefix is not None:
-            valid_dataset = build_dataset("valid", valid_data_prefix, data_impl,
-                                          splits_string,
-                                          train_valid_test_num_samples[1],
-                                          seq_length, seed, False,
-                                          data_cache_path=data_cache_path)
-
+            valid_dataset = build_dataset(
+                "valid",
+                valid_data_prefix,
+                data_impl,
+                splits_string,
+                train_valid_test_num_samples[1],
+                seq_length,
+                seed,
+                False,
+                data_cache_path=data_cache_path,
+            )
 
         if test_data_prefix is not None:
-            test_dataset = build_dataset("test", test_data_prefix, data_impl,
-                                         splits_string,
-                                         train_valid_test_num_samples[2],
-                                         seq_length, seed, False,
-                                         data_cache_path=data_cache_path)
+            test_dataset = build_dataset(
+                "test",
+                test_data_prefix,
+                data_impl,
+                splits_string,
+                train_valid_test_num_samples[2],
+                seq_length,
+                seed,
+                False,
+                data_cache_path=data_cache_path,
+            )
 
         return (train_dataset, valid_dataset, test_dataset)
 
+
 @dlp.log
-def _build_train_valid_test_datasets(data_prefix, data_impl, splits_string,
-                                     train_valid_test_num_samples,
-                                     seq_length, seed, skip_warmup,
-                                     return_doc_ids=False, *,
-                                     data_cache_path=None):
+def _build_train_valid_test_datasets(
+    data_prefix,
+    data_impl,
+    splits_string,
+    train_valid_test_num_samples,
+    seq_length,
+    seed,
+    skip_warmup,
+    return_doc_ids=False,
+    *,
+    data_cache_path=None,
+):
     """Build train, valid, and test datasets."""
 
     # Indexed dataset.
-    indexed_dataset = get_indexed_dataset_(data_prefix,
-                                           data_impl,
-                                           skip_warmup)
+    indexed_dataset = get_indexed_dataset_(data_prefix, data_impl, skip_warmup)
 
     total_num_of_documents = indexed_dataset.sizes.shape[0]
     splits = get_train_valid_test_split_(splits_string, total_num_of_documents)
 
     # Print stats about the splits.
-    print_rank_0(' > dataset split:')
+    log.debug(" > dataset split:")
 
     def print_split_stats(name, index):
-        print_rank_0('    {}:'.format(name))
-        print_rank_0('     document indices in [{}, {}) total of {} '
-                     'documents'.format(splits[index], splits[index + 1],
-                                        splits[index + 1] - splits[index]))
-    print_split_stats('train', 0)
-    print_split_stats('validation', 1)
-    print_split_stats('test', 2)
+        log.debug("    {}:".format(name))
+        log.debug(
+            "     document indices in [{}, {}) total of {} " "documents".format(
+                splits[index], splits[index + 1], splits[index + 1] - splits[index]
+            )
+        )
+
+    print_split_stats("train", 0)
+    print_split_stats("validation", 1)
+    print_split_stats("test", 2)
 
     def build_dataset(index, name):
         dataset = None
         if splits[index + 1] > splits[index]:
-            documents = np.arange(start=splits[index], stop=splits[index + 1],
-                                  step=1, dtype=np.int32)
-            dataset = GPTDataset(name, data_prefix, documents, indexed_dataset,
-                                 splits_string,
-                                 train_valid_test_num_samples[index],
-                                 seq_length, seed,
-                                 return_doc_ids,
-                                 data_cache_path=data_cache_path)
+            documents = np.arange(
+                start=splits[index], stop=splits[index + 1], step=1, dtype=np.int32
+            )
+            dataset = GPTDataset(
+                name,
+                data_prefix,
+                documents,
+                indexed_dataset,
+                splits_string,
+                train_valid_test_num_samples[index],
+                seq_length,
+                seed,
+                return_doc_ids,
+                data_cache_path=data_cache_path,
+            )
         return dataset
 
-    train_dataset = build_dataset(0, 'train')
-    valid_dataset = build_dataset(1, 'valid')
-    test_dataset = build_dataset(2, 'test')
+    train_dataset = build_dataset(0, "train")
+    valid_dataset = build_dataset(1, "valid")
+    test_dataset = build_dataset(2, "test")
 
     return (train_dataset, valid_dataset, test_dataset)
 
+
 @dlp.log
-def _build_train_valid_test_datasets_single(data_prefix, data_impl, splits_string,
-                            train_valid_test_num_samples,
-                            seq_length, seed, skip_warmup, name, 
-                            return_doc_ids=False, *,
-                            data_cache_path=None):
+def _build_train_valid_test_datasets_single(
+    data_prefix,
+    data_impl,
+    splits_string,
+    train_valid_test_num_samples,
+    seq_length,
+    seed,
+    skip_warmup,
+    name,
+    return_doc_ids=False,
+    *,
+    data_cache_path=None,
+):
     """Build train, valid, and test datasets."""
 
     # Each rank print out information
-    print_rank_0(f" >> building dataset for {data_prefix}")
+    log.debug(f" >> building dataset for {data_prefix}")
     # Indexed dataset.
-    indexed_dataset = get_indexed_dataset_(data_prefix,
-                                           data_impl,
-                                           skip_warmup)
+    indexed_dataset = get_indexed_dataset_(data_prefix, data_impl, skip_warmup)
 
     total_num_of_documents = indexed_dataset.sizes.shape[0]
     splits = get_train_valid_test_split_(splits_string, total_num_of_documents)
 
     # Print stats about the splits.
-    print_rank_0(' > dataset split:')
+    log.debug(" > dataset split:")
 
     def print_split_stats(name, index):
-        print_rank_0('    {}:'.format(name))
-        print_rank_0('     document indices in [{}, {}) total of {} '
-                     'documents'.format(splits[index], splits[index + 1],
-                                        splits[index + 1] - splits[index]))
-    print_split_stats('train', 0)
-    print_split_stats('validation', 1)
-    print_split_stats('test', 2)
+        log.debug("    {}:".format(name))
+        log.debug(
+            "     document indices in [{}, {}) total of {} " "documents".format(
+                splits[index], splits[index + 1], splits[index + 1] - splits[index]
+            )
+        )
+
+    print_split_stats("train", 0)
+    print_split_stats("validation", 1)
+    print_split_stats("test", 2)
 
     def build_dataset(index, name):
         dataset = None
         if splits[index + 1] > splits[index]:
-            documents = np.arange(start=splits[index], stop=splits[index + 1],
-                                  step=1, dtype=np.int32)
-            dataset = GPTDataset(name, data_prefix, documents, indexed_dataset,
-                                 splits_string,
-                                 train_valid_test_num_samples[index],
-                                 seq_length, seed,
-                                 return_doc_ids,
-                                 data_cache_path=data_cache_path)
+            documents = np.arange(
+                start=splits[index], stop=splits[index + 1], step=1, dtype=np.int32
+            )
+            dataset = GPTDataset(
+                name,
+                data_prefix,
+                documents,
+                indexed_dataset,
+                splits_string,
+                train_valid_test_num_samples[index],
+                seq_length,
+                seed,
+                return_doc_ids,
+                data_cache_path=data_cache_path,
+            )
         return dataset
-    if name.find("train")!=-1:
-        return build_dataset(0, 'train')
-    if name.find("valid")!=-1:
-        return build_dataset(1, 'valid')
-    if name.find("test")!=-1:
-        return build_dataset(2, 'test')
+
+    if name.find("train") != -1:
+        return build_dataset(0, "train")
+    if name.find("valid") != -1:
+        return build_dataset(1, "valid")
+    if name.find("test") != -1:
+        return build_dataset(2, "test")
+
 
 @dlp.log
-def build_dataset(dataset_name, data_prefix, data_impl,
-                  splits_string, num_samples,
-                  seq_length, seed, skip_warmup,
-                  *,
-                  data_cache_path=None):
+def build_dataset(
+    dataset_name,
+    data_prefix,
+    data_impl,
+    splits_string,
+    num_samples,
+    seq_length,
+    seed,
+    skip_warmup,
+    *,
+    data_cache_path=None,
+):
     dataset = None
     if len(data_prefix) == 1:
-        dataset = _build_dataset(dataset_name, data_prefix[0], data_impl,
-                                 splits_string, num_samples, seq_length,
-                                 seed, skip_warmup,
-                                 data_cache_path=data_cache_path)
+        dataset = _build_dataset(
+            dataset_name,
+            data_prefix[0],
+            data_impl,
+            splits_string,
+            num_samples,
+            seq_length,
+            seed,
+            skip_warmup,
+            data_cache_path=data_cache_path,
+        )
     else:
         # Blending dataset.
         # Parse the values.
@@ -366,73 +529,108 @@ def build_dataset(dataset_name, data_prefix, data_impl,
         # Build individual datasets.
         datasets = []
         for i in range(len(prefixes)):
-            ds = _build_dataset(dataset_name, prefixes[i], data_impl,
-                                splits_string, dataset_num_samples[i],
-                                seq_length, seed, skip_warmup,
-                                data_cache_path=data_cache_path)
+            ds = _build_dataset(
+                dataset_name,
+                prefixes[i],
+                data_impl,
+                splits_string,
+                dataset_num_samples[i],
+                seq_length,
+                seed,
+                skip_warmup,
+                data_cache_path=data_cache_path,
+            )
             if ds:
                 datasets.append(ds)
 
         if datasets:
-            dataset = BlendableDataset(datasets, weights, num_samples,
-                                       data_cache_path=data_cache_path)
+            dataset = BlendableDataset(
+                datasets, weights, num_samples, data_cache_path=data_cache_path
+            )
 
     return dataset
 
+
 @dlp.log
-def _build_dataset(dataset_name, data_prefix, data_impl, splits_string,
-                   num_samples, seq_length, seed, skip_warmup,
-                   *,
-                   data_cache_path=None):
+def _build_dataset(
+    dataset_name,
+    data_prefix,
+    data_impl,
+    splits_string,
+    num_samples,
+    seq_length,
+    seed,
+    skip_warmup,
+    *,
+    data_cache_path=None,
+):
     """
     Build dataset. This method is called when individual
     train, valid, test datasets are provided
     """
 
     # Indexed dataset.
-    indexed_dataset = get_indexed_dataset_(data_prefix,
-                                           data_impl,
-                                           skip_warmup)
+    indexed_dataset = get_indexed_dataset_(data_prefix, data_impl, skip_warmup)
 
     total_num_of_documents = indexed_dataset.sizes.shape[0]
 
-    print_rank_0('    {}:'.format(dataset_name))
-    print_rank_0('     document indices in [0, {}) total of {} '
-                 'documents'.format(total_num_of_documents, total_num_of_documents))
-
-    documents = np.arange(start=0, stop=total_num_of_documents,
-                        step=1, dtype=np.int32)
-
-    dataset = GPTDataset(dataset_name, data_prefix, documents, indexed_dataset,
-                         splits_string, num_samples, seq_length, seed,
-                         data_cache_path=data_cache_path)
+    log.debug("    {}:".format(dataset_name))
+    log.debug(
+        "     document indices in [0, {}) total of {} " "documents".format(
+            total_num_of_documents, total_num_of_documents
+        )
+    )
+
+    documents = np.arange(start=0, stop=total_num_of_documents, step=1, dtype=np.int32)
+
+    dataset = GPTDataset(
+        dataset_name,
+        data_prefix,
+        documents,
+        indexed_dataset,
+        splits_string,
+        num_samples,
+        seq_length,
+        seed,
+        data_cache_path=data_cache_path,
+    )
 
     return dataset
 
+
 @dlp.log
 def get_indexed_dataset_(data_prefix, data_impl, skip_warmup):
     """Build indexed dataset."""
-    print_rank_0(' > building dataset index ...')
+    log.debug(" > building dataset index ...")
 
     start_time = time.time()
-    indexed_dataset = make_indexed_dataset(data_prefix,
-                                           data_impl,
-                                           skip_warmup)
-    print_rank_0(' > finished creating indexed dataset in {:4f} '
-                 'seconds'.format(time.time() - start_time))
-    print_rank_0('    number of documents: {}'.format(
-        indexed_dataset.sizes.shape[0]))
+    indexed_dataset = make_indexed_dataset(data_prefix, data_impl, skip_warmup)
+    log.debug(
+        " > finished creating indexed dataset in {:4f} " "seconds".format(
+            time.time() - start_time
+        )
+    )
+    log.debug("    number of documents: {}".format(indexed_dataset.sizes.shape[0]))
 
     return indexed_dataset
 
 
 class GPTDataset(torch.utils.data.Dataset):
     @dlp.log
-    def __init__(self, name, data_prefix, documents, indexed_dataset,
-                 splits_string, num_samples, seq_length, seed,
-                 return_doc_ids=False, *,
-                 data_cache_path=None):
-
+    def __init__(
+        self,
+        name,
+        data_prefix,
+        documents,
+        indexed_dataset,
+        splits_string,
+        num_samples,
+        seq_length,
+        seed,
+        return_doc_ids=False,
+        *,
+        data_cache_path=None,
+    ):
         self.name = name
         self.indexed_dataset = indexed_dataset
         self.return_doc_ids = return_doc_ids
@@ -442,20 +640,29 @@ def __init__(self, name, data_prefix, documents, indexed_dataset,
         assert np.max(documents) < indexed_dataset.sizes.shape[0]
 
         # Build index mappings.
-        self.doc_idx, self.sample_idx, self.shuffle_idx, self.desc, self.desc_hash = \
-            _build_index_mappings(self.name, data_prefix,
-                                  documents, self.indexed_dataset.sizes,
-                                  splits_string, num_samples, seq_length, seed,
-                                  data_cache_path=data_cache_path)
-
+        self.doc_idx, self.sample_idx, self.shuffle_idx, self.desc, self.desc_hash = (
+            _build_index_mappings(
+                self.name,
+                data_prefix,
+                documents,
+                self.indexed_dataset.sizes,
+                splits_string,
+                num_samples,
+                seq_length,
+                seed,
+                data_cache_path=data_cache_path,
+            )
+        )
 
     def __len__(self):
         # -1 is due to data structure used to retieve the index:
         #    sample i --> [sample_idx[i], sample_idx[i+1])
         return self.sample_idx.shape[0] - 1
+
     @dlp.log
     def __getitem__(self, idx):
         args = get_args()
+        assert args is not None
         orig_idx = idx
         # Get the shuffled index.
         try:
@@ -464,21 +671,24 @@ def __getitem__(self, idx):
             if is_rank_0():
                 import json
                 from rich import print_json
+
                 print(exc)
                 print(
-                    '\n'.join(
-                        ['-------------------------------------------------',
-                         f'Trying to access {idx=} from self.shuffle_idx,',
-                         f'but {len(self.shuffle_idx)=}',
-                         '-------------------------------------------------']
+                    "\n".join(
+                        [
+                            "-------------------------------------------------",
+                            f"Trying to access {idx=} from self.shuffle_idx,",
+                            f"but {len(self.shuffle_idx)=}",
+                            "-------------------------------------------------",
+                        ]
                     )
                 )
                 print_json(
                     json.dumps(
                         {
-                            'doc_idx': len(self.doc_idx),
-                            'sample_idx': len(self.sample_idx),
-                            'shuffle_idx': len(self.shuffle_idx),
+                            "doc_idx": len(self.doc_idx),
+                            "sample_idx": len(self.sample_idx),
+                            "shuffle_idx": len(self.shuffle_idx),
                         },
                         indent=4,
                     )
@@ -492,45 +702,57 @@ def __getitem__(self, idx):
         doc_ids = []
         if doc_index_f == doc_index_l:
             doc_ids.append(self.doc_idx[doc_index_f])
-            sample = self.indexed_dataset.get(self.doc_idx[doc_index_f],
-                                              offset=offset_f,
-                                              length=offset_l - offset_f + 1)
+            sample = self.indexed_dataset.get(
+                self.doc_idx[doc_index_f],
+                offset=offset_f,
+                length=offset_l - offset_f + 1,
+            )
         else:
             # Otherwise, get the rest of the initial document.
             doc_ids.append(self.doc_idx[doc_index_f])
-            sample_list = [self.indexed_dataset.get(self.doc_idx[doc_index_f],
-                                                    offset=offset_f)]
+            sample_list = [
+                self.indexed_dataset.get(self.doc_idx[doc_index_f], offset=offset_f)
+            ]
             # Loop over all in between documents and add the entire document.
             for i in range(doc_index_f + 1, doc_index_l):
                 doc_ids.append(self.doc_idx[i])
                 sample_list.append(self.indexed_dataset.get(self.doc_idx[i]))
             # And finally add the relevant portion of last document.
             doc_ids.append(self.doc_idx[doc_index_l])
-            sample_list.append(self.indexed_dataset.get(
-                self.doc_idx[doc_index_l],
-                length=offset_l + 1))
+            sample_list.append(
+                self.indexed_dataset.get(self.doc_idx[doc_index_l], length=offset_l + 1)
+            )
             sample = np.concatenate(sample_list)
 
-        text_name = 'text'
+        text_name = "text"
         if args.use_dataset_only:
-            text_name = 'input_ids'
+            text_name = "input_ids"
         sample_dict = {text_name: np.array(sample, dtype=np.int64)}
         if args.return_data_index:
-            sample_dict.update({'index': np.array([orig_idx], dtype=np.int64)})
+            sample_dict.update({"index": np.array([orig_idx], dtype=np.int64)})
 
-        if self.return_doc_ids: # for retro preprocessing
-            sample_dict.update({'doc_ids': np.array(doc_ids, dtype=np.int64)})
+        if self.return_doc_ids:  # for retro preprocessing
+            sample_dict.update({"doc_ids": np.array(doc_ids, dtype=np.int64)})
 
         if args.use_dataset_only:
-            sample_dict.update({'labels': np.array(sample, dtype=np.int64)})
+            sample_dict.update({"labels": np.array(sample, dtype=np.int64)})
 
         return sample_dict
 
+
 @dlp.log
-def _build_index_mappings(name, data_prefix, documents, sizes,
-                          splits_string, num_samples, seq_length, seed,
-                          *,
-                          data_cache_path):
+def _build_index_mappings(
+    name,
+    data_prefix,
+    documents,
+    sizes,
+    splits_string,
+    num_samples,
+    seq_length,
+    seed,
+    *,
+    data_cache_path,
+):
     """Build doc-idx, sample-idx, and shuffle-idx.
     doc-idx: is an array (ordered) of documents to be used in training.
     sample-idx: is the start document index and document offset for each
@@ -538,10 +760,11 @@ def _build_index_mappings(name, data_prefix, documents, sizes,
     shuffle-idx: maps the sample index into a random index into sample-idx.
     """
     args = get_args()
+    assert args is not None
     # Number of tokens in each epoch and number of required epochs.
     tokens_per_epoch = _num_tokens(documents, sizes)
     num_epochs = _num_epochs(tokens_per_epoch, seq_length, num_samples)
-    if args.train_data_exact_num_epochs is not None and name == 'train':
+    if args.train_data_exact_num_epochs is not None and name == "train":
         num_epochs = args.train_data_exact_num_epochs
 
     # rng state
@@ -556,13 +779,13 @@ def _build_index_mappings(name, data_prefix, documents, sizes,
     desc += f"Sequence length {seq_length}\n"
     desc += f"Random seed {seed}\n"
     desc += f"Split {splits_string}\n"
-    desc_hash = hashlib.md5(desc.encode('utf-8')).hexdigest()
+    desc_hash = hashlib.md5(desc.encode("utf-8")).hexdigest()
     desc_filename = desc_hash + ".dsc"
-    doc_idx_filename = desc_hash + '_doc_idx.npy'
-    sample_idx_filename = desc_hash + '_sample_idx.npy'
-    shuffle_idx_filename = desc_hash + '_shuffle_idx.npy'
+    doc_idx_filename = desc_hash + "_doc_idx.npy"
+    sample_idx_filename = desc_hash + "_sample_idx.npy"
+    shuffle_idx_filename = desc_hash + "_shuffle_idx.npy"
 
-    if name == 'train':
+    if name == "train":
         # force to use certain index files
         if args.train_desc_path is not None:
             desc_filename = args.train_desc_path
@@ -577,15 +800,15 @@ def _build_index_mappings(name, data_prefix, documents, sizes,
     # duplication, then look in data-cache-path if specified,
     # If nothing is found, use the last path looked in
     build_indices = True
-    prefixes = [os.path.join(os.path.dirname(data_prefix), 'index-cache')]
+    prefixes = [os.path.join(os.path.dirname(data_prefix), "index-cache")]
     if data_cache_path is not None:
         prefixes.append(data_cache_path)
     for prefix in prefixes:
         idx_path = {
-            'desc': os.path.join(prefix, desc_filename),
-            'doc': os.path.join(prefix, doc_idx_filename),
-            'sample': os.path.join(prefix, sample_idx_filename),
-            'shuffle': os.path.join(prefix, shuffle_idx_filename)
+            "desc": os.path.join(prefix, desc_filename),
+            "doc": os.path.join(prefix, doc_idx_filename),
+            "sample": os.path.join(prefix, sample_idx_filename),
+            "shuffle": os.path.join(prefix, shuffle_idx_filename),
         }
         for f in idx_path.values():
             if not os.path.isfile(f):
@@ -594,15 +817,17 @@ def _build_index_mappings(name, data_prefix, documents, sizes,
             # Found our files!
             build_indices = False
             break
-    data_cache_dir = os.path.dirname(idx_path['desc'])
+    data_cache_dir = os.path.dirname(idx_path["desc"])
     data_cache_success = True
 
     # Build the indexed mapping if not exist.
     if build_indices:
-        # Since this function will be called by all the rank in the very beginning. Therefore, we assume that all the 
-        # ranks will first create the document files, and then read it. 
+        # This function is called by every rank at the very beginning, so we assume that all the
+        # ranks will first create the document files and then read them.
         # There will not be contension effects going on either
-        print_rank_0(f" > WARNING: could not find index map files, building on rank {torch.distributed.get_rank()}")
+        log.warning(
+            f" > WARNING: could not find index map files, building on rank {torch.distributed.get_rank()}"
+        )
 
         # For the last epoch, decide whether include the entire epoch
         # in the global shuffle or not.
@@ -611,64 +836,80 @@ def _build_index_mappings(name, data_prefix, documents, sizes,
         # not mean anything.
         if num_epochs == 1:
             separate_last_epoch = False
-            print_rank_0(' > only one epoch required, setting '
-                  'separate_last_epoch to False')
+            log.debug(
+                " > only one epoch required, setting " "separate_last_epoch to False"
+            )
 
         else:
             # Get the number of samples for the last epoch
             num_samples_from_epochs_minus_one = (
-                (num_epochs - 1) * tokens_per_epoch - 1) // seq_length
-            last_epoch_num_samples = num_samples - \
-                                     num_samples_from_epochs_minus_one
-            assert last_epoch_num_samples >= 0, \
-                'last epoch number of samples should be non-negative.'
+                (num_epochs - 1) * tokens_per_epoch - 1
+            ) // seq_length
+            last_epoch_num_samples = num_samples - num_samples_from_epochs_minus_one
+            assert (
+                last_epoch_num_samples >= 0
+            ), "last epoch number of samples should be non-negative."
             num_samples_per_epoch = (tokens_per_epoch - 1) // seq_length
-            assert last_epoch_num_samples <= (num_samples_per_epoch + 1), \
-                'last epoch number of samples exceeded max value.'
+            assert last_epoch_num_samples <= (
+                num_samples_per_epoch + 1
+            ), "last epoch number of samples exceeded max value."
             # If we have less than 80% of the samples for the last epoch,
             # seperate out the epoch and treat it differently.
             # Note: the 80% number is just based on common sense and can
             # be adjusted if needed.
-            separate_last_epoch = (last_epoch_num_samples <
-                                   int(0.80 * num_samples_per_epoch))
+            separate_last_epoch = last_epoch_num_samples < int(
+                0.80 * num_samples_per_epoch
+            )
             if separate_last_epoch:
-                string = ' > last epoch number of samples ({}) is smaller '\
-                         'than 80% of number of samples per epoch ({}), '\
-                         'setting separate_last_epoch to True'
+                string = (
+                    " > last epoch number of samples ({}) is smaller "
+                    "than 80% of number of samples per epoch ({}), "
+                    "setting separate_last_epoch to True"
+                )
             else:
-                string = ' > last epoch number of samples ({}) is larger '\
-                         'than 80% of number of samples per epoch ({}), '\
-                         'setting separate_last_epoch to False'
-            print_rank_0(string.format(last_epoch_num_samples,
-                                num_samples_per_epoch))
-
+                string = (
+                    " > last epoch number of samples ({}) is larger "
+                    "than 80% of number of samples per epoch ({}), "
+                    "setting separate_last_epoch to False"
+                )
+            log.debug(string.format(last_epoch_num_samples, num_samples_per_epoch))
 
         try:
             os.makedirs(data_cache_dir, exist_ok=True)
 
             # description
-            with open(idx_path['desc'], 'wt') as fd:
+            with open(idx_path["desc"], "wt") as fd:
                 fd.write(desc)
 
             # doc-idx.
             start_time = time.time()
-            doc_idx = _build_doc_idx(documents, num_epochs, np_rng,
-                                     separate_last_epoch)
-            np.save(idx_path['doc'], doc_idx, allow_pickle=True)
-            print_rank_0(' > elasped time to build and save doc-idx mapping '
-                         '(seconds): {:4f}'.format(time.time() - start_time))
+            doc_idx = _build_doc_idx(documents, num_epochs, np_rng, separate_last_epoch)
+            np.save(idx_path["doc"], doc_idx, allow_pickle=True)
+            log.debug(
+                " > elasped time to build and save doc-idx mapping "
+                "(seconds): {:4f}".format(time.time() - start_time)
+            )
             # sample-idx.
             start_time = time.time()
             # Use C++ implementation for speed.
             # First compile and then import.
             from megatron.data import helpers
+
             assert doc_idx.dtype == np.int32
             assert sizes.dtype == np.int32
-            sample_idx = helpers.build_sample_idx(sizes, doc_idx, seq_length,
-                                                  num_epochs, tokens_per_epoch, torch.distributed.get_rank()==0)
-            np.save(idx_path['sample'], sample_idx, allow_pickle=True)
-            print_rank_0(' > elasped time to build and save sample-idx mapping '
-                         '(seconds): {:4f}'.format(time.time() - start_time))
+            sample_idx = helpers.build_sample_idx(
+                sizes,
+                doc_idx,
+                seq_length,
+                num_epochs,
+                tokens_per_epoch,
+                torch.distributed.get_rank() == 0,
+            )
+            np.save(idx_path["sample"], sample_idx, allow_pickle=True)
+            log.debug(
+                " > elasped time to build and save sample-idx mapping "
+                "(seconds): {:4f}".format(time.time() - start_time)
+            )
             # shuffle-idx.
             start_time = time.time()
             # -1 is due to data structure used to retieve the index:
@@ -677,35 +918,46 @@ def _build_index_mappings(name, data_prefix, documents, sizes,
                 num_samples_ = num_samples_from_epochs_minus_one
             else:
                 num_samples_ = sample_idx.shape[0] - 1
-            shuffle_idx = _build_shuffle_idx(num_samples_,
-                                             sample_idx.shape[0] - 1, np_rng)
-            np.save(idx_path['shuffle'], shuffle_idx, allow_pickle=True)
-            print_rank_0(' > elasped time to build and save shuffle-idx mapping'
-                         ' (seconds): {:4f}'.format(time.time() - start_time))
+            shuffle_idx = _build_shuffle_idx(
+                num_samples_, sample_idx.shape[0] - 1, np_rng
+            )
+            np.save(idx_path["shuffle"], shuffle_idx, allow_pickle=True)
+            log.debug(
+                " > elasped time to build and save shuffle-idx mapping"
+                " (seconds): {:4f}".format(time.time() - start_time)
+            )
         except OSError:
-            print(f'There was an error trying to create the data cache directory ({data_cache_dir})')
-            print('or a file in it. This defaults to a directory "index-cache" within the directory')
-            print('the data files are in and can be set with the --data-cache-path argument. Please')
-            print('ensure you have write access to this directory or specify one that you do have')
-            print('write access to.')
+            print(
+                f"There was an error trying to create the data cache directory ({data_cache_dir})"
+            )
+            print(
+                'or a file in it. This defaults to a directory "index-cache" within the directory'
+            )
+            print(
+                "the data files are in and can be set with the --data-cache-path argument. Please"
+            )
+            print(
+                "ensure you have write access to this directory or specify one that you do have"
+            )
+            print("write access to.")
             data_cache_success = False
 
     # Load mappings.
     start_time = time.time()
-    print_rank_0(f" > loading doc-idx mapping from {idx_path['doc']}")
-    doc_idx = np.load(idx_path['doc'], allow_pickle=True, mmap_mode='r')
+    log.debug(f" > loading doc-idx mapping from {idx_path['doc']}")
+    doc_idx = np.load(idx_path["doc"], allow_pickle=True, mmap_mode="r")
 
-    print_rank_0(f" > loading sample-idx mapping from {idx_path['sample']}")
-    sample_idx = np.load(idx_path['sample'], allow_pickle=True, mmap_mode='r')
+    log.debug(f" > loading sample-idx mapping from {idx_path['sample']}")
+    sample_idx = np.load(idx_path["sample"], allow_pickle=True, mmap_mode="r")
 
-    print_rank_0(f" > loading shuffle-idx mapping from {idx_path['shuffle']}")
-    shuffle_idx = np.load(idx_path['shuffle'], allow_pickle=True, mmap_mode='r')
+    log.debug(f" > loading shuffle-idx mapping from {idx_path['shuffle']}")
+    shuffle_idx = np.load(idx_path["shuffle"], allow_pickle=True, mmap_mode="r")
 
-    print_rank_0('    loaded indexed file in {:3.3f} seconds'.format(
-        time.time() - start_time))
-    print_rank_0('    total number of samples: {}'.format(
-        sample_idx.shape[0]))
-    print_rank_0('    total number of epochs: {}'.format(num_epochs))
+    log.debug(
+        "    loaded indexed file in {:3.3f} seconds".format(time.time() - start_time)
+    )
+    log.debug("    total number of samples: {}".format(sample_idx.shape[0]))
+    log.debug("    total number of epochs: {}".format(num_epochs))
 
     return doc_idx, sample_idx, shuffle_idx, desc, desc_hash
 
@@ -729,25 +981,26 @@ def _num_epochs(tokens_per_epoch, seq_length, num_samples):
         if ((total_tokens - 1) // seq_length) >= num_samples:
             return num_epochs
 
+
 @dlp.log
 def _build_doc_idx(documents, num_epochs, np_rng, separate_last_epoch):
     """Build an array with length = number-of-epochs * number-of-dcuments.
     Each index is mapped to a corresponding document."""
     if not separate_last_epoch or num_epochs == 1:
-        doc_idx = np.mgrid[0:num_epochs, 0:len(documents)][1]
+        doc_idx = np.mgrid[0:num_epochs, 0 : len(documents)][1]
         doc_idx[:] = documents
         doc_idx = doc_idx.reshape(-1)
         doc_idx = doc_idx.astype(np.int32)
         np_rng.shuffle(doc_idx)
         return doc_idx
 
-    doc_idx_first = _build_doc_idx(documents, num_epochs-1, np_rng, False)
+    doc_idx_first = _build_doc_idx(documents, num_epochs - 1, np_rng, False)
     doc_idx_last = _build_doc_idx(documents, 1, np_rng, False)
     return np.concatenate((doc_idx_first, doc_idx_last))
 
+
 @dlp.log
-def _build_sample_idx(sizes, doc_idx, seq_length,
-                      num_epochs, tokens_per_epoch):
+def _build_sample_idx(sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch):
     """Sample index mapping is a 2D array with sizes
     [number-of-samples + 1, 2] where [..., 0] contains
     the index into `doc_idx` and [..., 1] is the
@@ -781,7 +1034,7 @@ def _build_sample_idx(sizes, doc_idx, seq_length,
             # Note that -1 here is for the same reason we have -1 in
             # `_num_epochs` calculations.
             if remaining_seq_length <= 0:
-                doc_offset += (remaining_seq_length + doc_length - 1)
+                doc_offset += remaining_seq_length + doc_length - 1
                 remaining_seq_length = 0
             else:
                 # Otherwise, start from the begining of the next document.
@@ -794,24 +1047,28 @@ def _build_sample_idx(sizes, doc_idx, seq_length,
 
     return sample_idx
 
+
 @dlp.log
 def _build_shuffle_idx(num_samples, total_size, np_rng):
     """Build the range [0, size) and shuffle."""
-    print_rank_0(' > building shuffle index with split [0, {}) and [{}, {}) '
-          '...'.format(num_samples, num_samples, total_size))
+    log.debug(
+        " > building shuffle index with split [0, {}) and [{}, {}) " "...".format(
+            num_samples, num_samples, total_size
+        )
+    )
 
     dtype_ = np.uint32
     if total_size >= (np.iinfo(np.uint32).max - 1):
         dtype_ = np.int64
 
-    shuffle_idx_first = np.arange(start=0, stop=num_samples,
-                                  step=1, dtype=dtype_)
+    shuffle_idx_first = np.arange(start=0, stop=num_samples, step=1, dtype=dtype_)
     np_rng.shuffle(shuffle_idx_first)
     if num_samples == total_size:
         return shuffle_idx_first
 
-    shuffle_idx_last = np.arange(start=num_samples, stop=total_size,
-                                 step=1, dtype=dtype_)
+    shuffle_idx_last = np.arange(
+        start=num_samples, stop=total_size, step=1, dtype=dtype_
+    )
     np_rng.shuffle(shuffle_idx_last)
 
     return np.concatenate((shuffle_idx_first, shuffle_idx_last))
diff --git a/megatron/data/indexed_dataset.py b/megatron/data/indexed_dataset.py
index 1eb9b7842bb..8479f33fab3 100644
--- a/megatron/data/indexed_dataset.py
+++ b/megatron/data/indexed_dataset.py
@@ -15,17 +15,24 @@
 
 from functools import lru_cache
 import os
+
+# import logging
 import shutil
 import struct
 from itertools import accumulate
 
 import numpy as np
 import torch
-from megatron import print_rank_0
-from megatron.utils import Profile
+
+# from megatron import print_rank_0
+from megatron.utils import Profile, get_logger
+
+log = get_logger(__name__)
+
 
 dlp = Profile("DATASET")
 
+
 def __best_fitting_dtype(vocab_size=None):
     if vocab_size is not None and vocab_size < 65500:
         return np.uint16
@@ -34,28 +41,32 @@ def __best_fitting_dtype(vocab_size=None):
 
 
 def get_available_dataset_impl():
-    return ['lazy', 'cached', 'mmap']
+    return ["lazy", "cached", "mmap"]
 
 
 def infer_dataset_impl(path):
     if IndexedDataset.exists(path):
-        with open(index_file_path(path), 'rb') as f:
+        with open(index_file_path(path), "rb") as f:
             magic = f.read(8)
             if magic == IndexedDataset._HDR_MAGIC:
-                return 'cached'
+                return "cached"
             elif magic == MMapIndexedDataset.Index._HDR_MAGIC[:8]:
-                return 'mmap'
+                return "mmap"
             else:
                 return None
     else:
         print(f"Dataset does not exist: {path}")
-        print("Path should be a basename that both .idx and .bin can be appended to get full filenames.")
+        print(
+            "Path should be a basename that both .idx and .bin can be appended to get full filenames."
+        )
         return None
 
 
 def make_builder(out_file, impl, vocab_size=None):
-    if impl == 'mmap':
-        return MMapIndexedDatasetBuilder(out_file, dtype=__best_fitting_dtype(vocab_size))
+    if impl == "mmap":
+        return MMapIndexedDatasetBuilder(
+            out_file, dtype=__best_fitting_dtype(vocab_size)
+        )
     else:
         return IndexedDatasetBuilder(out_file)
 
@@ -63,22 +74,24 @@ def make_builder(out_file, impl, vocab_size=None):
 def make_dataset(path, impl, skip_warmup=False):
     if not IndexedDataset.exists(path):
         print(f"Dataset does not exist: {path}")
-        print("Path should be a basename that both .idx and .bin can be appended to get full filenames.")
+        print(
+            "Path should be a basename that both .idx and .bin can be appended to get full filenames."
+        )
         return None
-    if impl == 'infer':
+    if impl == "infer":
         impl = infer_dataset_impl(path)
-    if impl == 'lazy' and IndexedDataset.exists(path):
+    if impl == "lazy" and IndexedDataset.exists(path):
         return IndexedDataset(path)
-    elif impl == 'cached' and IndexedDataset.exists(path):
+    elif impl == "cached" and IndexedDataset.exists(path):
         return IndexedCachedDataset(path)
-    elif impl == 'mmap' and MMapIndexedDataset.exists(path):
+    elif impl == "mmap" and MMapIndexedDataset.exists(path):
         return MMapIndexedDataset(path, skip_warmup)
     print(f"Unknown dataset implementation: {impl}")
     return None
 
 
 def dataset_exists(path, impl):
-    if impl == 'mmap':
+    if impl == "mmap":
         return MMapIndexedDataset.exists(path)
     else:
         return IndexedDataset.exists(path)
@@ -114,11 +127,11 @@ def code(dtype):
 
 
 def index_file_path(prefix_path):
-    return prefix_path + '.idx'
+    return prefix_path + ".idx"
 
 
 def data_file_path(prefix_path):
-    return prefix_path + '.bin'
+    return prefix_path + ".bin"
 
 
 def create_doc_idx(sizes):
@@ -131,38 +144,41 @@ def create_doc_idx(sizes):
 
 class IndexedDataset(torch.utils.data.Dataset):
     """Loader for IndexedDataset"""
-    _HDR_MAGIC = b'TNTIDX\x00\x00'
+
+    _HDR_MAGIC = b"TNTIDX\x00\x00"
 
     def __init__(self, path):
         super().__init__()
         self.path = path
         self.data_file = None
         self.read_index(path)
+
     @dlp.log
     def read_index(self, path):
-        with open(index_file_path(path), 'rb') as f:
+        with open(index_file_path(path), "rb") as f:
             magic = f.read(8)
             assert magic == self._HDR_MAGIC, (
-                'Index file doesn\'t match expected format. '
-                'Make sure that --dataset-impl is configured properly.'
+                "Index file doesn't match expected format. "
+                "Make sure that --dataset-impl is configured properly."
             )
             version = f.read(8)
-            assert struct.unpack('<Q', version) == (1,)
-            code, self.element_size = struct.unpack('<QQ', f.read(16))
+            assert struct.unpack("<Q", version) == (1,)
+            code, self.element_size = struct.unpack("<QQ", f.read(16))
             self.dtype = dtypes[code]
-            self._len, self.s = struct.unpack('<QQ', f.read(16))
-            self.doc_count = struct.unpack('<Q', f.read(8))
+            self._len, self.s = struct.unpack("<QQ", f.read(16))
+            self.doc_count = struct.unpack("<Q", f.read(8))
             self.dim_offsets = read_longs(f, self._len + 1)
             self.data_offsets = read_longs(f, self._len + 1)
             self.sizes = read_longs(f, self.s)
             self.doc_idx = read_longs(f, self.doc_count)
+
     @dlp.log
     def read_data(self, path):
-        self.data_file = open(data_file_path(path), 'rb', buffering=0)
+        self.data_file = open(data_file_path(path), "rb", buffering=0)
 
     def check_index(self, i):
         if i < 0 or i >= self._len:
-            raise IndexError('index out of range')
+            raise IndexError("index out of range")
 
     def __del__(self):
         if self.data_file:
@@ -176,7 +192,7 @@ def __getitem__(self, idx):
         if isinstance(idx, int):
             i = idx
             self.check_index(i)
-            tensor_size = self.sizes[self.dim_offsets[i]:self.dim_offsets[i + 1]]
+            tensor_size = self.sizes[self.dim_offsets[i] : self.dim_offsets[i + 1]]
             a = np.empty(tensor_size, dtype=self.dtype)
             self.data_file.seek(self.data_offsets[i] * self.element_size)
             self.data_file.readinto(a)
@@ -185,7 +201,7 @@ def __getitem__(self, idx):
             start, stop, step = idx.indices(len(self))
             if step != 1:
                 raise ValueError("Slices into indexed_dataset must be contiguous")
-            sizes = self.sizes[self.dim_offsets[start]:self.dim_offsets[stop]]
+            sizes = self.sizes[self.dim_offsets[start] : self.dim_offsets[stop]]
             size = sum(sizes)
             a = np.empty(size, dtype=self.dtype)
             self.data_file.seek(self.data_offsets[start] * self.element_size)
@@ -205,8 +221,8 @@ def size(self, index):
 
     @staticmethod
     def exists(path):
-        return (
-            os.path.exists(index_file_path(path)) and os.path.exists(data_file_path(path))
+        return os.path.exists(index_file_path(path)) and os.path.exists(
+            data_file_path(path)
         )
 
     @property
@@ -215,7 +231,6 @@ def supports_prefetch(self):
 
 
 class IndexedCachedDataset(IndexedDataset):
-
     def __init__(self, path):
         super().__init__(path)
         self.cache = None
@@ -224,6 +239,7 @@ def __init__(self, path):
     @property
     def supports_prefetch(self):
         return True
+
     @dlp.log
     def prefetch(self, indices):
         if all(i in self.cache_index for i in indices):
@@ -240,7 +256,7 @@ def prefetch(self, indices):
         for i in indices:
             self.cache_index[i] = ptx
             size = self.data_offsets[i + 1] - self.data_offsets[i]
-            a = self.cache[ptx: ptx + size]
+            a = self.cache[ptx : ptx + size]
             self.data_file.seek(self.data_offsets[i] * self.element_size)
             self.data_file.readinto(a)
             ptx += size
@@ -255,10 +271,10 @@ def __getitem__(self, idx):
         if isinstance(idx, int):
             i = idx
             self.check_index(i)
-            tensor_size = self.sizes[self.dim_offsets[i]:self.dim_offsets[i + 1]]
+            tensor_size = self.sizes[self.dim_offsets[i] : self.dim_offsets[i + 1]]
             a = np.empty(tensor_size, dtype=self.dtype)
             ptx = self.cache_index[i]
-            np.copyto(a, self.cache[ptx: ptx + a.size])
+            np.copyto(a, self.cache[ptx : ptx + a.size])
             return a
         elif isinstance(idx, slice):
             # Hack just to make this work, can optimizer later if necessary
@@ -278,15 +294,17 @@ class IndexedDatasetBuilder(object):
         np.float32: 4,
         np.float64: 8,
     }
+
     @dlp.log
     def __init__(self, out_file, dtype=np.int32):
-        self.out_file = open(out_file, 'wb')
+        self.out_file = open(out_file, "wb")
         self.dtype = dtype
         self.data_offsets = [0]
         self.dim_offsets = [0]
         self.sizes = []
         self.element_size = self.element_sizes[self.dtype]
         self.doc_idx = [0]
+
     @dlp.log
     def add_item(self, tensor):
         bytes = self.out_file.write(np.array(tensor.numpy(), dtype=self.dtype))
@@ -297,6 +315,7 @@ def add_item(self, tensor):
 
     def end_document(self):
         self.doc_idx.append(len(self.sizes))
+
     @dlp.log
     def merge_file_(self, another_file):
         index = IndexedDataset(another_file)
@@ -315,7 +334,7 @@ def merge_file_(self, another_file):
 
         self.doc_idx.extend((doc_offset + index.doc_idx)[1:])
 
-        with open(data_file_path(another_file), 'rb') as f:
+        with open(data_file_path(another_file), "rb") as f:
             while True:
                 data = f.read(1024)
                 if data:
@@ -325,21 +344,22 @@ def merge_file_(self, another_file):
 
     def finalize(self, index_file):
         self.out_file.close()
-        index = open(index_file, 'wb')
-        index.write(b'TNTIDX\x00\x00')
-        index.write(struct.pack('<Q', 1))
-        index.write(struct.pack('<QQ', code(self.dtype), self.element_size))
-        index.write(struct.pack('<QQ', len(self.data_offsets) - 1, len(self.sizes)))
-        index.write(struct.pack('<Q', len(self.doc_idx)))
+        index = open(index_file, "wb")
+        index.write(b"TNTIDX\x00\x00")
+        index.write(struct.pack("<Q", 1))
+        index.write(struct.pack("<QQ", code(self.dtype), self.element_size))
+        index.write(struct.pack("<QQ", len(self.data_offsets) - 1, len(self.sizes)))
+        index.write(struct.pack("<Q", len(self.doc_idx)))
         write_longs(index, self.dim_offsets)
         write_longs(index, self.data_offsets)
         write_longs(index, self.sizes)
         write_longs(index, self.doc_idx)
         index.close()
 
+
 @dlp.log
 def _warmup_mmap_file(path):
-    with open(path, 'rb') as stream:
+    with open(path, "rb") as stream:
         while stream.read(100 * 1024 * 1024):
             pass
 
@@ -378,17 +398,17 @@ def get_pointers_with_total(sizes, elemsize, dtype):
 
 class MMapIndexedDataset(torch.utils.data.Dataset):
     class Index(object):
-        _HDR_MAGIC = b'MMIDIDX\x00\x00'
+        _HDR_MAGIC = b"MMIDIDX\x00\x00"
 
         @classmethod
         def writer(cls, path, dtype):
             class _Writer(object):
                 def __enter__(self):
-                    self._file = open(path, 'wb')
+                    self._file = open(path, "wb")
 
                     self._file.write(cls._HDR_MAGIC)
-                    self._file.write(struct.pack('<Q', 1))
-                    self._file.write(struct.pack('<B', code(dtype)))
+                    self._file.write(struct.pack("<Q", 1))
+                    self._file.write(struct.pack("<B", code(dtype)))
 
                     return self
 
@@ -401,24 +421,27 @@ def _get_pointers(sizes, npdtype):
                     """
 
                     # compute element sizes in bytes
-                    pointers, _ = get_pointers_with_total(sizes, dtype().itemsize, npdtype)
+                    pointers, _ = get_pointers_with_total(
+                        sizes, dtype().itemsize, npdtype
+                    )
                     return pointers
+
                 @dlp.log
                 def write(self, sizes, doc_idx):
-                    self._file.write(struct.pack('<Q', len(sizes)))
-                    self._file.write(struct.pack('<Q', len(doc_idx)))
+                    self._file.write(struct.pack("<Q", len(sizes)))
+                    self._file.write(struct.pack("<Q", len(doc_idx)))
 
                     sizes32 = np.array(sizes, dtype=np.int32)
-                    self._file.write(sizes32.tobytes(order='C'))
+                    self._file.write(sizes32.tobytes(order="C"))
                     del sizes32
 
                     pointers = self._get_pointers(sizes, np.int64)
                     del sizes
-                    self._file.write(pointers.tobytes(order='C'))
+                    self._file.write(pointers.tobytes(order="C"))
                     del pointers
 
                     doc_idx = np.array(doc_idx, dtype=np.int64)
-                    self._file.write(doc_idx.tobytes(order='C'))
+                    self._file.write(doc_idx.tobytes(order="C"))
 
                 def __exit__(self, exc_type, exc_val, exc_tb):
                     self._file.close()
@@ -427,41 +450,47 @@ def __exit__(self, exc_type, exc_val, exc_tb):
 
         @dlp.log
         def __init__(self, path, skip_warmup=False):
-            with open(path, 'rb') as stream:
+            with open(path, "rb") as stream:
                 magic_test = stream.read(9)
                 assert self._HDR_MAGIC == magic_test, (
-                    'Index file doesn\'t match expected format. '
-                    'Make sure that --dataset-impl is configured properly.'
+                    "Index file doesn't match expected format. "
+                    "Make sure that --dataset-impl is configured properly."
                 )
-                version = struct.unpack('<Q', stream.read(8))
+                version = struct.unpack("<Q", stream.read(8))
                 assert (1,) == version
 
-                dtype_code, = struct.unpack('<B', stream.read(1))
+                (dtype_code,) = struct.unpack("<B", stream.read(1))
                 self._dtype = dtypes[dtype_code]
                 self._dtype_size = self._dtype().itemsize
 
-                self._len = struct.unpack('<Q', stream.read(8))[0]
-                self._doc_count = struct.unpack('<Q', stream.read(8))[0]
+                self._len = struct.unpack("<Q", stream.read(8))[0]
+                self._doc_count = struct.unpack("<Q", stream.read(8))[0]
                 offset = stream.tell()
 
             if not skip_warmup:
-                print_rank_0("    warming up index mmap file...")
+                log.debug("    warming up index mmap file...")
                 _warmup_mmap_file(path)
 
-            self._bin_buffer_mmap = np.memmap(path, mode='r', order='C')
+            self._bin_buffer_mmap = np.memmap(path, mode="r", order="C")
             self._bin_buffer = memoryview(self._bin_buffer_mmap)
-            print_rank_0("    reading sizes...")
+            log.debug("    reading sizes...")
             self._sizes = np.frombuffer(
+                self._bin_buffer, dtype=np.int32, count=self._len, offset=offset
+            )
+            log.debug("    reading pointers...")
+            self._pointers = np.frombuffer(
                 self._bin_buffer,
-                dtype=np.int32,
+                dtype=np.int64,
                 count=self._len,
-                offset=offset)
-            print_rank_0("    reading pointers...")
-            self._pointers = np.frombuffer(self._bin_buffer, dtype=np.int64, count=self._len,
-                                           offset=offset + self._sizes.nbytes)
-            print_rank_0("    reading document index...")
-            self._doc_idx = np.frombuffer(self._bin_buffer, dtype=np.int64, count=self._doc_count,
-                                          offset=offset + self._sizes.nbytes + self._pointers.nbytes)
+                offset=offset + self._sizes.nbytes,
+            )
+            log.debug("    reading document index...")
+            self._doc_idx = np.frombuffer(
+                self._bin_buffer,
+                dtype=np.int64,
+                count=self._doc_count,
+                offset=offset + self._sizes.nbytes + self._pointers.nbytes,
+            )
 
         def __del__(self):
             self._bin_buffer_mmap._mmap.close()
@@ -507,12 +536,14 @@ def _do_init(self, path, skip_warmup):
         self._index = self.Index(index_file_path(self._path), skip_warmup)
 
         if not skip_warmup:
-            print_rank_0("    warming up data mmap file...")
+            log.debug("    warming up data mmap file...")
             _warmup_mmap_file(data_file_path(self._path))
-        print_rank_0("    creating numpy buffer of mmap...")
-        print_rank_0(data_file_path(self._path))
-        self._bin_buffer_mmap = np.memmap(data_file_path(self._path), mode='r', order='C')
-        print_rank_0("    creating memory view of numpy buffer...")
+        log.debug("    creating numpy buffer of mmap...")
+        log.debug(data_file_path(self._path))
+        self._bin_buffer_mmap = np.memmap(
+            data_file_path(self._path), mode="r", order="C"
+        )
+        log.debug("    creating memory view of numpy buffer...")
         self._bin_buffer = memoryview(self._bin_buffer_mmap)
 
     def __del__(self):
@@ -528,8 +559,9 @@ def __len__(self):
     def __getitem__(self, idx):
         if isinstance(idx, (int, np.integer)):
             ptr, size = self._index[idx]
-            np_array = np.frombuffer(self._bin_buffer, dtype=self._index.dtype,
-                                     count=size, offset=ptr)
+            np_array = np.frombuffer(
+                self._bin_buffer, dtype=self._index.dtype, count=size, offset=ptr
+            )
             return np_array
         elif isinstance(idx, slice):
             start, stop, step = idx.indices(len(self))
@@ -539,8 +571,9 @@ def __getitem__(self, idx):
             sizes = self._index._sizes[idx]
             offsets = list(accumulate(sizes))
             total_size = sum(sizes)
-            np_array = np.frombuffer(self._bin_buffer, dtype=self._index.dtype,
-                                     count=total_size, offset=ptr)
+            np_array = np.frombuffer(
+                self._bin_buffer, dtype=self._index.dtype, count=total_size, offset=ptr
+            )
             sents = np.split(np_array, offsets[:-1])
             return sents
         else:
@@ -548,7 +581,7 @@ def __getitem__(self, idx):
 
     @dlp.log
     def get(self, idx, offset=0, length=None):
-        """ Retrieves a single item from the dataset with the option to only
+        """Retrieves a single item from the dataset with the option to only
         return a portion of the item.
 
         get(idx) is the same as [idx] but get() does not support slicing.
@@ -557,8 +590,9 @@ def get(self, idx, offset=0, length=None):
         if length is None:
             length = size - offset
         ptr += offset * np.dtype(self._index.dtype).itemsize
-        np_array = np.frombuffer(self._bin_buffer, dtype=self._index.dtype,
-                                 count=length, offset=ptr)
+        np_array = np.frombuffer(
+            self._bin_buffer, dtype=self._index.dtype, count=length, offset=ptr
+        )
         return np_array
 
     @property
@@ -584,8 +618,8 @@ def supports_prefetch(self):
 
     @staticmethod
     def exists(path):
-        return (
-            os.path.exists(index_file_path(path)) and os.path.exists(data_file_path(path))
+        return os.path.exists(index_file_path(path)) and os.path.exists(
+            data_file_path(path)
         )
 
     @property
@@ -595,24 +629,27 @@ def dtype(self):
 
 class MMapIndexedDatasetBuilder(object):
     def __init__(self, out_file, dtype=np.int64):
-        self._data_file = open(out_file, 'wb')
+        self._data_file = open(out_file, "wb")
         self._dtype = dtype
         self._sizes = []
         self._doc_idx = [0]
+
     @dlp.log
     def add_item(self, tensor):
         np_array = np.array(tensor.numpy(), dtype=self._dtype)
-        self._data_file.write(np_array.tobytes(order='C'))
+        self._data_file.write(np_array.tobytes(order="C"))
         self._sizes.append(np_array.size)
+
     @dlp.log
     def add_doc(self, tensor, sizes):
         np_array = np.array(tensor, dtype=self._dtype)
-        self._data_file.write(np_array.tobytes(order='C'))
+        self._data_file.write(np_array.tobytes(order="C"))
         self._sizes.extend(sizes)
         self._doc_idx.append(len(self._sizes))
 
     def end_document(self):
         self._doc_idx.append(len(self._sizes))
+
     @dlp.log
     def merge_file_(self, another_file):
         # Concatenate index
@@ -624,7 +661,7 @@ def merge_file_(self, another_file):
         self._doc_idx.extend((offset + index.doc_idx)[1:])
 
         # Concatenate data
-        with open(data_file_path(another_file), 'rb') as f:
+        with open(data_file_path(another_file), "rb") as f:
             shutil.copyfileobj(f, self._data_file)
 
     def finalize(self, index_file):

From 2b31b4449ded2d6823316cc2a628fd950e2f2e8b Mon Sep 17 00:00:00 2001
From: Sam Foreman <saforem2@gmail.com>
Date: Sun, 13 Oct 2024 10:16:44 -0500
Subject: [PATCH 12/31] Update `ALCF/requirements/requirements.txt`

---
 ALCF/requirements/requirements.txt | 2 --
 1 file changed, 2 deletions(-)

diff --git a/ALCF/requirements/requirements.txt b/ALCF/requirements/requirements.txt
index 78d1789d2c6..03541ba514a 100644
--- a/ALCF/requirements/requirements.txt
+++ b/ALCF/requirements/requirements.txt
@@ -15,6 +15,4 @@ six
 numpy<2
 schedulefree
 packaging>=20.0
-pydftracer
 wandb
-# git+https://github.com/saforem2/ezpz@main

From 5e9eed0443e5b448e924cd28d5696e82e475a9da Mon Sep 17 00:00:00 2001
From: Sam Foreman <saforem2@gmail.com>
Date: Sun, 13 Oct 2024 10:16:54 -0500
Subject: [PATCH 13/31] Update `megatron/utils.py`

---
 megatron/utils.py | 94 ++++++++++++++++++++++++++++++-----------------
 1 file changed, 60 insertions(+), 34 deletions(-)

diff --git a/megatron/utils.py b/megatron/utils.py
index d00f4cd0efb..3d5eef46723 100644
--- a/megatron/utils.py
+++ b/megatron/utils.py
@@ -4,24 +4,14 @@
 
 import sys
 import os
-import time
 import logging
-from typing import ContextManager, Optional
+from typing import Optional
 
 import torch
 from torch.nn.parallel import DistributedDataParallel as torchDDP
 
 from deepspeed.accelerator import get_accelerator
 
-if get_accelerator().device_name() == "cuda":
-    try:
-        from apex.multi_tensor_apply import multi_tensor_applier
-        import amp_C
-
-        HAS_APEX = True
-    except Exception:
-        HAS_APEX = False
-
 from megatron import get_args, get_adlr_autoresume, get_num_microbatches
 from megatron.core import mpu
 from megatron.core.tensor_parallel import param_is_not_tensor_parallel_duplicate
@@ -30,70 +20,104 @@
 
 import ezpz as ez
 
+ACCELERATOR = get_accelerator()
+assert ACCELERATOR is not None
+
+if ACCELERATOR.device_name() == "cuda":
+    try:
+        from apex.multi_tensor_apply import multi_tensor_applier  # type:ignore
+        import amp_C  # type:ignore
+
+        HAS_APEX = True
+    except Exception:
+        HAS_APEX = False
+
 RANK = ez.get_rank()
 log = logging.getLogger(__name__)
-# log.setLevel("INFO") if RANK == 0 else log.setLevel("CRITICAL")
-
+log.setLevel(os.environ.get("LOG_LEVEL", "INFO" if RANK == 0 else "CRITICAL").upper())
 
 _DLIO_PROFILER_EXIST = True
-_DFTRACER_EXIST=True
+_DFTRACER_EXIST = True
 
 try:
-    import dftracer
+    import dftracer  # type:ignore
 except Exception:
-    _DFTRACER_EXIST=False
+    _DFTRACER_EXIST = False
 
 try:
-    import dlio_profiler
+    import dlio_profiler  # type:ignore
 except Exception:
     _DLIO_PROFILER_EXIST = False
 
 
 if _DFTRACER_EXIST:
-    from dftracer.logger import dftracer as PerfTrace, dft_fn as Profile, DFTRACER_ENABLE as DFTRACER_ENABLE
+    from dftracer.logger import (  # type:ignore
+        dftracer as PerfTrace,
+        dft_fn as Profile,
+        DFTRACER_ENABLE as DFTRACER_ENABLE,
+    )
 elif _DLIO_PROFILER_EXIST:
-    from dlio_profiler.logger import fn_interceptor as Profile
-    from dlio_profiler.logger import dlio_logger as PerfTrace
+    from dlio_profiler.logger import fn_interceptor as Profile  # type:ignore
+    from dlio_profiler.logger import dlio_logger as PerfTrace  # type:ignore
 else:
     from functools import wraps
-    # from contextlib import nullcontext
-    # Profile: ContextManager = nullcontext
-    #
-    # class Profile(nullable_schema)
+
     class Profile(object):
-        def __init__(self,  cat, name=None, epoch=None, step=None, image_idx=None, image_size=None):
-            return 
-        def log(self,  func):
+        def __init__(
+            self, cat, name=None, epoch=None, step=None, image_idx=None, image_size=None
+        ):
+            return
+
+        def log(self, func):
             return func
-        def log_init(self,  func):
+
+        def log_init(self, func):
             return func
-        def iter(self,  func, iter_name="step"):
+
+        def iter(self, func, iter_name="step"):
             return func
+
         def __enter__(self):
             return
+
         def __exit__(self, type, value, traceback):
             return
-        def update(self, epoch=None, step=None, image_idx=None, image_size=None, args={}):
+
+        def update(
+            self, epoch=None, step=None, image_idx=None, image_size=None, args={}
+        ):
             return
+
         def flush(self):
             return
+
         def reset(self):
             return
+
         def log_static(self, func):
             return
+
     class dftracer(object):
-        def __init__(self,):
+        def __init__(
+            self,
+        ):
             self.type = None
+
         def initialize_log(self, logfile=None, data_dir=None, process_id=-1):
             return
+
         def get_time(self):
             return
+
         def enter_event(self):
             return
+
         def exit_event(self):
             return
+
         def log_event(self, name, cat, start_time, duration, string_args=None):
             return
+
         def finalize(self):
             return
 
@@ -103,8 +127,8 @@ def finalize(self):
 
 def get_logger(
     name: str,
-    level: str = "INFO",
-    rank_zero_only: Optional[bool] = None,
+    level: Optional[str] = None,
+    rank_zero_only: Optional[bool] = True,
 ) -> logging.Logger:
     """Returns a `logging.Logger` object.
 
@@ -112,7 +136,9 @@ def get_logger(
     non-zero ranks (and will be set to `level` on RANK==0).
     """
     logger = logging.getLogger(name)
-    logger.setLevel(level)
+    logger.setLevel(
+        str(level if level is not None else os.environ.get("LOG_LEVEL", "INFO")).upper()
+    )
     if rank_zero_only and ez.get_rank() != 0:
         logger.setLevel("CRITICAL")
     return logger

From 3dcb2974da465b3d3a061215694464787737108b Mon Sep 17 00:00:00 2001
From: Huihuo Zheng <zhenghh04@gmail.com>
Date: Sun, 13 Oct 2024 22:36:10 -0500
Subject: [PATCH 14/31] fixed shuffle_index bug and added --shuffle-sample option

---
 megatron/arguments.py        |  2 ++
 megatron/data/gpt_dataset.py | 10 ++++++----
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/megatron/arguments.py b/megatron/arguments.py
index 9a5e4b8da7e..82e000923fc 100644
--- a/megatron/arguments.py
+++ b/megatron/arguments.py
@@ -1252,6 +1252,8 @@ def _add_data_args(parser):
     group.add_argument('--data-file-list', type=str, default=None,
                        help='The file with the list of dataset and weights')
     
+    group.add_argument('--shuffle-sample', action='stored_true', help="Whether to shuffle the samples within in the dataset files")
+
     group.add_argument('--split', type=str, default='969, 30, 1',
                        help='Comma-separated list of proportions for training,'
                        ' validation, and test split. For example the split '
diff --git a/megatron/data/gpt_dataset.py b/megatron/data/gpt_dataset.py
index c801a6a5ae5..8c32be7d8e3 100755
--- a/megatron/data/gpt_dataset.py
+++ b/megatron/data/gpt_dataset.py
@@ -94,7 +94,7 @@ def Build(self):
 
         class BuildConcatDataset(torch.utils.data.Dataset):
             @dlp.log
-            def __init__(self, dataset_builders):
+            def __init__(self, dataset_builders, shuffle=False):
                 self.dataset_builders = dataset_builders
                 self.num_datasets = len(dataset_builders)
                 self.num_samples = np.sum([d.num_samples for d in dataset_builders])
@@ -117,7 +117,9 @@ def _build_indices():
 
                 self.dataset_index, self.dataset_sample_index = _build_indices()
                 np_rng = np.random.RandomState(seed=dataset_builders[0].seed)
-                self.shuffle_index=np_rng.shuffle(range(self.num_samples))
+                self.shuffle_index = np.arange(self.num_samples)
+                if shuffle:
+                    np_rng.shuffle(self.shuffle_index)
                 for i in range(self.num_datasets):
                     self.desc += dataset_builders[i].prefix + ","
 
@@ -146,7 +148,7 @@ def __getitem__(self, idx):
         valid_datasets = []
         test_datasets = []
         # Build individual datasets.
-
+        args = get_args()
         @dlp.log
         def build_corpus_datasets(dataset_type='train'):
             start_time = time.time()
@@ -172,7 +174,7 @@ def build_corpus_datasets(dataset_type='train'):
             print_rank_0(" > number of samples for each corpus ")
             corpus_weights_achieved={}
             for c in corpus_list:
-                datasets.append(BuildConcatDataset(corpus_builders[c]))
+                datasets.append(BuildConcatDataset(corpus_builders[c], args.shuffle_sample))
                 total += datasets[-1].num_samples
                 corpus_weights_achieved[c] =  float(datasets[-1].num_samples)/train_num_samples                
                 print_rank_0(f"    {c}: {datasets[-1].num_samples} w={corpus_weights_achieved[c]} (expected: {corpus_weights[c]})")

From 43fc2feaace3781bb399e32b2f6e827920622c61 Mon Sep 17 00:00:00 2001
From: Huihuo Zheng <zhenghh04@gmail.com>
Date: Sun, 13 Oct 2024 23:00:21 -0500
Subject: [PATCH 15/31] fixed typo in --shuffle-sample action ('stored_true' -> 'store_true')

---
 megatron/arguments.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/megatron/arguments.py b/megatron/arguments.py
index 82e000923fc..307e725e515 100644
--- a/megatron/arguments.py
+++ b/megatron/arguments.py
@@ -1252,7 +1252,7 @@ def _add_data_args(parser):
     group.add_argument('--data-file-list', type=str, default=None,
                        help='The file with the list of dataset and weights')
     
-    group.add_argument('--shuffle-sample', action='stored_true', help="Whether to shuffle the samples within in the dataset files")
+    group.add_argument('--shuffle-sample', action='store_true', help="Whether to shuffle the samples within in the dataset files")
 
     group.add_argument('--split', type=str, default='969, 30, 1',
                        help='Comma-separated list of proportions for training,'

From d50239f1c5470fd2b2495affc01773e063942e37 Mon Sep 17 00:00:00 2001
From: Huihuo Zheng <zhenghh04@gmail.com>
Date: Mon, 14 Oct 2024 14:20:56 -0500
Subject: [PATCH 16/31] added support for blending samples across different
 files in the same corpus

---
 megatron/arguments.py        |  4 +++-
 megatron/data/gpt_dataset.py | 36 ++++++++++++++++++++++++++++--------
 2 files changed, 31 insertions(+), 9 deletions(-)

diff --git a/megatron/arguments.py b/megatron/arguments.py
index 9ab3e40953d..9b0e6ccb1ac 100644
--- a/megatron/arguments.py
+++ b/megatron/arguments.py
@@ -1291,7 +1291,9 @@ def _add_data_args(parser):
     group.add_argument('--data-file-list', type=str, default=None,
                        help='The file with the list of dataset and weights')
     
-    group.add_argument('--shuffle-sample', action='store_true', help="Whether to shuffle the samples within in the dataset files")
+    group.add_argument('--shuffle-sample-in-corpus', action='store_true', help="Whether to shuffle the samples within in the dataset files")
+
+    group.add_argument('--blend-sample-in-corpus', action='store_true', help="Whether to blend different files in the same corpus")
 
     group.add_argument('--split', type=str, default='969, 30, 1',
                        help='Comma-separated list of proportions for training,'
diff --git a/megatron/data/gpt_dataset.py b/megatron/data/gpt_dataset.py
index c412d02b31c..38df5562675 100755
--- a/megatron/data/gpt_dataset.py
+++ b/megatron/data/gpt_dataset.py
@@ -131,19 +131,35 @@ def Build(self):
                 self.build = True
                 return self.dataset
 
-        class BuildConcatDataset(torch.utils.data.Dataset):
+        class BuildCorpusDataset(torch.utils.data.Dataset):
             @dlp.log
-            def __init__(self, dataset_builders, shuffle=False):
+            def __init__(self, dataset_builders):
                 self.dataset_builders = dataset_builders
                 self.num_datasets = len(dataset_builders)
                 self.num_samples = np.sum([d.num_samples for d in dataset_builders])
                 self.indices = np.zeros((self.num_samples, 2), dtype=np.uint64)
-                self.desc = "ConcatDataset:"
+                self.desc = "CorpusDataset:"
                 # m = 0
                 num_samples_list = np.array([d.num_samples for d in dataset_builders])
                 self.num_samples = np.sum(num_samples_list)
+                args = get_args()
 
-                def _build_indices():
+                @dlp.log
+                def _build_indices_blended():
+                    """Build blended indices, weighting each file by its share of the corpus's samples."""
+                    start_time = time.time()
+                    dataset_index = np.zeros(self.num_samples, dtype=np.int64)
+                    dataset_sample_index = np.zeros(self.num_samples, dtype=np.int64)
+                    weights = num_samples_list / self.num_samples
+                    helpers.build_blending_indices(
+                        dataset_index, dataset_sample_index, weights,
+                        self.num_datasets, self.num_samples, torch.distributed.get_rank() == 0)
+                    # f-string: as a plain literal, str.format() looked up a "self" field -> KeyError.
+                    log.debug(f'> elapsed time for building blendable dataset indices for corpus {self.dataset_builders[0].corpus}: {time.time() - start_time:.2f} (sec)')
+                    return dataset_index, dataset_sample_index
+
+
+                def _build_indices_concat():
                     start_time = time.time()
                     dataset_index = np.zeros(self.num_samples, dtype=np.int64)
                     dataset_sample_index = np.zeros(self.num_samples, dtype=np.int64)
@@ -159,11 +175,15 @@ def _build_indices():
                         "{:.2f} (sec)".format(time.time() - start_time)
                     )
                     return dataset_index, dataset_sample_index
-
-                self.dataset_index, self.dataset_sample_index = _build_indices()
+                
+                if args.blend_sample_in_corpus:
+                    self.dataset_index, self.dataset_sample_index = _build_indices_blended()                    
+                else:
+                    self.dataset_index, self.dataset_sample_index = _build_indices_concat()
+                    
                 np_rng = np.random.RandomState(seed=dataset_builders[0].seed)
                 self.shuffle_index = np.arange(self.num_samples)
-                if shuffle:
+                if args.shuffle_sample_in_corpus:
                     np_rng.shuffle(self.shuffle_index)
                 for i in range(self.num_datasets):
                     self.desc += dataset_builders[i].prefix + ","
@@ -243,7 +263,7 @@ def build_corpus_datasets(dataset_type="train"):
             log.debug(" > number of samples for each corpus ")
             corpus_weights_achieved = {}
             for c in corpus_list:
-                datasets.append(BuildConcatDataset(corpus_builders[c], args.shuffle_sample))
+                datasets.append(BuildCorpusDataset(corpus_builders[c]))
                 total += datasets[-1].num_samples
                 corpus_weights_achieved[c] = (
                     float(datasets[-1].num_samples) / train_num_samples

From 45ff652dc473fa07ec674543199b2a7785e7f9a6 Mon Sep 17 00:00:00 2001
From: Sam Foreman <saforem2@gmail.com>
Date: Mon, 14 Oct 2024 23:28:00 -0500
Subject: [PATCH 17/31] Discard changes to megatron/data/gpt_dataset.py

---
 megatron/data/gpt_dataset.py | 921 ++++++++++++-----------------------
 1 file changed, 325 insertions(+), 596 deletions(-)

diff --git a/megatron/data/gpt_dataset.py b/megatron/data/gpt_dataset.py
index f62b567ddc4..0cf97356a41 100755
--- a/megatron/data/gpt_dataset.py
+++ b/megatron/data/gpt_dataset.py
@@ -5,114 +5,71 @@
 import hashlib
 import os
 import time
-import logging
 
 import numpy as np
 import torch
 from deepspeed.accelerator import get_accelerator
-from megatron import is_rank_0, get_args
+from megatron import print_rank_0, is_rank_0, get_args
 from megatron.core import mpu
-from megatron.data import helpers  # type:ignore
+from megatron.data import helpers
 from megatron.data.blendable_dataset import BlendableDataset
-from megatron.data.dataset_utils import (
-    get_datasets_weights_and_num_samples,
-    get_datasets_corpuses_weights_and_num_samples,
-)
+from megatron.data.dataset_utils import get_datasets_weights_and_num_samples, get_datasets_corpuses_weights_and_num_samples
 from megatron.data.dataset_utils import get_train_valid_test_split_
 from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset
 
-from megatron.utils import PerfTrace, Profile, get_logger
+from megatron.utils import PerfTrace, Profile
 from mpi4py import MPI
 
-try:
-    import ezpz as ez
-    RANK = ez.get_rank()
-except Exception:
-    RANK = torch.distributed.get_rank()
-
-# NOTE: [logging]-----------------------------------------------------------
-# - Set logging level to "INFO" on RANK == 0, "CRITICAL" on all other ranks
-log = logging.getLogger(__name__)
-LOG_LEVEL = str(os.environ.get("LOG_LEVEL", "INFO")).upper()
-log.setLevel(LOG_LEVEL) if RANK == 0 else log.setLevel("CRITICAL")
-# --------------------------------------------------------------------------
-
 dlp = Profile("DATASET")
 
-log = get_logger(__name__, rank_zero_only=True)
-
-
 @dlp.log
-def build_train_valid_test_datasets(
-    data_prefix,
-    data_impl,
-    splits_string,
-    train_valid_test_num_samples,
-    seq_length,
-    seed,
-    skip_warmup,
-    train_data_prefix=None,
-    valid_data_prefix=None,
-    test_data_prefix=None,
-    return_doc_ids=False,
-    *,
-    data_cache_path=None,
-):
+def build_train_valid_test_datasets(data_prefix, data_impl, splits_string,
+                                    train_valid_test_num_samples,
+                                    seq_length, seed, skip_warmup,
+                                    train_data_prefix=None,
+                                    valid_data_prefix=None,
+                                    test_data_prefix=None,
+                                    return_doc_ids=False, *,
+                                    data_cache_path=None):
     """Build train, valid, and test datasets."""
 
     if data_prefix:
-        log.debug("Single data path provided for train, valid & test")
+        print_rank_0("Single data path provided for train, valid & test")
 
         # Single dataset.
         if len(data_prefix) == 1:
-            return _build_train_valid_test_datasets(
-                data_prefix[0],
-                data_impl,
-                splits_string,
-                train_valid_test_num_samples,
-                seq_length,
-                seed,
-                skip_warmup,
-                data_cache_path=data_cache_path,
-            )
+            return _build_train_valid_test_datasets(data_prefix[0],
+                                                    data_impl, splits_string,
+                                                    train_valid_test_num_samples,
+                                                    seq_length, seed, skip_warmup,
+                                                    data_cache_path=data_cache_path)
 
         # Blending dataset.
         # Parse the values.
-        output = get_datasets_corpuses_weights_and_num_samples(
-            data_prefix, train_valid_test_num_samples
-        )
+        output = get_datasets_corpuses_weights_and_num_samples(data_prefix,
+                                                      train_valid_test_num_samples)
         prefixes, corpuses, weights, datasets_train_valid_test_num_samples = output
         corpus_list = sorted(set(corpuses))
         train_num_samples, valid_num_samples, test_num_samples = map(
-            sum, zip(*datasets_train_valid_test_num_samples)
+            sum,
+            zip(*datasets_train_valid_test_num_samples)
         )
 
         class DatasetBuilder:
-            """
+            ''' 
             This is for building individual dataset from each dataset file
-            """
-
+            '''
             @dlp.log
-            def __init__(
-                self,
-                prefix,
-                corpus,
-                data_impl,
-                splits_string,
-                num_samples,
-                seq_length,
-                seed,
-                skip_warmup,
-                return_doc_ids,
-                data_cache_path=data_cache_path,
-                name="train",
-            ):
+            def __init__(self, prefix, corpus, data_impl, splits_string,
+                         num_samples, seq_length, seed, skip_warmup,
+                         return_doc_ids,
+                         data_cache_path=data_cache_path, name='train'):
                 self.prefix = prefix
                 self.data_impl = data_impl
                 self.splits_string = splits_string
-                if name == "train":
+                if name == 'train':
                     self.num_samples = num_samples[0]
-                elif name == "valid":
+                elif name == 'valid':
                     self.num_samples = num_samples[1]
                 else:
                     self.num_samples = num_samples[2]
@@ -127,413 +84,279 @@ def __init__(
                 self.desc = prefix + f"{self.num_samples}" + f"{seq_length}" + f"{seed}"
                 self.build = False
                 self.corpus = corpus
-
             @dlp.log
             def Build(self):
-                self.dataset = _build_train_valid_test_datasets_single(
-                    self.prefix,
-                    self.data_impl,
-                    self.splits_string,
-                    self.num_samples_train_valid_test,
-                    self.seq_length,
-                    self.seed,
-                    self.skip_warmup,
-                    self.name,
-                    self.return_doc_ids,
-                    data_cache_path=self.data_cache_path,
-                )
+                self.dataset = _build_train_valid_test_datasets_single(self.prefix, self.data_impl, self.splits_string,
+                    self.num_samples_train_valid_test, self.seq_length, self.seed, self.skip_warmup, self.name, self.return_doc_ids, 
+                    data_cache_path=self.data_cache_path)
                 self.build = True
                 return self.dataset
 
         class BuildConcatDataset(torch.utils.data.Dataset):
             @dlp.log
-            def __init__(self, dataset_builders, shuffle=False):
+            def __init__(self, dataset_builders):
                 self.dataset_builders = dataset_builders
                 self.num_datasets = len(dataset_builders)
                 self.num_samples = np.sum([d.num_samples for d in dataset_builders])
-                self.indices = np.zeros((self.num_samples, 2), dtype=np.uint64)
-                self.desc = "ConcatDataset:"
-                # m = 0
+                self.indices=np.zeros((self.num_samples, 2), dtype=np.uint64)
+                self.desc="ConcatDataset:"
+                m = 0
                 num_samples_list = np.array([d.num_samples for d in dataset_builders])
                 self.num_samples = np.sum(num_samples_list)
-
                 def _build_indices():
                     start_time = time.time()
                     dataset_index = np.zeros(self.num_samples, dtype=np.int64)
                     dataset_sample_index = np.zeros(self.num_samples, dtype=np.int64)
-                    helpers.build_concat_indices(
-                        dataset_index,
-                        dataset_sample_index,
-                        num_samples_list,
-                        self.num_datasets,
-                        torch.distributed.get_rank() == 0,
-                    )
-                    log.debug(
-                        "> elapsed time for building concat dataset indices: "
-                        "{:.2f} (sec)".format(time.time() - start_time)
-                    )
+                    helpers.build_concat_indices(dataset_index, dataset_sample_index,
+                                                 num_samples_list, 
+                                                 self.num_datasets, 
+                                                 torch.distributed.get_rank()==0)
+                    print_rank_0('> elapsed time for building concat dataset indices: '
+                                 '{:.2f} (sec)'.format(time.time() - start_time))
                     return dataset_index, dataset_sample_index
-
+                
                 self.dataset_index, self.dataset_sample_index = _build_indices()
-                np_rng = np.random.RandomState(seed=dataset_builders[0].seed)
-                self.shuffle_index = np.arange(self.num_samples)
-                if shuffle:
-                    np_rng.shuffle(self.shuffle_index)
                 for i in range(self.num_datasets):
                     self.desc += dataset_builders[i].prefix + ","
 
-                self.desc += (
-                    f"-{self.num_samples}"
-                    + f"-{dataset_builders[0].seq_length}"
-                    + f"{dataset_builders[0].seed}"
-                )
-
+                self.desc += f"-{self.num_samples}" + f"-{dataset_builders[0].seq_length}" + f"{dataset_builders[0].seed}"
             def __len__(self):
                 return self.num_samples
 
             @dlp.log
             def __getitem__(self, idx):
-                id_shuffle = self.shuffle_index[idx]
-                i = self.dataset_index[id_shuffle]
-                j = self.dataset_sample_index[id_shuffle]
+                if idx >= self.num_samples:
+                    print_rank_0(f"WARNING: index overflow encountered {idx} > {self.num_samples} for {self.dataset_builders[0].corpus}; will randomly pick one sample")
+                    id = np.random.randint(self.num_samples)
+                else:
+                    id = idx
+                i = self.dataset_index[idx]
+                j = self.dataset_sample_index[idx]
                 if self.dataset_builders[i].build:
                     return self.dataset_builders[i].dataset[j]
                 else:
                     return self.dataset_builders[i].Build()[j]
+            
 
-        # Predetermine whether need to build the specific dataset or not.
+        # Predetermine whether need to build the specific dataset or not. 
         start_time = time.time()
-        log.debug(" >>> Started building datasets in distributed way ... ")
+        print_rank_0(" >>> Started building datasets in distributed way ... ")
 
         a, b, c = [int(d) for d in splits_string.split(",")]
-
+        
         train_datasets = []
         valid_datasets = []
         test_datasets = []
         # Build individual datasets.
-        args = get_args()
+
         @dlp.log
-        def build_corpus_datasets(dataset_type="train"):
+        def build_corpus_datasets(dataset_type='train'):
             start_time = time.time()
-            log.debug(f" >>> Building {dataset_type} corpus datasets ...")
+            print_rank_0(f" >>> Building {dataset_type} corpus datasets ...")
             datasets = []
             corpus_builders = {}
             corpus_weights = {}
             for c in corpus_list:
                 corpus_builders[c] = []
                 corpus_weights[c] = 0.0
-            dataset_builders = [
-                DatasetBuilder(
-                    prefixes[i],
-                    corpuses[i],
-                    data_impl,
-                    splits_string,
-                    datasets_train_valid_test_num_samples[i],
-                    seq_length,
-                    seed,
-                    skip_warmup,
-                    return_doc_ids,
-                    data_cache_path,
-                    dataset_type,
-                )
-                for i in range(len(weights))
-            ]
-            for i in range(
-                torch.distributed.get_rank()
-                // mpu.get_tensor_model_parallel_world_size(),
-                len(weights),
-                torch.distributed.get_world_size()
-                // mpu.get_tensor_model_parallel_world_size(),
-            ):
+            dataset_builders = [DatasetBuilder(prefixes[i], corpuses[i], data_impl, splits_string,
+                                               datasets_train_valid_test_num_samples[i],
+                                               seq_length, seed, skip_warmup,
+                                               return_doc_ids,data_cache_path, dataset_type) for i in  range(len(weights))]
+            for i in range(torch.distributed.get_rank()//mpu.get_tensor_model_parallel_world_size(), len(weights),  torch.distributed.get_world_size()//mpu.get_tensor_model_parallel_world_size()):
                 dataset_builders[i].Build()
-            log.debug(
-                f" >>> Finished building individual datasets in {time.time() - start_time} seconds"
-            )
+            print_rank_0(f" >>> Finished building individual datasets in {time.time() - start_time} seconds")
             start_concating_time = time.time()
             for i, d in zip(range(len(weights)), dataset_builders):
                 corpus_builders[d.corpus].append(d)
                 corpus_weights[d.corpus] += weights[i]
             total = 0
-            log.debug(" > number of samples for each corpus ")
-            corpus_weights_achieved = {}
+            print_rank_0(" > number of samples for each corpus ")
+            corpus_weights_achieved={}
             for c in corpus_list:
-                datasets.append(BuildConcatDataset(corpus_builders[c], args.shuffle_sample))
+                datasets.append(BuildConcatDataset(corpus_builders[c]))
                 total += datasets[-1].num_samples
-                corpus_weights_achieved[c] = (
-                    float(datasets[-1].num_samples) / train_num_samples
-                )
-                log.debug(
-                    f"    {c}: {datasets[-1].num_samples} w={corpus_weights_achieved[c]} (expected: {corpus_weights[c]})"
-                )
-
-            log.debug(f" > total number of samples: {total}")
-            log.debug(
-                f" >>> Finished concatenating datasets in {time.time() - start_concating_time} seconds"
-            )
-            log.debug(
-                f" >>> Finished building {dataset_type} corpus datasets in {time.time() - start_time} seconds"
-            )
+                corpus_weights_achieved[c] =  float(datasets[-1].num_samples)/train_num_samples                
+                print_rank_0(f"    {c}: {datasets[-1].num_samples} w={corpus_weights_achieved[c]} (expected: {corpus_weights[c]})")
+            
+            print_rank_0(f" > total number of samples: {total}")
+            print_rank_0(f" >>> Finished concatenating datasets in {time.time() - start_concating_time} seconds")
+            print_rank_0(f" >>> Finished building {dataset_type} corpus datasets in {time.time() - start_time} seconds")
             return datasets, [corpus_weights_achieved[c] for c in corpus_list]
 
-        train_weights = None
         if a > 0:
-            train_datasets, train_weights = build_corpus_datasets("train")
-        valid_weights = None
+            train_datasets, train_weights = build_corpus_datasets('train')
+
         if b > 0:
-            valid_datasets, valid_weights = build_corpus_datasets("valid")
-        test_weights = None
-        if c > 0:
-            test_datasets, test_weights = build_corpus_datasets("test")
+            valid_datasets, valid_weights = build_corpus_datasets('valid')
+            
+        if c > 0:            
+            test_datasets, test_weights = build_corpus_datasets('test')
 
         # This barrier is critical to make sure that all the datasets are built once
         # and the metadata were written to the cache folder before other ranks touch them
-        log.debug(
-            f" >>> Rank 0 - finished building datasets in {time.time() - start_time} seconds"
-        )
+        print_rank_0(f" >>> Rank 0 - finished building datasets in {time.time() - start_time} seconds")
         torch.distributed.barrier(group=mpu.get_data_parallel_group())
         torch.distributed.barrier(group=mpu.get_pipeline_model_parallel_group())
         torch.distributed.barrier(group=mpu.get_data_parallel_group())
-        log.debug(
-            f" >>> Finished building datasets (all ranks) in distributed way in {time.time() - start_time} seconds"
-        )
-        log.debug(" >>> Starting to build BlendableDataset")
+        print_rank_0(f" >>> Finished building datasets (all ranks) in distributed way in {time.time() - start_time} seconds")
+        print_rank_0(f" >>> Starting to build BlendableDataset")
         # Blend.
         start_time = time.time()
         blending_train_dataset = None
-        if train_datasets and train_weights:
-            blending_train_dataset = BlendableDataset(
-                train_datasets,
-                train_weights,
-                train_num_samples,
-                data_cache_path=data_cache_path,
-            )
+        if train_datasets:
+            blending_train_dataset = BlendableDataset(train_datasets, train_weights, train_num_samples,
+                                                      data_cache_path=data_cache_path)
         blending_valid_dataset = None
-        if valid_datasets and valid_weights:
-            blending_valid_dataset = BlendableDataset(
-                valid_datasets,
-                valid_weights,
-                valid_num_samples,
-                data_cache_path=data_cache_path,
-            )
+        if valid_datasets:
+            blending_valid_dataset = BlendableDataset(valid_datasets, valid_weights, valid_num_samples,
+                                                      data_cache_path=data_cache_path)
         blending_test_dataset = None
-        if test_datasets and test_weights:
-            blending_test_dataset = BlendableDataset(
-                test_datasets,
-                test_weights,
-                test_num_samples,
-                data_cache_path=data_cache_path,
-            )
+        if test_datasets:
+            blending_test_dataset = BlendableDataset(test_datasets, test_weights, test_num_samples,
+                                                     data_cache_path=data_cache_path)
         end_time = time.time()
-        log.debug(
-            f" >>> Finished building BlendableDataset in {end_time - start_time} seconds"
-        )
-        return (blending_train_dataset, blending_valid_dataset, blending_test_dataset)
+        print_rank_0(f" >>> Finished building BlendableDataset in {end_time - start_time} seconds")
+        return (blending_train_dataset, blending_valid_dataset,
+                blending_test_dataset)
 
     else:
-        log.debug(
-            "Separate data paths provided for train, valid & test. Split string will be ignored."
-        )
+        print_rank_0("Separate data paths provided for train, valid & test. Split string will be ignored.")
 
         train_dataset, valid_dataset, test_dataset = None, None, None
         # Single dataset.
         if train_data_prefix is not None:
-            train_dataset = build_dataset(
-                "train",
-                train_data_prefix,
-                data_impl,
-                splits_string,
-                train_valid_test_num_samples[0],
-                seq_length,
-                seed,
-                skip_warmup,
-                data_cache_path=data_cache_path,
-            )
+            train_dataset = build_dataset("train", train_data_prefix, data_impl,
+                                          splits_string,
+                                          train_valid_test_num_samples[0],
+                                          seq_length, seed, skip_warmup,
+                                          data_cache_path=data_cache_path)
 
         if valid_data_prefix is not None:
-            valid_dataset = build_dataset(
-                "valid",
-                valid_data_prefix,
-                data_impl,
-                splits_string,
-                train_valid_test_num_samples[1],
-                seq_length,
-                seed,
-                False,
-                data_cache_path=data_cache_path,
-            )
+            valid_dataset = build_dataset("valid", valid_data_prefix, data_impl,
+                                          splits_string,
+                                          train_valid_test_num_samples[1],
+                                          seq_length, seed, False,
+                                          data_cache_path=data_cache_path)
+
 
         if test_data_prefix is not None:
-            test_dataset = build_dataset(
-                "test",
-                test_data_prefix,
-                data_impl,
-                splits_string,
-                train_valid_test_num_samples[2],
-                seq_length,
-                seed,
-                False,
-                data_cache_path=data_cache_path,
-            )
+            test_dataset = build_dataset("test", test_data_prefix, data_impl,
+                                         splits_string,
+                                         train_valid_test_num_samples[2],
+                                         seq_length, seed, False,
+                                         data_cache_path=data_cache_path)
 
         return (train_dataset, valid_dataset, test_dataset)
 
-
 @dlp.log
-def _build_train_valid_test_datasets(
-    data_prefix,
-    data_impl,
-    splits_string,
-    train_valid_test_num_samples,
-    seq_length,
-    seed,
-    skip_warmup,
-    return_doc_ids=False,
-    *,
-    data_cache_path=None,
-):
+def _build_train_valid_test_datasets(data_prefix, data_impl, splits_string,
+                                     train_valid_test_num_samples,
+                                     seq_length, seed, skip_warmup,
+                                     return_doc_ids=False, *,
+                                     data_cache_path=None):
     """Build train, valid, and test datasets."""
 
     # Indexed dataset.
-    indexed_dataset = get_indexed_dataset_(data_prefix, data_impl, skip_warmup)
+    indexed_dataset = get_indexed_dataset_(data_prefix,
+                                           data_impl,
+                                           skip_warmup)
 
     total_num_of_documents = indexed_dataset.sizes.shape[0]
     splits = get_train_valid_test_split_(splits_string, total_num_of_documents)
 
     # Print stats about the splits.
-    log.debug(" > dataset split:")
+    print_rank_0(' > dataset split:')
 
     def print_split_stats(name, index):
-        log.debug("    {}:".format(name))
-        log.debug(
-            "     document indices in [{}, {}) total of {} " "documents".format(
-                splits[index], splits[index + 1], splits[index + 1] - splits[index]
-            )
-        )
-
-    print_split_stats("train", 0)
-    print_split_stats("validation", 1)
-    print_split_stats("test", 2)
+        print_rank_0('    {}:'.format(name))
+        print_rank_0('     document indices in [{}, {}) total of {} '
+                     'documents'.format(splits[index], splits[index + 1],
+                                        splits[index + 1] - splits[index]))
+    print_split_stats('train', 0)
+    print_split_stats('validation', 1)
+    print_split_stats('test', 2)
 
     def build_dataset(index, name):
         dataset = None
         if splits[index + 1] > splits[index]:
-            documents = np.arange(
-                start=splits[index], stop=splits[index + 1], step=1, dtype=np.int32
-            )
-            dataset = GPTDataset(
-                name,
-                data_prefix,
-                documents,
-                indexed_dataset,
-                splits_string,
-                train_valid_test_num_samples[index],
-                seq_length,
-                seed,
-                return_doc_ids,
-                data_cache_path=data_cache_path,
-            )
+            documents = np.arange(start=splits[index], stop=splits[index + 1],
+                                  step=1, dtype=np.int32)
+            dataset = GPTDataset(name, data_prefix, documents, indexed_dataset,
+                                 splits_string,
+                                 train_valid_test_num_samples[index],
+                                 seq_length, seed,
+                                 return_doc_ids,
+                                 data_cache_path=data_cache_path)
         return dataset
 
-    train_dataset = build_dataset(0, "train")
-    valid_dataset = build_dataset(1, "valid")
-    test_dataset = build_dataset(2, "test")
+    train_dataset = build_dataset(0, 'train')
+    valid_dataset = build_dataset(1, 'valid')
+    test_dataset = build_dataset(2, 'test')
 
     return (train_dataset, valid_dataset, test_dataset)
 
-
 @dlp.log
-def _build_train_valid_test_datasets_single(
-    data_prefix,
-    data_impl,
-    splits_string,
-    train_valid_test_num_samples,
-    seq_length,
-    seed,
-    skip_warmup,
-    name,
-    return_doc_ids=False,
-    *,
-    data_cache_path=None,
-):
+def _build_train_valid_test_datasets_single(data_prefix, data_impl, splits_string,
+                            train_valid_test_num_samples,
+                            seq_length, seed, skip_warmup, name, 
+                            return_doc_ids=False, *,
+                            data_cache_path=None):
     """Build train, valid, and test datasets."""
 
     # Each rank print out information
-    log.debug(f" >> building dataset for {data_prefix}")
+    print_rank_0(f" >> building dataset for {data_prefix}")
     # Indexed dataset.
-    indexed_dataset = get_indexed_dataset_(data_prefix, data_impl, skip_warmup)
+    indexed_dataset = get_indexed_dataset_(data_prefix,
+                                           data_impl,
+                                           skip_warmup)
 
     total_num_of_documents = indexed_dataset.sizes.shape[0]
     splits = get_train_valid_test_split_(splits_string, total_num_of_documents)
 
     # Print stats about the splits.
-    log.debug(" > dataset split:")
+    print_rank_0(' > dataset split:')
 
     def print_split_stats(name, index):
-        log.debug("    {}:".format(name))
-        log.debug(
-            "     document indices in [{}, {}) total of {} " "documents".format(
-                splits[index], splits[index + 1], splits[index + 1] - splits[index]
-            )
-        )
-
-    print_split_stats("train", 0)
-    print_split_stats("validation", 1)
-    print_split_stats("test", 2)
+        print_rank_0('    {}:'.format(name))
+        print_rank_0('     document indices in [{}, {}) total of {} '
+                     'documents'.format(splits[index], splits[index + 1],
+                                        splits[index + 1] - splits[index]))
+    print_split_stats('train', 0)
+    print_split_stats('validation', 1)
+    print_split_stats('test', 2)
 
     def build_dataset(index, name):
         dataset = None
         if splits[index + 1] > splits[index]:
-            documents = np.arange(
-                start=splits[index], stop=splits[index + 1], step=1, dtype=np.int32
-            )
-            dataset = GPTDataset(
-                name,
-                data_prefix,
-                documents,
-                indexed_dataset,
-                splits_string,
-                train_valid_test_num_samples[index],
-                seq_length,
-                seed,
-                return_doc_ids,
-                data_cache_path=data_cache_path,
-            )
+            documents = np.arange(start=splits[index], stop=splits[index + 1],
+                                  step=1, dtype=np.int32)
+            dataset = GPTDataset(name, data_prefix, documents, indexed_dataset,
+                                 splits_string,
+                                 train_valid_test_num_samples[index],
+                                 seq_length, seed,
+                                 return_doc_ids,
+                                 data_cache_path=data_cache_path)
         return dataset
-
-    if name.find("train") != -1:
-        return build_dataset(0, "train")
-    if name.find("valid") != -1:
-        return build_dataset(1, "valid")
-    if name.find("test") != -1:
-        return build_dataset(2, "test")
-
+    if name.find("train")!=-1:
+        return build_dataset(0, 'train')
+    if name.find("valid")!=-1:
+        return build_dataset(1, 'valid')
+    if name.find("test")!=-1:
+        return build_dataset(2, 'test')
 
 @dlp.log
-def build_dataset(
-    dataset_name,
-    data_prefix,
-    data_impl,
-    splits_string,
-    num_samples,
-    seq_length,
-    seed,
-    skip_warmup,
-    *,
-    data_cache_path=None,
-):
+def build_dataset(dataset_name, data_prefix, data_impl,
+                  splits_string, num_samples,
+                  seq_length, seed, skip_warmup,
+                  *,
+                  data_cache_path=None):
     dataset = None
     if len(data_prefix) == 1:
-        dataset = _build_dataset(
-            dataset_name,
-            data_prefix[0],
-            data_impl,
-            splits_string,
-            num_samples,
-            seq_length,
-            seed,
-            skip_warmup,
-            data_cache_path=data_cache_path,
-        )
+        dataset = _build_dataset(dataset_name, data_prefix[0], data_impl,
+                                 splits_string, num_samples, seq_length,
+                                 seed, skip_warmup,
+                                 data_cache_path=data_cache_path)
     else:
         # Blending dataset.
         # Parse the values.
@@ -544,108 +367,73 @@ def build_dataset(
         # Build individual datasets.
         datasets = []
         for i in range(len(prefixes)):
-            ds = _build_dataset(
-                dataset_name,
-                prefixes[i],
-                data_impl,
-                splits_string,
-                dataset_num_samples[i],
-                seq_length,
-                seed,
-                skip_warmup,
-                data_cache_path=data_cache_path,
-            )
+            ds = _build_dataset(dataset_name, prefixes[i], data_impl,
+                                splits_string, dataset_num_samples[i],
+                                seq_length, seed, skip_warmup,
+                                data_cache_path=data_cache_path)
             if ds:
                 datasets.append(ds)
 
         if datasets:
-            dataset = BlendableDataset(
-                datasets, weights, num_samples, data_cache_path=data_cache_path
-            )
+            dataset = BlendableDataset(datasets, weights, num_samples,
+                                       data_cache_path=data_cache_path)
 
     return dataset
 
-
 @dlp.log
-def _build_dataset(
-    dataset_name,
-    data_prefix,
-    data_impl,
-    splits_string,
-    num_samples,
-    seq_length,
-    seed,
-    skip_warmup,
-    *,
-    data_cache_path=None,
-):
+def _build_dataset(dataset_name, data_prefix, data_impl, splits_string,
+                   num_samples, seq_length, seed, skip_warmup,
+                   *,
+                   data_cache_path=None):
     """
     Build dataset. This method is called when individual
     train, valid, test datasets are provided
     """
 
     # Indexed dataset.
-    indexed_dataset = get_indexed_dataset_(data_prefix, data_impl, skip_warmup)
+    indexed_dataset = get_indexed_dataset_(data_prefix,
+                                           data_impl,
+                                           skip_warmup)
 
     total_num_of_documents = indexed_dataset.sizes.shape[0]
 
-    log.debug("    {}:".format(dataset_name))
-    log.debug(
-        "     document indices in [0, {}) total of {} " "documents".format(
-            total_num_of_documents, total_num_of_documents
-        )
-    )
-
-    documents = np.arange(start=0, stop=total_num_of_documents, step=1, dtype=np.int32)
-
-    dataset = GPTDataset(
-        dataset_name,
-        data_prefix,
-        documents,
-        indexed_dataset,
-        splits_string,
-        num_samples,
-        seq_length,
-        seed,
-        data_cache_path=data_cache_path,
-    )
+    print_rank_0('    {}:'.format(dataset_name))
+    print_rank_0('     document indices in [0, {}) total of {} '
+                 'documents'.format(total_num_of_documents, total_num_of_documents))
 
-    return dataset
+    documents = np.arange(start=0, stop=total_num_of_documents,
+                        step=1, dtype=np.int32)
 
+    dataset = GPTDataset(dataset_name, data_prefix, documents, indexed_dataset,
+                         splits_string, num_samples, seq_length, seed,
+                         data_cache_path=data_cache_path)
+
+    return dataset
 
 @dlp.log
 def get_indexed_dataset_(data_prefix, data_impl, skip_warmup):
     """Build indexed dataset."""
-    log.debug(" > building dataset index ...")
+    print_rank_0(' > building dataset index ...')
 
     start_time = time.time()
-    indexed_dataset = make_indexed_dataset(data_prefix, data_impl, skip_warmup)
-    log.debug(
-        " > finished creating indexed dataset in {:4f} " "seconds".format(
-            time.time() - start_time
-        )
-    )
-    log.debug("    number of documents: {}".format(indexed_dataset.sizes.shape[0]))
+    indexed_dataset = make_indexed_dataset(data_prefix,
+                                           data_impl,
+                                           skip_warmup)
+    print_rank_0(' > finished creating indexed dataset in {:4f} '
+                 'seconds'.format(time.time() - start_time))
+    print_rank_0('    number of documents: {}'.format(
+        indexed_dataset.sizes.shape[0]))
 
     return indexed_dataset
 
 
 class GPTDataset(torch.utils.data.Dataset):
     @dlp.log
-    def __init__(
-        self,
-        name,
-        data_prefix,
-        documents,
-        indexed_dataset,
-        splits_string,
-        num_samples,
-        seq_length,
-        seed,
-        return_doc_ids=False,
-        *,
-        data_cache_path=None,
-    ):
+    def __init__(self, name, data_prefix, documents, indexed_dataset,
+                 splits_string, num_samples, seq_length, seed,
+                 return_doc_ids=False, *,
+                 data_cache_path=None):
+
         self.name = name
         self.indexed_dataset = indexed_dataset
         self.return_doc_ids = return_doc_ids
@@ -655,29 +443,20 @@ def __init__(
         assert np.max(documents) < indexed_dataset.sizes.shape[0]
 
         # Build index mappings.
-        self.doc_idx, self.sample_idx, self.shuffle_idx, self.desc, self.desc_hash = (
-            _build_index_mappings(
-                self.name,
-                data_prefix,
-                documents,
-                self.indexed_dataset.sizes,
-                splits_string,
-                num_samples,
-                seq_length,
-                seed,
-                data_cache_path=data_cache_path,
-            )
-        )
+        self.doc_idx, self.sample_idx, self.shuffle_idx, self.desc, self.desc_hash = \
+            _build_index_mappings(self.name, data_prefix,
+                                  documents, self.indexed_dataset.sizes,
+                                  splits_string, num_samples, seq_length, seed,
+                                  data_cache_path=data_cache_path)
+
 
     def __len__(self):
         # -1 is due to data structure used to retieve the index:
         #    sample i --> [sample_idx[i], sample_idx[i+1])
         return self.sample_idx.shape[0] - 1
-
     @dlp.log
     def __getitem__(self, idx):
         args = get_args()
-        assert args is not None
         orig_idx = idx
         # Get the shuffled index.
         try:
@@ -686,24 +465,21 @@ def __getitem__(self, idx):
             if is_rank_0():
                 import json
                 from rich import print_json
-
                 print(exc)
                 print(
-                    "\n".join(
-                        [
-                            "-------------------------------------------------",
-                            f"Trying to access {idx=} from self.shuffle_idx,",
-                            f"but {len(self.shuffle_idx)=}",
-                            "-------------------------------------------------",
-                        ]
+                    '\n'.join(
+                        ['-------------------------------------------------',
+                         f'Trying to access {idx=} from self.shuffle_idx,',
+                         f'but {len(self.shuffle_idx)=}',
+                         '-------------------------------------------------']
                     )
                 )
                 print_json(
                     json.dumps(
                         {
-                            "doc_idx": len(self.doc_idx),
-                            "sample_idx": len(self.sample_idx),
-                            "shuffle_idx": len(self.shuffle_idx),
+                            'doc_idx': len(self.doc_idx),
+                            'sample_idx': len(self.sample_idx),
+                            'shuffle_idx': len(self.shuffle_idx),
                         },
                         indent=4,
                     )
@@ -717,57 +493,45 @@ def __getitem__(self, idx):
         doc_ids = []
         if doc_index_f == doc_index_l:
             doc_ids.append(self.doc_idx[doc_index_f])
-            sample = self.indexed_dataset.get(
-                self.doc_idx[doc_index_f],
-                offset=offset_f,
-                length=offset_l - offset_f + 1,
-            )
+            sample = self.indexed_dataset.get(self.doc_idx[doc_index_f],
+                                              offset=offset_f,
+                                              length=offset_l - offset_f + 1)
         else:
             # Otherwise, get the rest of the initial document.
             doc_ids.append(self.doc_idx[doc_index_f])
-            sample_list = [
-                self.indexed_dataset.get(self.doc_idx[doc_index_f], offset=offset_f)
-            ]
+            sample_list = [self.indexed_dataset.get(self.doc_idx[doc_index_f],
+                                                    offset=offset_f)]
             # Loop over all in between documents and add the entire document.
             for i in range(doc_index_f + 1, doc_index_l):
                 doc_ids.append(self.doc_idx[i])
                 sample_list.append(self.indexed_dataset.get(self.doc_idx[i]))
             # And finally add the relevant portion of last document.
             doc_ids.append(self.doc_idx[doc_index_l])
-            sample_list.append(
-                self.indexed_dataset.get(self.doc_idx[doc_index_l], length=offset_l + 1)
-            )
+            sample_list.append(self.indexed_dataset.get(
+                self.doc_idx[doc_index_l],
+                length=offset_l + 1))
             sample = np.concatenate(sample_list)
 
-        text_name = "text"
+        text_name = 'text'
         if args.use_dataset_only:
-            text_name = "input_ids"
+            text_name = 'input_ids'
         sample_dict = {text_name: np.array(sample, dtype=np.int64)}
         if args.return_data_index:
-            sample_dict.update({"index": np.array([orig_idx], dtype=np.int64)})
+            sample_dict.update({'index': np.array([orig_idx], dtype=np.int64)})
 
-        if self.return_doc_ids:  # for retro preprocessing
-            sample_dict.update({"doc_ids": np.array(doc_ids, dtype=np.int64)})
+        if self.return_doc_ids: # for retro preprocessing
+            sample_dict.update({'doc_ids': np.array(doc_ids, dtype=np.int64)})
 
         if args.use_dataset_only:
-            sample_dict.update({"labels": np.array(sample, dtype=np.int64)})
+            sample_dict.update({'labels': np.array(sample, dtype=np.int64)})
 
         return sample_dict
 
-
 @dlp.log
-def _build_index_mappings(
-    name,
-    data_prefix,
-    documents,
-    sizes,
-    splits_string,
-    num_samples,
-    seq_length,
-    seed,
-    *,
-    data_cache_path,
-):
+def _build_index_mappings(name, data_prefix, documents, sizes,
+                          splits_string, num_samples, seq_length, seed,
+                          *,
+                          data_cache_path):
     """Build doc-idx, sample-idx, and shuffle-idx.
     doc-idx: is an array (ordered) of documents to be used in training.
     sample-idx: is the start document index and document offset for each
@@ -775,11 +539,10 @@ def _build_index_mappings(
     shuffle-idx: maps the sample index into a random index into sample-idx.
     """
     args = get_args()
-    assert args is not None
     # Number of tokens in each epoch and number of required epochs.
     tokens_per_epoch = _num_tokens(documents, sizes)
     num_epochs = _num_epochs(tokens_per_epoch, seq_length, num_samples)
-    if args.train_data_exact_num_epochs is not None and name == "train":
+    if args.train_data_exact_num_epochs is not None and name == 'train':
         num_epochs = args.train_data_exact_num_epochs
 
     # rng state
@@ -794,13 +557,13 @@ def _build_index_mappings(
     desc += f"Sequence length {seq_length}\n"
     desc += f"Random seed {seed}\n"
     desc += f"Split {splits_string}\n"
-    desc_hash = hashlib.md5(desc.encode("utf-8")).hexdigest()
+    desc_hash = hashlib.md5(desc.encode('utf-8')).hexdigest()
     desc_filename = desc_hash + ".dsc"
-    doc_idx_filename = desc_hash + "_doc_idx.npy"
-    sample_idx_filename = desc_hash + "_sample_idx.npy"
-    shuffle_idx_filename = desc_hash + "_shuffle_idx.npy"
+    doc_idx_filename = desc_hash + '_doc_idx.npy'
+    sample_idx_filename = desc_hash + '_sample_idx.npy'
+    shuffle_idx_filename = desc_hash + '_shuffle_idx.npy'
 
-    if name == "train":
+    if name == 'train':
         # force to use certain index files
         if args.train_desc_path is not None:
             desc_filename = args.train_desc_path
@@ -815,15 +578,15 @@ def _build_index_mappings(
     # duplication, then look in data-cache-path if specified,
     # If nothing is found, use the last path looked in
     build_indices = True
-    prefixes = [os.path.join(os.path.dirname(data_prefix), "index-cache")]
+    prefixes = [os.path.join(os.path.dirname(data_prefix), 'index-cache')]
     if data_cache_path is not None:
         prefixes.append(data_cache_path)
     for prefix in prefixes:
         idx_path = {
-            "desc": os.path.join(prefix, desc_filename),
-            "doc": os.path.join(prefix, doc_idx_filename),
-            "sample": os.path.join(prefix, sample_idx_filename),
-            "shuffle": os.path.join(prefix, shuffle_idx_filename),
+            'desc': os.path.join(prefix, desc_filename),
+            'doc': os.path.join(prefix, doc_idx_filename),
+            'sample': os.path.join(prefix, sample_idx_filename),
+            'shuffle': os.path.join(prefix, shuffle_idx_filename)
         }
         for f in idx_path.values():
             if not os.path.isfile(f):
@@ -832,17 +595,15 @@ def _build_index_mappings(
             # Found our files!
             build_indices = False
             break
-    data_cache_dir = os.path.dirname(idx_path["desc"])
+    data_cache_dir = os.path.dirname(idx_path['desc'])
     data_cache_success = True
 
     # Build the indexed mapping if not exist.
     if build_indices:
-        # Since this function will be called by all the rank in the very beginning. Therefore, we assume that all the
-        # ranks will first create the document files, and then read it.
+        # Since this function will be called by all the rank in the very beginning. Therefore, we assume that all the 
+        # ranks will first create the document files, and then read it. 
         # There will not be contension effects going on either
-        log.warning(
-            f" > WARNING: could not find index map files, building on rank {torch.distributed.get_rank()}"
-        )
+        print_rank_0(f" > WARNING: could not find index map files, building on rank {torch.distributed.get_rank()}")
 
         # For the last epoch, decide whether include the entire epoch
         # in the global shuffle or not.
@@ -851,80 +612,64 @@ def _build_index_mappings(
         # not mean anything.
         if num_epochs == 1:
             separate_last_epoch = False
-            log.debug(
-                " > only one epoch required, setting " "separate_last_epoch to False"
-            )
+            print_rank_0(' > only one epoch required, setting '
+                  'separate_last_epoch to False')
 
         else:
             # Get the number of samples for the last epoch
             num_samples_from_epochs_minus_one = (
-                (num_epochs - 1) * tokens_per_epoch - 1
-            ) // seq_length
-            last_epoch_num_samples = num_samples - num_samples_from_epochs_minus_one
-            assert (
-                last_epoch_num_samples >= 0
-            ), "last epoch number of samples should be non-negative."
+                (num_epochs - 1) * tokens_per_epoch - 1) // seq_length
+            last_epoch_num_samples = num_samples - \
+                                     num_samples_from_epochs_minus_one
+            assert last_epoch_num_samples >= 0, \
+                'last epoch number of samples should be non-negative.'
             num_samples_per_epoch = (tokens_per_epoch - 1) // seq_length
-            assert last_epoch_num_samples <= (
-                num_samples_per_epoch + 1
-            ), "last epoch number of samples exceeded max value."
+            assert last_epoch_num_samples <= (num_samples_per_epoch + 1), \
+                'last epoch number of samples exceeded max value.'
             # If we have less than 80% of the samples for the last epoch,
             # seperate out the epoch and treat it differently.
             # Note: the 80% number is just based on common sense and can
             # be adjusted if needed.
-            separate_last_epoch = last_epoch_num_samples < int(
-                0.80 * num_samples_per_epoch
-            )
+            separate_last_epoch = (last_epoch_num_samples <
+                                   int(0.80 * num_samples_per_epoch))
             if separate_last_epoch:
-                string = (
-                    " > last epoch number of samples ({}) is smaller "
-                    "than 80% of number of samples per epoch ({}), "
-                    "setting separate_last_epoch to True"
-                )
+                string = ' > last epoch number of samples ({}) is smaller '\
+                         'than 80% of number of samples per epoch ({}), '\
+                         'setting separate_last_epoch to True'
             else:
-                string = (
-                    " > last epoch number of samples ({}) is larger "
-                    "than 80% of number of samples per epoch ({}), "
-                    "setting separate_last_epoch to False"
-                )
-            log.debug(string.format(last_epoch_num_samples, num_samples_per_epoch))
+                string = ' > last epoch number of samples ({}) is larger '\
+                         'than 80% of number of samples per epoch ({}), '\
+                         'setting separate_last_epoch to False'
+            print_rank_0(string.format(last_epoch_num_samples,
+                                num_samples_per_epoch))
+
 
         try:
             os.makedirs(data_cache_dir, exist_ok=True)
 
             # description
-            with open(idx_path["desc"], "wt") as fd:
+            with open(idx_path['desc'], 'wt') as fd:
                 fd.write(desc)
 
             # doc-idx.
             start_time = time.time()
-            doc_idx = _build_doc_idx(documents, num_epochs, np_rng, separate_last_epoch)
-            np.save(idx_path["doc"], doc_idx, allow_pickle=True)
-            log.debug(
-                " > elasped time to build and save doc-idx mapping "
-                "(seconds): {:4f}".format(time.time() - start_time)
-            )
+            doc_idx = _build_doc_idx(documents, num_epochs, np_rng,
+                                     separate_last_epoch)
+            np.save(idx_path['doc'], doc_idx, allow_pickle=True)
+            print_rank_0(' > elasped time to build and save doc-idx mapping '
+                         '(seconds): {:4f}'.format(time.time() - start_time))
             # sample-idx.
             start_time = time.time()
             # Use C++ implementation for speed.
             # First compile and then import.
             from megatron.data import helpers
-
             assert doc_idx.dtype == np.int32
             assert sizes.dtype == np.int32
-            sample_idx = helpers.build_sample_idx(
-                sizes,
-                doc_idx,
-                seq_length,
-                num_epochs,
-                tokens_per_epoch,
-                torch.distributed.get_rank() == 0,
-            )
-            np.save(idx_path["sample"], sample_idx, allow_pickle=True)
-            log.debug(
-                " > elasped time to build and save sample-idx mapping "
-                "(seconds): {:4f}".format(time.time() - start_time)
-            )
+            sample_idx = helpers.build_sample_idx(sizes, doc_idx, seq_length,
+                                                  num_epochs, tokens_per_epoch, torch.distributed.get_rank()==0)
+            np.save(idx_path['sample'], sample_idx, allow_pickle=True)
+            print_rank_0(' > elasped time to build and save sample-idx mapping '
+                         '(seconds): {:4f}'.format(time.time() - start_time))
             # shuffle-idx.
             start_time = time.time()
             # -1 is due to data structure used to retieve the index:
@@ -933,46 +678,35 @@ def _build_index_mappings(
                 num_samples_ = num_samples_from_epochs_minus_one
             else:
                 num_samples_ = sample_idx.shape[0] - 1
-            shuffle_idx = _build_shuffle_idx(
-                num_samples_, sample_idx.shape[0] - 1, np_rng
-            )
-            np.save(idx_path["shuffle"], shuffle_idx, allow_pickle=True)
-            log.debug(
-                " > elasped time to build and save shuffle-idx mapping"
-                " (seconds): {:4f}".format(time.time() - start_time)
-            )
+            shuffle_idx = _build_shuffle_idx(num_samples_,
+                                             sample_idx.shape[0] - 1, np_rng)
+            np.save(idx_path['shuffle'], shuffle_idx, allow_pickle=True)
+            print_rank_0(' > elasped time to build and save shuffle-idx mapping'
+                         ' (seconds): {:4f}'.format(time.time() - start_time))
         except OSError:
-            print(
-                f"There was an error trying to create the data cache directory ({data_cache_dir})"
-            )
-            print(
-                'or a file in it. This defaults to a directory "index-cache" within the directory'
-            )
-            print(
-                "the data files are in and can be set with the --data-cache-path argument. Please"
-            )
-            print(
-                "ensure you have write access to this directory or specify one that you do have"
-            )
-            print("write access to.")
+            print(f'There was an error trying to create the data cache directory ({data_cache_dir})')
+            print('or a file in it. This defaults to a directory "index-cache" within the directory')
+            print('the data files are in and can be set with the --data-cache-path argument. Please')
+            print('ensure you have write access to this directory or specify one that you do have')
+            print('write access to.')
             data_cache_success = False
 
     # Load mappings.
     start_time = time.time()
-    log.debug(f" > loading doc-idx mapping from {idx_path['doc']}")
-    doc_idx = np.load(idx_path["doc"], allow_pickle=True, mmap_mode="r")
+    print_rank_0(f" > loading doc-idx mapping from {idx_path['doc']}")
+    doc_idx = np.load(idx_path['doc'], allow_pickle=True, mmap_mode='r')
 
-    log.debug(f" > loading sample-idx mapping from {idx_path['sample']}")
-    sample_idx = np.load(idx_path["sample"], allow_pickle=True, mmap_mode="r")
+    print_rank_0(f" > loading sample-idx mapping from {idx_path['sample']}")
+    sample_idx = np.load(idx_path['sample'], allow_pickle=True, mmap_mode='r')
 
-    log.debug(f" > loading shuffle-idx mapping from {idx_path['shuffle']}")
-    shuffle_idx = np.load(idx_path["shuffle"], allow_pickle=True, mmap_mode="r")
+    print_rank_0(f" > loading shuffle-idx mapping from {idx_path['shuffle']}")
+    shuffle_idx = np.load(idx_path['shuffle'], allow_pickle=True, mmap_mode='r')
 
-    log.debug(
-        "    loaded indexed file in {:3.3f} seconds".format(time.time() - start_time)
-    )
-    log.debug("    total number of samples: {}".format(sample_idx.shape[0]))
-    log.debug("    total number of epochs: {}".format(num_epochs))
+    print_rank_0('    loaded indexed file in {:3.3f} seconds'.format(
+        time.time() - start_time))
+    print_rank_0('    total number of samples: {}'.format(
+        sample_idx.shape[0]))
+    print_rank_0('    total number of epochs: {}'.format(num_epochs))
 
     return doc_idx, sample_idx, shuffle_idx, desc, desc_hash
 
@@ -996,26 +730,25 @@ def _num_epochs(tokens_per_epoch, seq_length, num_samples):
         if ((total_tokens - 1) // seq_length) >= num_samples:
             return num_epochs
 
-
 @dlp.log
 def _build_doc_idx(documents, num_epochs, np_rng, separate_last_epoch):
     """Build an array with length = number-of-epochs * number-of-dcuments.
     Each index is mapped to a corresponding document."""
     if not separate_last_epoch or num_epochs == 1:
-        doc_idx = np.mgrid[0:num_epochs, 0 : len(documents)][1]
+        doc_idx = np.mgrid[0:num_epochs, 0:len(documents)][1]
         doc_idx[:] = documents
         doc_idx = doc_idx.reshape(-1)
         doc_idx = doc_idx.astype(np.int32)
         np_rng.shuffle(doc_idx)
         return doc_idx
 
-    doc_idx_first = _build_doc_idx(documents, num_epochs - 1, np_rng, False)
+    doc_idx_first = _build_doc_idx(documents, num_epochs-1, np_rng, False)
     doc_idx_last = _build_doc_idx(documents, 1, np_rng, False)
     return np.concatenate((doc_idx_first, doc_idx_last))
 
-
 @dlp.log
-def _build_sample_idx(sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch):
+def _build_sample_idx(sizes, doc_idx, seq_length,
+                      num_epochs, tokens_per_epoch):
     """Sample index mapping is a 2D array with sizes
     [number-of-samples + 1, 2] where [..., 0] contains
     the index into `doc_idx` and [..., 1] is the
@@ -1049,7 +782,7 @@ def _build_sample_idx(sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch):
             # Note that -1 here is for the same reason we have -1 in
             # `_num_epochs` calculations.
             if remaining_seq_length <= 0:
-                doc_offset += remaining_seq_length + doc_length - 1
+                doc_offset += (remaining_seq_length + doc_length - 1)
                 remaining_seq_length = 0
             else:
                 # Otherwise, start from the begining of the next document.
@@ -1062,28 +795,24 @@ def _build_sample_idx(sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch):
 
     return sample_idx
 
-
 @dlp.log
 def _build_shuffle_idx(num_samples, total_size, np_rng):
     """Build the range [0, size) and shuffle."""
-    log.debug(
-        " > building shuffle index with split [0, {}) and [{}, {}) " "...".format(
-            num_samples, num_samples, total_size
-        )
-    )
+    print_rank_0(' > building shuffle index with split [0, {}) and [{}, {}) '
+          '...'.format(num_samples, num_samples, total_size))
 
     dtype_ = np.uint32
     if total_size >= (np.iinfo(np.uint32).max - 1):
         dtype_ = np.int64
 
-    shuffle_idx_first = np.arange(start=0, stop=num_samples, step=1, dtype=dtype_)
+    shuffle_idx_first = np.arange(start=0, stop=num_samples,
+                                  step=1, dtype=dtype_)
     np_rng.shuffle(shuffle_idx_first)
     if num_samples == total_size:
         return shuffle_idx_first
 
-    shuffle_idx_last = np.arange(
-        start=num_samples, stop=total_size, step=1, dtype=dtype_
-    )
+    shuffle_idx_last = np.arange(start=num_samples, stop=total_size,
+                                 step=1, dtype=dtype_)
     np_rng.shuffle(shuffle_idx_last)
 
     return np.concatenate((shuffle_idx_first, shuffle_idx_last))

From 52a406ce7005f7be260462688020f325190a48b8 Mon Sep 17 00:00:00 2001
From: Sam Foreman <saforem2@gmail.com>
Date: Mon, 14 Oct 2024 23:51:07 -0500
Subject: [PATCH 18/31] Consistent logging in `megatron/data/*.py`

---
 megatron/data/blendable_dataset.py |  14 +-
 megatron/data/gpt_dataset.py       | 907 ++++++++++++++++++-----------
 megatron/data/indexed_dataset.py   |  16 -
 3 files changed, 590 insertions(+), 347 deletions(-)
 mode change 100755 => 100644 megatron/data/gpt_dataset.py

diff --git a/megatron/data/blendable_dataset.py b/megatron/data/blendable_dataset.py
index 590a379971d..979e9a174e7 100755
--- a/megatron/data/blendable_dataset.py
+++ b/megatron/data/blendable_dataset.py
@@ -49,8 +49,10 @@ def _build_indices():
             helpers.build_blending_indices(dataset_index, dataset_sample_index,
                                            weights, num_datasets, self.size,
                                            torch.distributed.get_rank() == 0)
-            log.info('> elapsed time for building blendable dataset indices: '
-                         '{:.2f} (sec)'.format(time.time() - start_time))
+            log.info(
+                "> elapsed time for building blendable dataset indices: "
+                f"{time.time() - start_time:.2f} (sec)"
+            )
             return dataset_index, dataset_sample_index
 
         desc = "Blendable dataset\n\n"
@@ -74,7 +76,7 @@ def _build_indices():
                       ' dataset, building indices on rank 0 ...', flush=True)
                 dataset_index, dataset_sample_index = _build_indices()
                 try:
-                    log.info(" > saving index map files")
+                    log.debug(" > saving index map files")
                     start_time = time.time()
                     os.makedirs(os.path.dirname(index_path), exist_ok=True)
                     with open(desc_path, 'wt') as fd:
@@ -105,7 +107,7 @@ def _build_indices():
             torch.distributed.barrier(group=mpu.get_data_parallel_group())
             torch.distributed.barrier(group=mpu.get_pipeline_model_parallel_group())
             torch.distributed.barrier(group=mpu.get_data_parallel_group())
-            
+
             start_time = time.time()
             log.info(f'> loading blendable dataset index: {index_path}')
             self.dataset_index = np.load(index_path, allow_pickle=True, mmap_mode='r')
@@ -113,7 +115,7 @@ def _build_indices():
             log.info(f'> loading blendable dataset sample index: {sample_index_path}')
             self.dataset_sample_index = np.load(sample_index_path, allow_pickle=True, mmap_mode='r')
             assert self.dataset_sample_index.size == self.size
-            log.info(f'> finished loading in {time.time() - start_time} seconds')            
+            log.info(f'> finished loading in {time.time() - start_time} seconds')
         else:
             self.dataset_index, self.dataset_sample_index = _build_indices()
 
@@ -139,4 +141,4 @@ def __getitem__(self, idx):
         return {
             "dataset_idx" : dataset_idx,
             **self.datasets[dataset_idx][sample_idx],
-        }            
+        }
diff --git a/megatron/data/gpt_dataset.py b/megatron/data/gpt_dataset.py
old mode 100755
new mode 100644
index 0cf97356a41..c412d02b31c
--- a/megatron/data/gpt_dataset.py
+++ b/megatron/data/gpt_dataset.py
@@ -9,67 +9,96 @@
 import numpy as np
 import torch
 from deepspeed.accelerator import get_accelerator
-from megatron import print_rank_0, is_rank_0, get_args
+from megatron import is_rank_0, get_args
 from megatron.core import mpu
-from megatron.data import helpers
+from megatron.data import helpers  # type:ignore
 from megatron.data.blendable_dataset import BlendableDataset
-from megatron.data.dataset_utils import get_datasets_weights_and_num_samples, get_datasets_corpuses_weights_and_num_samples
+from megatron.data.dataset_utils import (
+    get_datasets_weights_and_num_samples,
+    get_datasets_corpuses_weights_and_num_samples,
+)
 from megatron.data.dataset_utils import get_train_valid_test_split_
 from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset
 
-from megatron.utils import PerfTrace, Profile
+from megatron.utils import PerfTrace, Profile, get_logger
 from mpi4py import MPI
 
 dlp = Profile("DATASET")
 
+log = get_logger(__name__, rank_zero_only=True)
+
+
 @dlp.log
-def build_train_valid_test_datasets(data_prefix, data_impl, splits_string,
-                                    train_valid_test_num_samples,
-                                    seq_length, seed, skip_warmup,
-                                    train_data_prefix=None,
-                                    valid_data_prefix=None,
-                                    test_data_prefix=None,
-                                    return_doc_ids=False, *,
-                                    data_cache_path=None):
+def build_train_valid_test_datasets(
+    data_prefix,
+    data_impl,
+    splits_string,
+    train_valid_test_num_samples,
+    seq_length,
+    seed,
+    skip_warmup,
+    train_data_prefix=None,
+    valid_data_prefix=None,
+    test_data_prefix=None,
+    return_doc_ids=False,
+    *,
+    data_cache_path=None,
+):
     """Build train, valid, and test datasets."""
 
     if data_prefix:
-        print_rank_0("Single data path provided for train, valid & test")
+        log.debug("Single data path provided for train, valid & test")
 
         # Single dataset.
         if len(data_prefix) == 1:
-            return _build_train_valid_test_datasets(data_prefix[0],
-                                                    data_impl, splits_string,
-                                                    train_valid_test_num_samples,
-                                                    seq_length, seed, skip_warmup,
-                                                    data_cache_path=data_cache_path)
+            return _build_train_valid_test_datasets(
+                data_prefix[0],
+                data_impl,
+                splits_string,
+                train_valid_test_num_samples,
+                seq_length,
+                seed,
+                skip_warmup,
+                data_cache_path=data_cache_path,
+            )
 
         # Blending dataset.
         # Parse the values.
-        output = get_datasets_corpuses_weights_and_num_samples(data_prefix,
-                                                      train_valid_test_num_samples)
+        output = get_datasets_corpuses_weights_and_num_samples(
+            data_prefix, train_valid_test_num_samples
+        )
         prefixes, corpuses, weights, datasets_train_valid_test_num_samples = output
         corpus_list = sorted(set(corpuses))
         train_num_samples, valid_num_samples, test_num_samples = map(
-            sum,
-            zip(*datasets_train_valid_test_num_samples)
+            sum, zip(*datasets_train_valid_test_num_samples)
         )
 
         class DatasetBuilder:
-            ''' 
+            """
             This is for building individual dataset from each dataset file
-            '''
+            """
+
             @dlp.log
-            def __init__(self, prefix, corpus, data_impl, splits_string,
-                         num_samples, seq_length, seed, skip_warmup,
-                         return_doc_ids,
-                         data_cache_path=data_cache_path, name='train'):
+            def __init__(
+                self,
+                prefix,
+                corpus,
+                data_impl,
+                splits_string,
+                num_samples,
+                seq_length,
+                seed,
+                skip_warmup,
+                return_doc_ids,
+                data_cache_path=data_cache_path,
+                name="train",
+            ):
                 self.prefix = prefix
                 self.data_impl = data_impl
                 self.splits_string = splits_string
-                if name == 'train':
+                if name == "train":
                     self.num_samples = num_samples[0]
-                elif name == 'valid':
+                elif name == "valid":
                     self.num_samples = num_samples[1]
                 else:
                     self.num_samples = num_samples[2]
@@ -84,279 +113,413 @@ def __init__(self, prefix, corpus, data_impl, splits_string,
                 self.desc = prefix + f"{self.num_samples}" + f"{seq_length}" + f"{seed}"
                 self.build = False
                 self.corpus = corpus
+
             @dlp.log
             def Build(self):
-                self.dataset = _build_train_valid_test_datasets_single(self.prefix, self.data_impl, self.splits_string,
-                    self.num_samples_train_valid_test, self.seq_length, self.seed, self.skip_warmup, self.name, self.return_doc_ids, 
-                    data_cache_path=self.data_cache_path)
+                self.dataset = _build_train_valid_test_datasets_single(
+                    self.prefix,
+                    self.data_impl,
+                    self.splits_string,
+                    self.num_samples_train_valid_test,
+                    self.seq_length,
+                    self.seed,
+                    self.skip_warmup,
+                    self.name,
+                    self.return_doc_ids,
+                    data_cache_path=self.data_cache_path,
+                )
                 self.build = True
                 return self.dataset
 
         class BuildConcatDataset(torch.utils.data.Dataset):
             @dlp.log
-            def __init__(self, dataset_builders):
+            def __init__(self, dataset_builders, shuffle=False):
                 self.dataset_builders = dataset_builders
                 self.num_datasets = len(dataset_builders)
                 self.num_samples = np.sum([d.num_samples for d in dataset_builders])
-                self.indices=np.zeros((self.num_samples, 2), dtype=np.uint64)
-                self.desc="ConcatDataset:"
-                m = 0
+                self.indices = np.zeros((self.num_samples, 2), dtype=np.uint64)
+                self.desc = "ConcatDataset:"
+                # m = 0
                 num_samples_list = np.array([d.num_samples for d in dataset_builders])
                 self.num_samples = np.sum(num_samples_list)
+
                 def _build_indices():
                     start_time = time.time()
                     dataset_index = np.zeros(self.num_samples, dtype=np.int64)
                     dataset_sample_index = np.zeros(self.num_samples, dtype=np.int64)
-                    helpers.build_concat_indices(dataset_index, dataset_sample_index,
-                                                 num_samples_list, 
-                                                 self.num_datasets, 
-                                                 torch.distributed.get_rank()==0)
-                    print_rank_0('> elapsed time for building concat dataset indices: '
-                                 '{:.2f} (sec)'.format(time.time() - start_time))
+                    helpers.build_concat_indices(
+                        dataset_index,
+                        dataset_sample_index,
+                        num_samples_list,
+                        self.num_datasets,
+                        torch.distributed.get_rank() == 0,
+                    )
+                    log.debug(
+                        "> elapsed time for building concat dataset indices: "
+                        "{:.2f} (sec)".format(time.time() - start_time)
+                    )
                     return dataset_index, dataset_sample_index
-                
+
                 self.dataset_index, self.dataset_sample_index = _build_indices()
+                np_rng = np.random.RandomState(seed=dataset_builders[0].seed)
+                self.shuffle_index = np.arange(self.num_samples)
+                if shuffle:
+                    np_rng.shuffle(self.shuffle_index)
                 for i in range(self.num_datasets):
                     self.desc += dataset_builders[i].prefix + ","
 
-                self.desc += f"-{self.num_samples}" + f"-{dataset_builders[0].seq_length}" + f"{dataset_builders[0].seed}"
+                self.desc += (
+                    f"-{self.num_samples}"
+                    + f"-{dataset_builders[0].seq_length}"
+                    + f"{dataset_builders[0].seed}"
+                )
+
             def __len__(self):
                 return self.num_samples
 
             @dlp.log
             def __getitem__(self, idx):
-                if idx >= self.num_samples:
-                    print_rank_0(f"WARNING: index overflow encountered {idx} > {self.num_samples} for {self.dataset_builders[0].corpus}; will randomly pick one sample")
-                    id = np.random.randint(self.num_samples)
-                else:
-                    id = idx
-                i = self.dataset_index[idx]
-                j = self.dataset_sample_index[idx]
+                id_shuffle = self.shuffle_index[idx]
+                i = self.dataset_index[id_shuffle]
+                j = self.dataset_sample_index[id_shuffle]
                 if self.dataset_builders[i].build:
                     return self.dataset_builders[i].dataset[j]
                 else:
                     return self.dataset_builders[i].Build()[j]
-            
 
-        # Predetermine whether need to build the specific dataset or not. 
+        # Predetermine whether we need to build each specific dataset or not.
         start_time = time.time()
-        print_rank_0(" >>> Started building datasets in distributed way ... ")
+        log.debug(" >>> Started building datasets in distributed way ... ")
 
         a, b, c = [int(d) for d in splits_string.split(",")]
-        
+
         train_datasets = []
         valid_datasets = []
         test_datasets = []
         # Build individual datasets.
-
+        args = get_args()
         @dlp.log
-        def build_corpus_datasets(dataset_type='train'):
+        def build_corpus_datasets(dataset_type="train"):
             start_time = time.time()
-            print_rank_0(f" >>> Building {dataset_type} corpus datasets ...")
+            log.debug(f" >>> Building {dataset_type} corpus datasets ...")
             datasets = []
             corpus_builders = {}
             corpus_weights = {}
             for c in corpus_list:
                 corpus_builders[c] = []
                 corpus_weights[c] = 0.0
-            dataset_builders = [DatasetBuilder(prefixes[i], corpuses[i], data_impl, splits_string,
-                                               datasets_train_valid_test_num_samples[i],
-                                               seq_length, seed, skip_warmup,
-                                               return_doc_ids,data_cache_path, dataset_type) for i in  range(len(weights))]
-            for i in range(torch.distributed.get_rank()//mpu.get_tensor_model_parallel_world_size(), len(weights),  torch.distributed.get_world_size()//mpu.get_tensor_model_parallel_world_size()):
+            dataset_builders = [
+                DatasetBuilder(
+                    prefixes[i],
+                    corpuses[i],
+                    data_impl,
+                    splits_string,
+                    datasets_train_valid_test_num_samples[i],
+                    seq_length,
+                    seed,
+                    skip_warmup,
+                    return_doc_ids,
+                    data_cache_path,
+                    dataset_type,
+                )
+                for i in range(len(weights))
+            ]
+            for i in range(
+                torch.distributed.get_rank()
+                // mpu.get_tensor_model_parallel_world_size(),
+                len(weights),
+                torch.distributed.get_world_size()
+                // mpu.get_tensor_model_parallel_world_size(),
+            ):
                 dataset_builders[i].Build()
-            print_rank_0(f" >>> Finished building individual datasets in {time.time() - start_time} seconds")
+            log.debug(
+                f" >>> Finished building individual datasets in {time.time() - start_time} seconds"
+            )
             start_concating_time = time.time()
             for i, d in zip(range(len(weights)), dataset_builders):
                 corpus_builders[d.corpus].append(d)
                 corpus_weights[d.corpus] += weights[i]
             total = 0
-            print_rank_0(" > number of samples for each corpus ")
-            corpus_weights_achieved={}
+            log.debug(" > number of samples for each corpus ")
+            corpus_weights_achieved = {}
             for c in corpus_list:
-                datasets.append(BuildConcatDataset(corpus_builders[c]))
+                datasets.append(BuildConcatDataset(corpus_builders[c], args.shuffle_sample))
                 total += datasets[-1].num_samples
-                corpus_weights_achieved[c] =  float(datasets[-1].num_samples)/train_num_samples                
-                print_rank_0(f"    {c}: {datasets[-1].num_samples} w={corpus_weights_achieved[c]} (expected: {corpus_weights[c]})")
-            
-            print_rank_0(f" > total number of samples: {total}")
-            print_rank_0(f" >>> Finished concatenating datasets in {time.time() - start_concating_time} seconds")
-            print_rank_0(f" >>> Finished building {dataset_type} corpus datasets in {time.time() - start_time} seconds")
+                corpus_weights_achieved[c] = (
+                    float(datasets[-1].num_samples) / train_num_samples
+                )
+                log.debug(
+                    f"    {c}: {datasets[-1].num_samples} w={corpus_weights_achieved[c]} (expected: {corpus_weights[c]})"
+                )
+
+            log.debug(f" > total number of samples: {total}")
+            log.debug(
+                f" >>> Finished concatenating datasets in {time.time() - start_concating_time} seconds"
+            )
+            log.debug(
+                f" >>> Finished building {dataset_type} corpus datasets in {time.time() - start_time} seconds"
+            )
             return datasets, [corpus_weights_achieved[c] for c in corpus_list]
 
+        train_weights = None
         if a > 0:
-            train_datasets, train_weights = build_corpus_datasets('train')
-
+            train_datasets, train_weights = build_corpus_datasets("train")
+        valid_weights = None
         if b > 0:
-            valid_datasets, valid_weights = build_corpus_datasets('valid')
-            
-        if c > 0:            
-            test_datasets, test_weights = build_corpus_datasets('test')
+            valid_datasets, valid_weights = build_corpus_datasets("valid")
+        test_weights = None
+        if c > 0:
+            test_datasets, test_weights = build_corpus_datasets("test")
 
         # This barrier is critical to make sure that all the datasets are built once
         # and the metadata were written to the cache folder before other ranks touch them
-        print_rank_0(f" >>> Rank 0 - finished building datasets in {time.time() - start_time} seconds")
+        log.debug(
+            f" >>> Rank 0 - finished building datasets in {time.time() - start_time} seconds"
+        )
         torch.distributed.barrier(group=mpu.get_data_parallel_group())
         torch.distributed.barrier(group=mpu.get_pipeline_model_parallel_group())
         torch.distributed.barrier(group=mpu.get_data_parallel_group())
-        print_rank_0(f" >>> Finished building datasets (all ranks) in distributed way in {time.time() - start_time} seconds")
-        print_rank_0(f" >>> Starting to build BlendableDataset")
+        log.debug(
+            f" >>> Finished building datasets (all ranks) in distributed way in {time.time() - start_time} seconds"
+        )
+        log.debug(" >>> Starting to build BlendableDataset")
         # Blend.
         start_time = time.time()
         blending_train_dataset = None
-        if train_datasets:
-            blending_train_dataset = BlendableDataset(train_datasets, train_weights, train_num_samples,
-                                                      data_cache_path=data_cache_path)
+        if train_datasets and train_weights:
+            blending_train_dataset = BlendableDataset(
+                train_datasets,
+                train_weights,
+                train_num_samples,
+                data_cache_path=data_cache_path,
+            )
         blending_valid_dataset = None
-        if valid_datasets:
-            blending_valid_dataset = BlendableDataset(valid_datasets, valid_weights, valid_num_samples,
-                                                      data_cache_path=data_cache_path)
+        if valid_datasets and valid_weights:
+            blending_valid_dataset = BlendableDataset(
+                valid_datasets,
+                valid_weights,
+                valid_num_samples,
+                data_cache_path=data_cache_path,
+            )
         blending_test_dataset = None
-        if test_datasets:
-            blending_test_dataset = BlendableDataset(test_datasets, test_weights, test_num_samples,
-                                                     data_cache_path=data_cache_path)
+        if test_datasets and test_weights:
+            blending_test_dataset = BlendableDataset(
+                test_datasets,
+                test_weights,
+                test_num_samples,
+                data_cache_path=data_cache_path,
+            )
         end_time = time.time()
-        print_rank_0(f" >>> Finished building BlendableDataset in {end_time - start_time} seconds")
-        return (blending_train_dataset, blending_valid_dataset,
-                blending_test_dataset)
+        log.debug(
+            f" >>> Finished building BlendableDataset in {end_time - start_time} seconds"
+        )
+        return (blending_train_dataset, blending_valid_dataset, blending_test_dataset)
 
     else:
-        print_rank_0("Separate data paths provided for train, valid & test. Split string will be ignored.")
+        log.debug(
+            "Separate data paths provided for train, valid & test. Split string will be ignored."
+        )
 
         train_dataset, valid_dataset, test_dataset = None, None, None
         # Single dataset.
         if train_data_prefix is not None:
-            train_dataset = build_dataset("train", train_data_prefix, data_impl,
-                                          splits_string,
-                                          train_valid_test_num_samples[0],
-                                          seq_length, seed, skip_warmup,
-                                          data_cache_path=data_cache_path)
+            train_dataset = build_dataset(
+                "train",
+                train_data_prefix,
+                data_impl,
+                splits_string,
+                train_valid_test_num_samples[0],
+                seq_length,
+                seed,
+                skip_warmup,
+                data_cache_path=data_cache_path,
+            )
 
         if valid_data_prefix is not None:
-            valid_dataset = build_dataset("valid", valid_data_prefix, data_impl,
-                                          splits_string,
-                                          train_valid_test_num_samples[1],
-                                          seq_length, seed, False,
-                                          data_cache_path=data_cache_path)
-
+            valid_dataset = build_dataset(
+                "valid",
+                valid_data_prefix,
+                data_impl,
+                splits_string,
+                train_valid_test_num_samples[1],
+                seq_length,
+                seed,
+                False,
+                data_cache_path=data_cache_path,
+            )
 
         if test_data_prefix is not None:
-            test_dataset = build_dataset("test", test_data_prefix, data_impl,
-                                         splits_string,
-                                         train_valid_test_num_samples[2],
-                                         seq_length, seed, False,
-                                         data_cache_path=data_cache_path)
+            test_dataset = build_dataset(
+                "test",
+                test_data_prefix,
+                data_impl,
+                splits_string,
+                train_valid_test_num_samples[2],
+                seq_length,
+                seed,
+                False,
+                data_cache_path=data_cache_path,
+            )
 
         return (train_dataset, valid_dataset, test_dataset)
 
+
 @dlp.log
-def _build_train_valid_test_datasets(data_prefix, data_impl, splits_string,
-                                     train_valid_test_num_samples,
-                                     seq_length, seed, skip_warmup,
-                                     return_doc_ids=False, *,
-                                     data_cache_path=None):
+def _build_train_valid_test_datasets(
+    data_prefix,
+    data_impl,
+    splits_string,
+    train_valid_test_num_samples,
+    seq_length,
+    seed,
+    skip_warmup,
+    return_doc_ids=False,
+    *,
+    data_cache_path=None,
+):
     """Build train, valid, and test datasets."""
 
     # Indexed dataset.
-    indexed_dataset = get_indexed_dataset_(data_prefix,
-                                           data_impl,
-                                           skip_warmup)
+    indexed_dataset = get_indexed_dataset_(data_prefix, data_impl, skip_warmup)
 
     total_num_of_documents = indexed_dataset.sizes.shape[0]
     splits = get_train_valid_test_split_(splits_string, total_num_of_documents)
 
     # Print stats about the splits.
-    print_rank_0(' > dataset split:')
+    log.debug(" > dataset split:")
 
     def print_split_stats(name, index):
-        print_rank_0('    {}:'.format(name))
-        print_rank_0('     document indices in [{}, {}) total of {} '
-                     'documents'.format(splits[index], splits[index + 1],
-                                        splits[index + 1] - splits[index]))
-    print_split_stats('train', 0)
-    print_split_stats('validation', 1)
-    print_split_stats('test', 2)
+        log.debug("    {}:".format(name))
+        log.debug(
+            "     document indices in [{}, {}) total of {} " "documents".format(
+                splits[index], splits[index + 1], splits[index + 1] - splits[index]
+            )
+        )
+
+    print_split_stats("train", 0)
+    print_split_stats("validation", 1)
+    print_split_stats("test", 2)
 
     def build_dataset(index, name):
         dataset = None
         if splits[index + 1] > splits[index]:
-            documents = np.arange(start=splits[index], stop=splits[index + 1],
-                                  step=1, dtype=np.int32)
-            dataset = GPTDataset(name, data_prefix, documents, indexed_dataset,
-                                 splits_string,
-                                 train_valid_test_num_samples[index],
-                                 seq_length, seed,
-                                 return_doc_ids,
-                                 data_cache_path=data_cache_path)
+            documents = np.arange(
+                start=splits[index], stop=splits[index + 1], step=1, dtype=np.int32
+            )
+            dataset = GPTDataset(
+                name,
+                data_prefix,
+                documents,
+                indexed_dataset,
+                splits_string,
+                train_valid_test_num_samples[index],
+                seq_length,
+                seed,
+                return_doc_ids,
+                data_cache_path=data_cache_path,
+            )
         return dataset
 
-    train_dataset = build_dataset(0, 'train')
-    valid_dataset = build_dataset(1, 'valid')
-    test_dataset = build_dataset(2, 'test')
+    train_dataset = build_dataset(0, "train")
+    valid_dataset = build_dataset(1, "valid")
+    test_dataset = build_dataset(2, "test")
 
     return (train_dataset, valid_dataset, test_dataset)
 
+
 @dlp.log
-def _build_train_valid_test_datasets_single(data_prefix, data_impl, splits_string,
-                            train_valid_test_num_samples,
-                            seq_length, seed, skip_warmup, name, 
-                            return_doc_ids=False, *,
-                            data_cache_path=None):
+def _build_train_valid_test_datasets_single(
+    data_prefix,
+    data_impl,
+    splits_string,
+    train_valid_test_num_samples,
+    seq_length,
+    seed,
+    skip_warmup,
+    name,
+    return_doc_ids=False,
+    *,
+    data_cache_path=None,
+):
     """Build train, valid, and test datasets."""
 
     # Each rank print out information
-    print_rank_0(f" >> building dataset for {data_prefix}")
+    log.debug(f" >> building dataset for {data_prefix}")
     # Indexed dataset.
-    indexed_dataset = get_indexed_dataset_(data_prefix,
-                                           data_impl,
-                                           skip_warmup)
+    indexed_dataset = get_indexed_dataset_(data_prefix, data_impl, skip_warmup)
 
     total_num_of_documents = indexed_dataset.sizes.shape[0]
     splits = get_train_valid_test_split_(splits_string, total_num_of_documents)
 
     # Print stats about the splits.
-    print_rank_0(' > dataset split:')
+    log.debug(" > dataset split:")
 
     def print_split_stats(name, index):
-        print_rank_0('    {}:'.format(name))
-        print_rank_0('     document indices in [{}, {}) total of {} '
-                     'documents'.format(splits[index], splits[index + 1],
-                                        splits[index + 1] - splits[index]))
-    print_split_stats('train', 0)
-    print_split_stats('validation', 1)
-    print_split_stats('test', 2)
+        log.debug("    {}:".format(name))
+        log.debug(
+            "     document indices in [{}, {}) total of {} " "documents".format(
+                splits[index], splits[index + 1], splits[index + 1] - splits[index]
+            )
+        )
+
+    print_split_stats("train", 0)
+    print_split_stats("validation", 1)
+    print_split_stats("test", 2)
 
     def build_dataset(index, name):
         dataset = None
         if splits[index + 1] > splits[index]:
-            documents = np.arange(start=splits[index], stop=splits[index + 1],
-                                  step=1, dtype=np.int32)
-            dataset = GPTDataset(name, data_prefix, documents, indexed_dataset,
-                                 splits_string,
-                                 train_valid_test_num_samples[index],
-                                 seq_length, seed,
-                                 return_doc_ids,
-                                 data_cache_path=data_cache_path)
+            documents = np.arange(
+                start=splits[index], stop=splits[index + 1], step=1, dtype=np.int32
+            )
+            dataset = GPTDataset(
+                name,
+                data_prefix,
+                documents,
+                indexed_dataset,
+                splits_string,
+                train_valid_test_num_samples[index],
+                seq_length,
+                seed,
+                return_doc_ids,
+                data_cache_path=data_cache_path,
+            )
         return dataset
-    if name.find("train")!=-1:
-        return build_dataset(0, 'train')
-    if name.find("valid")!=-1:
-        return build_dataset(1, 'valid')
-    if name.find("test")!=-1:
-        return build_dataset(2, 'test')
+
+    if name.find("train") != -1:
+        return build_dataset(0, "train")
+    if name.find("valid") != -1:
+        return build_dataset(1, "valid")
+    if name.find("test") != -1:
+        return build_dataset(2, "test")
+
 
 @dlp.log
-def build_dataset(dataset_name, data_prefix, data_impl,
-                  splits_string, num_samples,
-                  seq_length, seed, skip_warmup,
-                  *,
-                  data_cache_path=None):
+def build_dataset(
+    dataset_name,
+    data_prefix,
+    data_impl,
+    splits_string,
+    num_samples,
+    seq_length,
+    seed,
+    skip_warmup,
+    *,
+    data_cache_path=None,
+):
     dataset = None
     if len(data_prefix) == 1:
-        dataset = _build_dataset(dataset_name, data_prefix[0], data_impl,
-                                 splits_string, num_samples, seq_length,
-                                 seed, skip_warmup,
-                                 data_cache_path=data_cache_path)
+        dataset = _build_dataset(
+            dataset_name,
+            data_prefix[0],
+            data_impl,
+            splits_string,
+            num_samples,
+            seq_length,
+            seed,
+            skip_warmup,
+            data_cache_path=data_cache_path,
+        )
     else:
         # Blending dataset.
         # Parse the values.
@@ -367,73 +530,108 @@ def build_dataset(dataset_name, data_prefix, data_impl,
         # Build individual datasets.
         datasets = []
         for i in range(len(prefixes)):
-            ds = _build_dataset(dataset_name, prefixes[i], data_impl,
-                                splits_string, dataset_num_samples[i],
-                                seq_length, seed, skip_warmup,
-                                data_cache_path=data_cache_path)
+            ds = _build_dataset(
+                dataset_name,
+                prefixes[i],
+                data_impl,
+                splits_string,
+                dataset_num_samples[i],
+                seq_length,
+                seed,
+                skip_warmup,
+                data_cache_path=data_cache_path,
+            )
             if ds:
                 datasets.append(ds)
 
         if datasets:
-            dataset = BlendableDataset(datasets, weights, num_samples,
-                                       data_cache_path=data_cache_path)
+            dataset = BlendableDataset(
+                datasets, weights, num_samples, data_cache_path=data_cache_path
+            )
 
     return dataset
 
+
 @dlp.log
-def _build_dataset(dataset_name, data_prefix, data_impl, splits_string,
-                   num_samples, seq_length, seed, skip_warmup,
-                   *,
-                   data_cache_path=None):
+def _build_dataset(
+    dataset_name,
+    data_prefix,
+    data_impl,
+    splits_string,
+    num_samples,
+    seq_length,
+    seed,
+    skip_warmup,
+    *,
+    data_cache_path=None,
+):
     """
     Build dataset. This method is called when individual
     train, valid, test datasets are provided
     """
 
     # Indexed dataset.
-    indexed_dataset = get_indexed_dataset_(data_prefix,
-                                           data_impl,
-                                           skip_warmup)
+    indexed_dataset = get_indexed_dataset_(data_prefix, data_impl, skip_warmup)
 
     total_num_of_documents = indexed_dataset.sizes.shape[0]
 
-    print_rank_0('    {}:'.format(dataset_name))
-    print_rank_0('     document indices in [0, {}) total of {} '
-                 'documents'.format(total_num_of_documents, total_num_of_documents))
-
-    documents = np.arange(start=0, stop=total_num_of_documents,
-                        step=1, dtype=np.int32)
-
-    dataset = GPTDataset(dataset_name, data_prefix, documents, indexed_dataset,
-                         splits_string, num_samples, seq_length, seed,
-                         data_cache_path=data_cache_path)
+    log.debug("    {}:".format(dataset_name))
+    log.debug(
+        "     document indices in [0, {}) total of {} " "documents".format(
+            total_num_of_documents, total_num_of_documents
+        )
+    )
+
+    documents = np.arange(start=0, stop=total_num_of_documents, step=1, dtype=np.int32)
+
+    dataset = GPTDataset(
+        dataset_name,
+        data_prefix,
+        documents,
+        indexed_dataset,
+        splits_string,
+        num_samples,
+        seq_length,
+        seed,
+        data_cache_path=data_cache_path,
+    )
 
     return dataset
 
+
 @dlp.log
 def get_indexed_dataset_(data_prefix, data_impl, skip_warmup):
     """Build indexed dataset."""
-    print_rank_0(' > building dataset index ...')
+    log.debug(" > building dataset index ...")
 
     start_time = time.time()
-    indexed_dataset = make_indexed_dataset(data_prefix,
-                                           data_impl,
-                                           skip_warmup)
-    print_rank_0(' > finished creating indexed dataset in {:4f} '
-                 'seconds'.format(time.time() - start_time))
-    print_rank_0('    number of documents: {}'.format(
-        indexed_dataset.sizes.shape[0]))
+    indexed_dataset = make_indexed_dataset(data_prefix, data_impl, skip_warmup)
+    log.debug(
+        " > finished creating indexed dataset in {:4f} " "seconds".format(
+            time.time() - start_time
+        )
+    )
+    log.debug("    number of documents: {}".format(indexed_dataset.sizes.shape[0]))
 
     return indexed_dataset
 
 
 class GPTDataset(torch.utils.data.Dataset):
     @dlp.log
-    def __init__(self, name, data_prefix, documents, indexed_dataset,
-                 splits_string, num_samples, seq_length, seed,
-                 return_doc_ids=False, *,
-                 data_cache_path=None):
-
+    def __init__(
+        self,
+        name,
+        data_prefix,
+        documents,
+        indexed_dataset,
+        splits_string,
+        num_samples,
+        seq_length,
+        seed,
+        return_doc_ids=False,
+        *,
+        data_cache_path=None,
+    ):
         self.name = name
         self.indexed_dataset = indexed_dataset
         self.return_doc_ids = return_doc_ids
@@ -443,20 +641,29 @@ def __init__(self, name, data_prefix, documents, indexed_dataset,
         assert np.max(documents) < indexed_dataset.sizes.shape[0]
 
         # Build index mappings.
-        self.doc_idx, self.sample_idx, self.shuffle_idx, self.desc, self.desc_hash = \
-            _build_index_mappings(self.name, data_prefix,
-                                  documents, self.indexed_dataset.sizes,
-                                  splits_string, num_samples, seq_length, seed,
-                                  data_cache_path=data_cache_path)
-
+        self.doc_idx, self.sample_idx, self.shuffle_idx, self.desc, self.desc_hash = (
+            _build_index_mappings(
+                self.name,
+                data_prefix,
+                documents,
+                self.indexed_dataset.sizes,
+                splits_string,
+                num_samples,
+                seq_length,
+                seed,
+                data_cache_path=data_cache_path,
+            )
+        )
 
     def __len__(self):
         # -1 is due to data structure used to retieve the index:
         #    sample i --> [sample_idx[i], sample_idx[i+1])
         return self.sample_idx.shape[0] - 1
+
     @dlp.log
     def __getitem__(self, idx):
         args = get_args()
+        assert args is not None
         orig_idx = idx
         # Get the shuffled index.
         try:
@@ -465,21 +672,24 @@ def __getitem__(self, idx):
             if is_rank_0():
                 import json
                 from rich import print_json
+
                 print(exc)
                 print(
-                    '\n'.join(
-                        ['-------------------------------------------------',
-                         f'Trying to access {idx=} from self.shuffle_idx,',
-                         f'but {len(self.shuffle_idx)=}',
-                         '-------------------------------------------------']
+                    "\n".join(
+                        [
+                            "-------------------------------------------------",
+                            f"Trying to access {idx=} from self.shuffle_idx,",
+                            f"but {len(self.shuffle_idx)=}",
+                            "-------------------------------------------------",
+                        ]
                     )
                 )
                 print_json(
                     json.dumps(
                         {
-                            'doc_idx': len(self.doc_idx),
-                            'sample_idx': len(self.sample_idx),
-                            'shuffle_idx': len(self.shuffle_idx),
+                            "doc_idx": len(self.doc_idx),
+                            "sample_idx": len(self.sample_idx),
+                            "shuffle_idx": len(self.shuffle_idx),
                         },
                         indent=4,
                     )
@@ -493,45 +703,57 @@ def __getitem__(self, idx):
         doc_ids = []
         if doc_index_f == doc_index_l:
             doc_ids.append(self.doc_idx[doc_index_f])
-            sample = self.indexed_dataset.get(self.doc_idx[doc_index_f],
-                                              offset=offset_f,
-                                              length=offset_l - offset_f + 1)
+            sample = self.indexed_dataset.get(
+                self.doc_idx[doc_index_f],
+                offset=offset_f,
+                length=offset_l - offset_f + 1,
+            )
         else:
             # Otherwise, get the rest of the initial document.
             doc_ids.append(self.doc_idx[doc_index_f])
-            sample_list = [self.indexed_dataset.get(self.doc_idx[doc_index_f],
-                                                    offset=offset_f)]
+            sample_list = [
+                self.indexed_dataset.get(self.doc_idx[doc_index_f], offset=offset_f)
+            ]
             # Loop over all in between documents and add the entire document.
             for i in range(doc_index_f + 1, doc_index_l):
                 doc_ids.append(self.doc_idx[i])
                 sample_list.append(self.indexed_dataset.get(self.doc_idx[i]))
             # And finally add the relevant portion of last document.
             doc_ids.append(self.doc_idx[doc_index_l])
-            sample_list.append(self.indexed_dataset.get(
-                self.doc_idx[doc_index_l],
-                length=offset_l + 1))
+            sample_list.append(
+                self.indexed_dataset.get(self.doc_idx[doc_index_l], length=offset_l + 1)
+            )
             sample = np.concatenate(sample_list)
 
-        text_name = 'text'
+        text_name = "text"
         if args.use_dataset_only:
-            text_name = 'input_ids'
+            text_name = "input_ids"
         sample_dict = {text_name: np.array(sample, dtype=np.int64)}
         if args.return_data_index:
-            sample_dict.update({'index': np.array([orig_idx], dtype=np.int64)})
+            sample_dict.update({"index": np.array([orig_idx], dtype=np.int64)})
 
-        if self.return_doc_ids: # for retro preprocessing
-            sample_dict.update({'doc_ids': np.array(doc_ids, dtype=np.int64)})
+        if self.return_doc_ids:  # for retro preprocessing
+            sample_dict.update({"doc_ids": np.array(doc_ids, dtype=np.int64)})
 
         if args.use_dataset_only:
-            sample_dict.update({'labels': np.array(sample, dtype=np.int64)})
+            sample_dict.update({"labels": np.array(sample, dtype=np.int64)})
 
         return sample_dict
 
+
 @dlp.log
-def _build_index_mappings(name, data_prefix, documents, sizes,
-                          splits_string, num_samples, seq_length, seed,
-                          *,
-                          data_cache_path):
+def _build_index_mappings(
+    name,
+    data_prefix,
+    documents,
+    sizes,
+    splits_string,
+    num_samples,
+    seq_length,
+    seed,
+    *,
+    data_cache_path,
+):
     """Build doc-idx, sample-idx, and shuffle-idx.
     doc-idx: is an array (ordered) of documents to be used in training.
     sample-idx: is the start document index and document offset for each
@@ -539,10 +761,11 @@ def _build_index_mappings(name, data_prefix, documents, sizes,
     shuffle-idx: maps the sample index into a random index into sample-idx.
     """
     args = get_args()
+    assert args is not None
     # Number of tokens in each epoch and number of required epochs.
     tokens_per_epoch = _num_tokens(documents, sizes)
     num_epochs = _num_epochs(tokens_per_epoch, seq_length, num_samples)
-    if args.train_data_exact_num_epochs is not None and name == 'train':
+    if args.train_data_exact_num_epochs is not None and name == "train":
         num_epochs = args.train_data_exact_num_epochs
 
     # rng state
@@ -557,13 +780,13 @@ def _build_index_mappings(name, data_prefix, documents, sizes,
     desc += f"Sequence length {seq_length}\n"
     desc += f"Random seed {seed}\n"
     desc += f"Split {splits_string}\n"
-    desc_hash = hashlib.md5(desc.encode('utf-8')).hexdigest()
+    desc_hash = hashlib.md5(desc.encode("utf-8")).hexdigest()
     desc_filename = desc_hash + ".dsc"
-    doc_idx_filename = desc_hash + '_doc_idx.npy'
-    sample_idx_filename = desc_hash + '_sample_idx.npy'
-    shuffle_idx_filename = desc_hash + '_shuffle_idx.npy'
+    doc_idx_filename = desc_hash + "_doc_idx.npy"
+    sample_idx_filename = desc_hash + "_sample_idx.npy"
+    shuffle_idx_filename = desc_hash + "_shuffle_idx.npy"
 
-    if name == 'train':
+    if name == "train":
         # force to use certain index files
         if args.train_desc_path is not None:
             desc_filename = args.train_desc_path
@@ -578,15 +801,15 @@ def _build_index_mappings(name, data_prefix, documents, sizes,
     # duplication, then look in data-cache-path if specified,
     # If nothing is found, use the last path looked in
     build_indices = True
-    prefixes = [os.path.join(os.path.dirname(data_prefix), 'index-cache')]
+    prefixes = [os.path.join(os.path.dirname(data_prefix), "index-cache")]
     if data_cache_path is not None:
         prefixes.append(data_cache_path)
     for prefix in prefixes:
         idx_path = {
-            'desc': os.path.join(prefix, desc_filename),
-            'doc': os.path.join(prefix, doc_idx_filename),
-            'sample': os.path.join(prefix, sample_idx_filename),
-            'shuffle': os.path.join(prefix, shuffle_idx_filename)
+            "desc": os.path.join(prefix, desc_filename),
+            "doc": os.path.join(prefix, doc_idx_filename),
+            "sample": os.path.join(prefix, sample_idx_filename),
+            "shuffle": os.path.join(prefix, shuffle_idx_filename),
         }
         for f in idx_path.values():
             if not os.path.isfile(f):
@@ -595,15 +818,17 @@ def _build_index_mappings(name, data_prefix, documents, sizes,
             # Found our files!
             build_indices = False
             break
-    data_cache_dir = os.path.dirname(idx_path['desc'])
+    data_cache_dir = os.path.dirname(idx_path["desc"])
     data_cache_success = True
 
     # Build the indexed mapping if not exist.
     if build_indices:
-        # Since this function will be called by all the rank in the very beginning. Therefore, we assume that all the 
-        # ranks will first create the document files, and then read it. 
+        # Since this function will be called by all the rank in the very beginning. Therefore, we assume that all the
+        # ranks will first create the document files, and then read it.
         # There will not be contension effects going on either
-        print_rank_0(f" > WARNING: could not find index map files, building on rank {torch.distributed.get_rank()}")
+        log.warning(
+            f" > WARNING: could not find index map files, building on rank {torch.distributed.get_rank()}"
+        )
 
         # For the last epoch, decide whether include the entire epoch
         # in the global shuffle or not.
@@ -612,64 +837,80 @@ def _build_index_mappings(name, data_prefix, documents, sizes,
         # not mean anything.
         if num_epochs == 1:
             separate_last_epoch = False
-            print_rank_0(' > only one epoch required, setting '
-                  'separate_last_epoch to False')
+            log.debug(
+                " > only one epoch required, setting " "separate_last_epoch to False"
+            )
 
         else:
             # Get the number of samples for the last epoch
             num_samples_from_epochs_minus_one = (
-                (num_epochs - 1) * tokens_per_epoch - 1) // seq_length
-            last_epoch_num_samples = num_samples - \
-                                     num_samples_from_epochs_minus_one
-            assert last_epoch_num_samples >= 0, \
-                'last epoch number of samples should be non-negative.'
+                (num_epochs - 1) * tokens_per_epoch - 1
+            ) // seq_length
+            last_epoch_num_samples = num_samples - num_samples_from_epochs_minus_one
+            assert (
+                last_epoch_num_samples >= 0
+            ), "last epoch number of samples should be non-negative."
             num_samples_per_epoch = (tokens_per_epoch - 1) // seq_length
-            assert last_epoch_num_samples <= (num_samples_per_epoch + 1), \
-                'last epoch number of samples exceeded max value.'
+            assert last_epoch_num_samples <= (
+                num_samples_per_epoch + 1
+            ), "last epoch number of samples exceeded max value."
             # If we have less than 80% of the samples for the last epoch,
             # seperate out the epoch and treat it differently.
             # Note: the 80% number is just based on common sense and can
             # be adjusted if needed.
-            separate_last_epoch = (last_epoch_num_samples <
-                                   int(0.80 * num_samples_per_epoch))
+            separate_last_epoch = last_epoch_num_samples < int(
+                0.80 * num_samples_per_epoch
+            )
             if separate_last_epoch:
-                string = ' > last epoch number of samples ({}) is smaller '\
-                         'than 80% of number of samples per epoch ({}), '\
-                         'setting separate_last_epoch to True'
+                string = (
+                    " > last epoch number of samples ({}) is smaller "
+                    "than 80% of number of samples per epoch ({}), "
+                    "setting separate_last_epoch to True"
+                )
             else:
-                string = ' > last epoch number of samples ({}) is larger '\
-                         'than 80% of number of samples per epoch ({}), '\
-                         'setting separate_last_epoch to False'
-            print_rank_0(string.format(last_epoch_num_samples,
-                                num_samples_per_epoch))
-
+                string = (
+                    " > last epoch number of samples ({}) is larger "
+                    "than 80% of number of samples per epoch ({}), "
+                    "setting separate_last_epoch to False"
+                )
+            log.debug(string.format(last_epoch_num_samples, num_samples_per_epoch))
 
         try:
             os.makedirs(data_cache_dir, exist_ok=True)
 
             # description
-            with open(idx_path['desc'], 'wt') as fd:
+            with open(idx_path["desc"], "wt") as fd:
                 fd.write(desc)
 
             # doc-idx.
             start_time = time.time()
-            doc_idx = _build_doc_idx(documents, num_epochs, np_rng,
-                                     separate_last_epoch)
-            np.save(idx_path['doc'], doc_idx, allow_pickle=True)
-            print_rank_0(' > elasped time to build and save doc-idx mapping '
-                         '(seconds): {:4f}'.format(time.time() - start_time))
+            doc_idx = _build_doc_idx(documents, num_epochs, np_rng, separate_last_epoch)
+            np.save(idx_path["doc"], doc_idx, allow_pickle=True)
+            log.debug(
+                " > elasped time to build and save doc-idx mapping "
+                "(seconds): {:4f}".format(time.time() - start_time)
+            )
             # sample-idx.
             start_time = time.time()
             # Use C++ implementation for speed.
             # First compile and then import.
             from megatron.data import helpers
+
             assert doc_idx.dtype == np.int32
             assert sizes.dtype == np.int32
-            sample_idx = helpers.build_sample_idx(sizes, doc_idx, seq_length,
-                                                  num_epochs, tokens_per_epoch, torch.distributed.get_rank()==0)
-            np.save(idx_path['sample'], sample_idx, allow_pickle=True)
-            print_rank_0(' > elasped time to build and save sample-idx mapping '
-                         '(seconds): {:4f}'.format(time.time() - start_time))
+            sample_idx = helpers.build_sample_idx(
+                sizes,
+                doc_idx,
+                seq_length,
+                num_epochs,
+                tokens_per_epoch,
+                torch.distributed.get_rank() == 0,
+            )
+            np.save(idx_path["sample"], sample_idx, allow_pickle=True)
+            log.debug(
+                " > elasped time to build and save sample-idx mapping "
+                "(seconds): {:4f}".format(time.time() - start_time)
+            )
             # shuffle-idx.
             start_time = time.time()
             # -1 is due to data structure used to retieve the index:
@@ -678,35 +919,46 @@ def _build_index_mappings(name, data_prefix, documents, sizes,
                 num_samples_ = num_samples_from_epochs_minus_one
             else:
                 num_samples_ = sample_idx.shape[0] - 1
-            shuffle_idx = _build_shuffle_idx(num_samples_,
-                                             sample_idx.shape[0] - 1, np_rng)
-            np.save(idx_path['shuffle'], shuffle_idx, allow_pickle=True)
-            print_rank_0(' > elasped time to build and save shuffle-idx mapping'
-                         ' (seconds): {:4f}'.format(time.time() - start_time))
+            shuffle_idx = _build_shuffle_idx(
+                num_samples_, sample_idx.shape[0] - 1, np_rng
+            )
+            np.save(idx_path["shuffle"], shuffle_idx, allow_pickle=True)
+            log.debug(
+                " > elasped time to build and save shuffle-idx mapping"
+                " (seconds): {:4f}".format(time.time() - start_time)
+            )
         except OSError:
-            print(f'There was an error trying to create the data cache directory ({data_cache_dir})')
-            print('or a file in it. This defaults to a directory "index-cache" within the directory')
-            print('the data files are in and can be set with the --data-cache-path argument. Please')
-            print('ensure you have write access to this directory or specify one that you do have')
-            print('write access to.')
+            print(
+                f"There was an error trying to create the data cache directory ({data_cache_dir})"
+            )
+            print(
+                'or a file in it. This defaults to a directory "index-cache" within the directory'
+            )
+            print(
+                "the data files are in and can be set with the --data-cache-path argument. Please"
+            )
+            print(
+                "ensure you have write access to this directory or specify one that you do have"
+            )
+            print("write access to.")
             data_cache_success = False
 
     # Load mappings.
     start_time = time.time()
-    print_rank_0(f" > loading doc-idx mapping from {idx_path['doc']}")
-    doc_idx = np.load(idx_path['doc'], allow_pickle=True, mmap_mode='r')
+    log.debug(f" > loading doc-idx mapping from {idx_path['doc']}")
+    doc_idx = np.load(idx_path["doc"], allow_pickle=True, mmap_mode="r")
 
-    print_rank_0(f" > loading sample-idx mapping from {idx_path['sample']}")
-    sample_idx = np.load(idx_path['sample'], allow_pickle=True, mmap_mode='r')
+    log.debug(f" > loading sample-idx mapping from {idx_path['sample']}")
+    sample_idx = np.load(idx_path["sample"], allow_pickle=True, mmap_mode="r")
 
-    print_rank_0(f" > loading shuffle-idx mapping from {idx_path['shuffle']}")
-    shuffle_idx = np.load(idx_path['shuffle'], allow_pickle=True, mmap_mode='r')
+    log.debug(f" > loading shuffle-idx mapping from {idx_path['shuffle']}")
+    shuffle_idx = np.load(idx_path["shuffle"], allow_pickle=True, mmap_mode="r")
 
-    print_rank_0('    loaded indexed file in {:3.3f} seconds'.format(
-        time.time() - start_time))
-    print_rank_0('    total number of samples: {}'.format(
-        sample_idx.shape[0]))
-    print_rank_0('    total number of epochs: {}'.format(num_epochs))
+    log.debug(
+        "    loaded indexed file in {:3.3f} seconds".format(time.time() - start_time)
+    )
+    log.debug("    total number of samples: {}".format(sample_idx.shape[0]))
+    log.debug("    total number of epochs: {}".format(num_epochs))
 
     return doc_idx, sample_idx, shuffle_idx, desc, desc_hash
 
@@ -730,25 +982,26 @@ def _num_epochs(tokens_per_epoch, seq_length, num_samples):
         if ((total_tokens - 1) // seq_length) >= num_samples:
             return num_epochs
 
+
 @dlp.log
 def _build_doc_idx(documents, num_epochs, np_rng, separate_last_epoch):
     """Build an array with length = number-of-epochs * number-of-dcuments.
     Each index is mapped to a corresponding document."""
     if not separate_last_epoch or num_epochs == 1:
-        doc_idx = np.mgrid[0:num_epochs, 0:len(documents)][1]
+        doc_idx = np.mgrid[0:num_epochs, 0 : len(documents)][1]
         doc_idx[:] = documents
         doc_idx = doc_idx.reshape(-1)
         doc_idx = doc_idx.astype(np.int32)
         np_rng.shuffle(doc_idx)
         return doc_idx
 
-    doc_idx_first = _build_doc_idx(documents, num_epochs-1, np_rng, False)
+    doc_idx_first = _build_doc_idx(documents, num_epochs - 1, np_rng, False)
     doc_idx_last = _build_doc_idx(documents, 1, np_rng, False)
     return np.concatenate((doc_idx_first, doc_idx_last))
 
+
 @dlp.log
-def _build_sample_idx(sizes, doc_idx, seq_length,
-                      num_epochs, tokens_per_epoch):
+def _build_sample_idx(sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch):
     """Sample index mapping is a 2D array with sizes
     [number-of-samples + 1, 2] where [..., 0] contains
     the index into `doc_idx` and [..., 1] is the
@@ -782,7 +1035,7 @@ def _build_sample_idx(sizes, doc_idx, seq_length,
             # Note that -1 here is for the same reason we have -1 in
             # `_num_epochs` calculations.
             if remaining_seq_length <= 0:
-                doc_offset += (remaining_seq_length + doc_length - 1)
+                doc_offset += remaining_seq_length + doc_length - 1
                 remaining_seq_length = 0
             else:
                 # Otherwise, start from the begining of the next document.
@@ -795,24 +1048,28 @@ def _build_sample_idx(sizes, doc_idx, seq_length,
 
     return sample_idx
 
+
 @dlp.log
 def _build_shuffle_idx(num_samples, total_size, np_rng):
     """Build the range [0, size) and shuffle."""
-    print_rank_0(' > building shuffle index with split [0, {}) and [{}, {}) '
-          '...'.format(num_samples, num_samples, total_size))
+    log.debug(
+        " > building shuffle index with split [0, {}) and [{}, {}) " "...".format(
+            num_samples, num_samples, total_size
+        )
+    )
 
     dtype_ = np.uint32
     if total_size >= (np.iinfo(np.uint32).max - 1):
         dtype_ = np.int64
 
-    shuffle_idx_first = np.arange(start=0, stop=num_samples,
-                                  step=1, dtype=dtype_)
+    shuffle_idx_first = np.arange(start=0, stop=num_samples, step=1, dtype=dtype_)
     np_rng.shuffle(shuffle_idx_first)
     if num_samples == total_size:
         return shuffle_idx_first
 
-    shuffle_idx_last = np.arange(start=num_samples, stop=total_size,
-                                 step=1, dtype=dtype_)
+    shuffle_idx_last = np.arange(
+        start=num_samples, stop=total_size, step=1, dtype=dtype_
+    )
     np_rng.shuffle(shuffle_idx_last)
 
     return np.concatenate((shuffle_idx_first, shuffle_idx_last))
diff --git a/megatron/data/indexed_dataset.py b/megatron/data/indexed_dataset.py
index ec2997f7b8d..e2a0c4751ff 100644
--- a/megatron/data/indexed_dataset.py
+++ b/megatron/data/indexed_dataset.py
@@ -16,7 +16,6 @@
 from functools import lru_cache
 import os
 
-# import logging
 import shutil
 import struct
 from itertools import accumulate
@@ -29,21 +28,6 @@
 
 log = get_logger(__name__)
 
-
-try:
-    import ezpz as ez
-    RANK = ez.get_rank()
-except Exception:
-    RANK = torch.distributed.get_rank()
-
-# NOTE: [logging]-----------------------------------------------------------
-# - Set logging level to "INFO" on RANK == 0, "CRITICAL" on all other ranks
-log = logging.getLogger(__name__)
-LOG_LEVEL = str(os.environ.get("LOG_LEVEL", "INFO")).upper()
-log.setLevel(LOG_LEVEL) if RANK == 0 else log.setLevel("CRITICAL")
-# --------------------------------------------------------------------------
-
-
 dlp = Profile("DATASET")
 
 

From 63b1901b6127cd71f5b54877fad211714499120f Mon Sep 17 00:00:00 2001
From: Sam Foreman <saforem2@gmail.com>
Date: Wed, 16 Oct 2024 11:24:09 -0500
Subject: [PATCH 19/31] Update `megatron/data/gpt_dataset.py`

---
 megatron/data/gpt_dataset.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/megatron/data/gpt_dataset.py b/megatron/data/gpt_dataset.py
index c412d02b31c..0a3d898d63f 100644
--- a/megatron/data/gpt_dataset.py
+++ b/megatron/data/gpt_dataset.py
@@ -168,6 +168,10 @@ def _build_indices():
                 for i in range(self.num_datasets):
                     self.desc += dataset_builders[i].prefix + ","
 
+                log.info(
+                    f"[BuildConcatDataset] Caught {shuffle=} across"
+                    f" {self.num_samples} samples"
+                )
                 self.desc += (
                     f"-{self.num_samples}"
                     + f"-{dataset_builders[0].seq_length}"

From 7ef26bf922262eb80ff58fa481dbc6b2ff84d5ad Mon Sep 17 00:00:00 2001
From: Sam Foreman <saforem2@gmail.com>
Date: Wed, 16 Oct 2024 11:24:37 -0500
Subject: [PATCH 20/31] Use `time.perf_counter` in
 `megatron/data/blendable_dataset.py`

---
 megatron/data/blendable_dataset.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/megatron/data/blendable_dataset.py b/megatron/data/blendable_dataset.py
index 979e9a174e7..ab164fdc489 100755
--- a/megatron/data/blendable_dataset.py
+++ b/megatron/data/blendable_dataset.py
@@ -41,7 +41,7 @@ def __init__(self, datasets, weights, size, *,
         # Build indicies.
         @dlp.log
         def _build_indices():
-            start_time = time.time()
+            start_time = time.perf_counter()
             dataset_index = np.zeros(self.size, dtype=np.int64)
             dataset_sample_index = np.zeros(self.size, dtype=np.int64)
 
@@ -77,14 +77,14 @@ def _build_indices():
                 dataset_index, dataset_sample_index = _build_indices()
                 try:
                     log.debug(" > saving index map files")
-                    start_time = time.time()
+                    start_time = time.perf_counter()
                     os.makedirs(os.path.dirname(index_path), exist_ok=True)
                     with open(desc_path, 'wt') as fd:
                         fd.write(desc)
                         np.save(index_path, dataset_index, allow_pickle=True)
                         np.save(sample_index_path, dataset_sample_index,
                                 allow_pickle=True)
-                    log.info(f" > finished saving index map files in {time.time() - start_time} seconds")
+                    log.info(f" > finished saving index map files in {time.perf_counter() - start_time} seconds")
                 except OSError:
                     print(f'There was an error trying to create the data cache directory ({data_cache_path})')
                     print('or a file in it. This is set with the --data-cache-path argument. Please')
@@ -108,14 +108,14 @@ def _build_indices():
             torch.distributed.barrier(group=mpu.get_pipeline_model_parallel_group())
             torch.distributed.barrier(group=mpu.get_data_parallel_group())
 
-            start_time = time.time()
+            start_time = time.perf_counter()
             log.info(f'> loading blendable dataset index: {index_path}')
             self.dataset_index = np.load(index_path, allow_pickle=True, mmap_mode='r')
             assert self.dataset_index.size == self.size
             log.info(f'> loading blendable dataset sample index: {sample_index_path}')
             self.dataset_sample_index = np.load(sample_index_path, allow_pickle=True, mmap_mode='r')
             assert self.dataset_sample_index.size == self.size
-            log.info(f'> finished loading in {time.time() - start_time} seconds')
+            log.info(f'> finished loading in {time.perf_counter() - start_time} seconds')
         else:
             self.dataset_index, self.dataset_sample_index = _build_indices()
 

From deb95cd7aa5f677c13b7bee4c0491c04d59d81dd Mon Sep 17 00:00:00 2001
From: Xinyu Lian <lian7@illinois.edu>
Date: Thu, 17 Oct 2024 15:02:26 -0500
Subject: [PATCH 21/31] fix init issue for silently ignoring the deepspeed
 config (#452)

---
 megatron/initialize.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/megatron/initialize.py b/megatron/initialize.py
index 538f7fc456f..90acf496ee0 100644
--- a/megatron/initialize.py
+++ b/megatron/initialize.py
@@ -185,6 +185,7 @@ def setup_deepspeed_random_and_activation_checkpointing(args):
 
     deepspeed.checkpointing.configure(
         mpu,
+        deepspeed_config=args.deepspeed_config,
         partition_activations=args.partition_activations,
         contiguous_checkpointing=args.contigious_checkpointing,
         num_checkpoints=num_layers,

From 68da2dbd0a1a1ab9500cd389a0cfd82d2032ebd6 Mon Sep 17 00:00:00 2001
From: Sam Foreman <saforem2@gmail.com>
Date: Thu, 17 Oct 2024 17:25:03 -0500
Subject: [PATCH 22/31] Update `ALCF/helpers.sh`

---
 ALCF/helpers.sh | 312 ++++++++++++++++++++++++++----------------------
 1 file changed, 167 insertions(+), 145 deletions(-)

diff --git a/ALCF/helpers.sh b/ALCF/helpers.sh
index 5df9a2c7a58..bc7753aa718 100644
--- a/ALCF/helpers.sh
+++ b/ALCF/helpers.sh
@@ -9,7 +9,7 @@
 #     ```bash
 #     $ git clone https://github.com/argonne-lcf/Megatron-DeepSpeed
 #     $ cd Megatron-DeepSpeed
-#     $ export PBS_O_WORKDIR=$(pwd) && source ALCF/helpers.sh && ezpz_setup
+#     $ export PBS_O_WORKDIR=$(pwd) && source ALCF/helpers.sh && setup
 #     ```
 #
 # and this will, automatically:
@@ -174,48 +174,10 @@ setup_run_cmd() {
     export data_cache_path="${CKPT_DIR}/${DATA_CACHE_PATH}" && mkdir -p "${data_cache_path}"
     printf "\n"
     echo "Using data_cache_path: ${data_cache_path}"
-    TRAIN_SPLIT="${TRAIN_SPLIT:-100}"
-    VAL_SPLIT="${VAL_SPLIT:-0}"
-    TEST_SPLIT="${TEST_SPLIT:-0}"
-    LOG_INTERVAL="${LOG_INTERVAL:-1}"
-    DEFAULTS=(
-        "--split ${TRAIN_SPLIT},${VAL_SPLIT},${TEST_SPLIT}"
-        "--log-interval ${LOG_INTERVAL}"
-        "--no-bias-gelu-fusion"
-        "--no-bias-dropout-fusion"
-        "--no-masked-softmax-fusion"
-        "--no-gradient-accumulation-fusion"
-        "--accumulate-allreduce-grads-in-fp32"
-    )
-    # export DEFAULTS="\
-    #     --split ${TRAIN_SPLIT},${VAL_SPLIT},${TEST_SPLIT} \
-    #     --log-interval ${LOG_INTERVAL} \
-    #     --no-bias-gelu-fusion \
-    #     --no-bias-dropout-fusion \
-    #     --no-masked-softmax-fusion \
-    #     --no-gradient-accumulation-fusion \
-    #     --accumulate-allreduce-grads-in-fp32"
-    # OVERRIDE_CKPT_OPT_PARAM="${OVERRIDE_CKPT_OPT_PARAM:-}"
-    if [[ -z "${OVERRIDE_CKPT_OPT_PARAM:-}" ]]; then
-        DEFAULTS+=("--use-checkpoint-opt_param-scheduler")
-    fi
-    if [[ "${SP}" -gt 1 ]]; then
-        DEFAULTS+=(
-        "--ds-sequence-parallel-size ${SP}"
-        "--force-ds-sequence-parallel"
-        )
-    fi
     ##################################################################
     # WARN: to disable Llama-type architectures, toggle via:
     # `NO_LLAMA=1 bash train_llama_alcf.sh`
     ##################################################################
-    LLAMA_ARGS=""
-    if [[ "${SP}" == 1 ]]; then
-        export LLAMA_ARGS="${LLAMA_ARGS} "
-    else
-        export LLAMA_ARGS=""
-        echo "NOT USING ROTARY EMBEDDINGS! LLAMA_ARGS=${LLAMA_ARGS}"
-    fi
     if [[ -z "${NO_LLAMA:-}" ]]; then
         llama_flags=(
 			"--swiglu"
@@ -230,63 +192,89 @@ setup_run_cmd() {
             "--ffn-hidden-size ${FFN_HIDDEN_SIZE}"
         )
     fi
+    # min_lr=$(python3 -c 'print(f"{2 / (10 ** 5):.8f}")')
+    # "--min-lr ${LR:-${min_lr}}"  # 2e-5
+    lr_flags=(
+        "--lr ${LR:-0.0002}"
+        "--lr-decay-style ${LR_DECAY_STYLE:-cosine}"
+        "--min-lr ${MIN_LR:-"2e-6"}"  # 2e-5
+        "--lr-warmup-fraction ${LR_WARMUP_FRAC:-0.05}"
+    )
+    if [[ -n "${LR_DECAY_ITERS:-}" ]]; then
+        lr_flags+=("--lr-decay-iters ${LR_DECAY_ITERS:-}")
+    fi
 
-    TENSORBARD_ARGS=()
-    if [[ -z "${USE_TENSORBARD:-}" ]]; then
+    tb_flags=()
+    if [[ -z "${NO_TENSORBOARD:-}" ]]; then
         TBDIR="${CKPT_DIR}/tensorboard"
         mkdir -p "${TBDIR}"
-        # --log-timers-to-tensorboard \
-        # --log-optimizer-states-to-tensorboard"
-        # --tensorboard-dir ${TBDIR} \
-        TENSORBARD_ARGS+=(
+        tb_flags+=(
             "--log-timers-to-tensorboard"
             "--log-optimizer-states-to-tensorboard"
             "--tensorboard-dir ${TBDIR}"
         )
     fi
     dfl_fallback="${DATA_FILE_LIST:-${PBS_O_WORKDIR}/ALCF/data-lists/$(get_machine_name)/dolma.txt}"
-    export ADAM_BETA1="${ADAM_BETA1:-0.9}"
-    export ADAM_BETA2="${ADAM_BETA2:-0.95}"
-    export ADAM_EPS="${ADAM_EPS:-0.00001}" # 1 * 10^{-5}
-    export run_cmd=(
-        "${LAUNCHER}"
+
+    train_args=()
+    if [[ -z "${OVERRIDE_CKPT_OPT_PARAM:-}" ]]; then
+        train_args+=("--use-checkpoint-opt_param-scheduler")
+    fi
+    # "--init-method-std ${INIT_METHOD_STD:-0.0006}"
+    # "--weight-decay ${WEIGHT_DECAY:-0.1}"
+    #     --accumulate-allreduce-grads-in-fp32"
+    train_args+=(
+        "${lr_flags[@]}"
+        "${custom_args[@]}"
+        "${llama_flags[@]}"
+        "${DATA_FLAGS}"
+        "${FLASH_ARG}"
+        "${TIMING_STR}"
+        "${TOKENIZER_FLAGS}"
+        "${tb_flags[@]}"
+        "${ds_args[@]}"
+        "${gpt_args[@]}"
         "--${DTYPE}"
-        "${DEFAULTS[@]}"
+        "--shuffle-sample"
+        "--no-bias-gelu-fusion"
+        "--no-bias-dropout-fusion"
+        "--no-masked-softmax-fusion"
+        "--no-gradient-accumulation-fusion"
         "--optimizer ${OPT}"
-        "--save ${CKPT_DIR}"
-        "--load ${CKPT_DIR}"
+        "--tensor-model-parallel-size ${TP}"
+        "--pipeline-model-parallel-size ${PP}"
+        "--max-position-embeddings ${SEQ}"
+        "--micro-batch-size ${MICRO_BATCH}"
+        "--ds-sequence-parallel-size ${SP}"
+        "--global-batch-size ${GLOBAL_BATCH}"
+        "--split ${TRAIN_SPLIT:-950},${VAL_SPLIT:-50},${TEST_SPLIT:-0}"
+        "--timing-log-level ${TIMING_LOG_LEVEL:-1}"
+        "--eval-interval ${EVAL_INTERVAL:-50}"
+        "--eval-iters ${EVAL_ITERS:-40}"
+        "--save-interval ${SAVE_INTERVAL:-50}"
+        "--log-interval ${LOG_INTERVAL:-1}"
+        "--save ${SAVE:-${CKPT_DIR}}"
+        "--load ${LOAD:-${CKPT_DIR}}"
         "--seq-length ${SEQ}"
         "--num-layers ${NLAYERS}"
         "--hidden-size ${HIDDEN}"
         "--train-iters ${TRAIN_ITERS}"
-        "--eval-iters ${EVAL_ITERS}"
         "--distributed-backend ${BE}"
         "--adam-beta1 ${ADAM_BETA1:-0.9}"
         "--adam-beta2 ${ADAM_BETA2:-0.95}"
         "--adam-eps ${ADAM_EPS:-0.00001}"
         "--clip-grad ${CLIP_GRAD:-1.0}"
-        "--weight-decay ${WEIGHT_DECAY:-0.1}"
         "--num-attention-heads ${HEADS}"
-        "--save-interval ${SAVE_INTERVAL}"
-        "--eval-interval ${EVAL_INTERVAL}"
-        "--max-position-embeddings ${SEQ}"
-        "--micro-batch-size ${MICRO_BATCH}"
-        "--tensor-model-parallel-size ${TP}"
-        "--global-batch-size ${GLOBAL_BATCH}"
-        "--pipeline-model-parallel-size ${PP}"
         "--data-cache-path ${data_cache_path}"
         "--data-file-list ${DATA_FILE_LIST:-${dfl_fallback}}"
-        "${TENSORBARD_ARGS[@]}"
-        "${DATA_FLAGS}"
-        "${LR_ARGS}"
-        "${llama_flags[@]}"
-        "${FLASH_ARG}"
-        "${TIMING_STR}"
-        "${TOKENIZER_FLAGS}"
-        "${ds_args[@]}"
-        "${gpt_args[@]}"
-        "${custom_args[@]}"
     )
+    cache_dir="${PBS_O_WORKDIR}/.cache/"
+    mkdir -p "${cache_dir}"
+    targs_cache="${cache_dir}/train_args.txt"
+    for arg in "${train_args[@]}"; do echo "${arg}" >> "${targs_cache}" ; done
+    export TRAIN_ARGS=("$(printf '%s\n' "${train_args[@]}"|sort)")
+    printf "Training Arguments: %s\n" "${TRAIN_ARGS[@]}"
+    export run_cmd=("${LAUNCHER}" "${train_args[@]}")
 }
 
 save_dotenv() {
@@ -430,17 +418,20 @@ setupLauncher() {
     printf " %s" "$(printMagenta "${LAUNCHER}")"
 }
 
-set_lr_args() {
-    LR_ARGS="--lr ${LR} --lr-decay-style cosine"
-    if [[ -n "${LR_DECAY_ITERS:-}" ]]; then
-        LR_ARGS="${LR_ARGS} --lr-decay-iters ${LR_DECAY_ITERS}"
-    fi
-    if [[ -n "${LR_WARMUP_FRAC}" ]]; then
-        LR_ARGS="${LR_ARGS} --lr-warmup-fraction ${LR_WARMUP_FRAC}"
-    fi
-    echo "LR_ARGS: ${LR_ARGS}"
-    export LR_ARGS="${LR_ARGS}"
-}
+# set_lr_args() {
+#     export LR=${LR:-0.0002}                       # LEARNING_RATE
+#     export LR_WARMUP_FRAC=${LR_WARMUP_FRAC:-0.05} # LEARNING RATE WARMUP
+#     export LR_DECAY_ITERS=${LR_DECAY_ITERS:-}     # LR DECAY ITERS
+#     LR_ARGS="--lr ${LR} --lr-decay-style cosine"
+#     if [[ -n "${LR_DECAY_ITERS:-}" ]]; then
+#         LR_ARGS="${LR_ARGS} --lr-decay-iters ${LR_DECAY_ITERS}"
+#     fi
+#     if [[ -n "${LR_WARMUP_FRAC}" ]]; then
+#         LR_ARGS="${LR_ARGS} --lr-warmup-fraction ${LR_WARMUP_FRAC}"
+#     fi
+#     echo "LR_ARGS: ${LR_ARGS}"
+#     export LR_ARGS="${LR_ARGS}"
+# }
 
 #########################################################################
 # `get_batch_size_on_polaris`: Identify MICRO_BATCH to use on Polaris.
@@ -495,12 +486,14 @@ _get_num_hosts_from_hostfile() {
 #
 #   [2 tiles] x [6 xpus / tile] = 12 xpus
 #
-# |    nnhosts    |   nhosts  |  GAS  |
-# |:-------------:|:---------:|:-----:|
-# | 64 <= n < inf | [64, inf) |   1   |
-# | 32 <= n < 64  | [32, 64)  |   2   |
-# | 16 <= n < 32  | [16, 32)  |   4   |
-# |  0 <= n < 16  | [0, 16)   |   8   |
+# |     nnhosts     |   nhosts   |  GAS  |
+# |:---------------:|:----------:|:-----:|
+# | 256 <= n < inf  | [256, inf) |   1   |
+# | 128 <= n < 256  | [128, 256) |   2   |
+# |  32 <= n < 128  | [32, 128)  |   4   |
+# |  16 <= n < 32   | [16, 32)   |   8   |
+# |   0 <= n < 16   | [0, 16)    |  16   |
+#
 ###########################################
 get_grad_acc_steps_on_aurora() {
     if [[ "$#" == 0 ]]; then
@@ -508,18 +501,21 @@ get_grad_acc_steps_on_aurora() {
     elif [[ "$#" == 1 ]]; then
         hf="$1"
     else
+        echo "Usage: get_grad_acc_steps_on_aurora"
         echo "Expected exactly 0 or 1 arguments, received: $#"
         exit 1
     fi
     nhosts=$(wc -l <"${hf}")
-    if [[ 64 -le "${nhosts}" ]]; then
+    if [[ "${nhosts}" -gt 256 ]]; then
         gas=1
-    elif [[ 32 -le "${nhosts}" && "${nhosts}" -lt 64 ]]; then
+    elif [[ 128 -le "${nhosts}" && "${nhosts}" -lt 256 ]]; then
         gas=2
-    elif [[ 16 -le "${nhosts}" && "${nhosts}" -lt 32 ]]; then
+    elif [[ 32 -le "${nhosts}" && "${nhosts}" -lt 128 ]]; then
         gas=4
-    else
+    elif [[ 16 -le "${nhosts}" && "${nhosts}" -lt 32 ]]; then
         gas=8
+    else
+        gas=16
     fi
     echo "${gas}"
 }
@@ -580,7 +576,7 @@ setParams() {
         export GRAD_ACC_STEPS="${GRAD_ACC_STEPS:-${gas}}"
         # export GRAD_ACC_STEPS="${GRAD_ACC_STEPS:-$(get_grad_acc_steps_on_aurora "$@)}"
         echo "[setParams] Using GRAD_ACC_STEPS: ${GRAD_ACC_STEPS}"
-        MICRO_BATCH=${MICRO_BATCH:-4} # MICRO_BATCH = 4
+        MICRO_BATCH=${MICRO_BATCH:-1} # MICRO_BATCH = 4
         #### [sam: 08/17/2024] ##########################################
         # Use best set of CCL env vars from Gordon Bell runs on Aurora
         set_ccl_vars_on_aurora
@@ -604,9 +600,7 @@ setParams() {
             echo "Using flash-attn !!"
             FLASH_ARG="--use-flash-attn-builder"
         fi
-        ######################################################################
-    # +--------[Polaris]-----------------------------------+
-    # elif [[ $(hostname) == x3* ]]; then
+    # [Polaris]
     elif [[ "${mn}" == "polaris" || "${mn}" == "sirius" ]]; then
         # export LAUNCH_CMD="${LAUNCH_CMD:-deepspeed}"
         TP=${TP:-1}               # TP = 2
@@ -625,30 +619,25 @@ setParams() {
         fi
         echo "Setting up AWS NCCL OFI Plugin on Polaris..."
         source "${WORKING_DIR}/ALCF/aws_ofi_nccl_plugin.sh" || exit
-    # +--------[Perlmutter]---------------------------------+
-    # elif [[ $(hostname) == login* || $(hostname) == nid* ]]; then
+    # [Perlmutter]
     elif [[ "${mn}" == login* || "${mn}" == nid* ]]; then
         TP="${TP:-2}"
         export NCCL="${NCCL:-nccl}"
         export BE="${NCCL}"
         export DTYPE="${DTYPE:-bf16}"
-        MICRO_BATCH="${MICRO_BATCH:-8}"
+        MICRO_BATCH="${MICRO_BATCH:-1}"
         if [[ -n "${NO_FLASH_ATTN-}" ]]; then
             echo "Not using flash-attn!!"
         else
             FLASH_ARG="--use-flash-attn-v2"
         fi
     fi
-    # +----------------------------------------------------------------------+
     export TP="${TP}"
     export PP="${PP:-1}"
     export SP="${SP:-1}"
     export FLASH_ARG="${FLASH_ARG}"
     export DTYPE="${DTYPE:-bf16}"
     export OPT="${OPT:-adamw}"
-    # export ADAM_BETA1="${ADAM_BETA1:-0.9}"
-    # export ADAM_BETA2="${ADAM_BETA2:-0.95}"
-    # export ADAM_EPS="${ADAM_EPS:-0.00001}" # 1 * 10^{-5}
     export WEIGHT_DECAY="${WEIGHT_DECAY:-0.1}"
     export HOSTFILE="${HOSTFILE:-${PBS_NODEFILE}}"
     NHOSTS=$(wc -l <"${HOSTFILE}")
@@ -667,18 +656,19 @@ setParams() {
     # +---[Run Settings]------------------------------------------------------+
     export SEQ=${SEQ:-4096}                                                               # SEQ_LEN: 4096
     export ZERO_STAGE=${ZERO_STAGE:-1}                                                    # ZERO OFFLOADING STAGE
-    export MICRO_BATCH=${MICRO_BATCH:-8}                                                  # MICRO BATCH SIZE
+    export MICRO_BATCH=${MICRO_BATCH:-1}                                                  # MICRO BATCH SIZE
     export GRAD_ACC_STEPS=${GRAD_ACC_STEPS:-1}                                            # GRADIENT ACCUMULATION STEPS
-    export EVAL_ITERS="${EVAL_ITERS:-10}"                                                 # NUMBER OF EVAL ITERS TO RUN
-    export EVAL_INTERVAL="${EVAL_INTERVAL:-50000}"                                        # HOW FREQUENTLY TO RUN EVAL
-    export SAVE_INTERVAL=${SAVE_INTERVAL:-50}                                             # HOW FREQUENTLY TO SAVE CKPTS
     export TIMING_LOG_LEVEL="${TIMING_LOG_LEVEL:-1}"                                      # TIMING VERBOSITY IN LOGS
     export ACT_CKPT_NUM_LAYERS="${ACT_CKPT_NUM_LAYERS:-1}"                                # NUM LAYERS TO CHECKPOINT ACTIVATIONS
-    export USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING:-1}                # USE ACTIVATION CHECKPOINTING ?
+    export USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING:-}                 # USE ACTIVATION CHECKPOINTING ?
     export GLOBAL_BATCH_MAX=$((WORLD_SIZE * MICRO_BATCH * GRAD_ACC_STEPS / TP / PP / SP)) # MAX GLOBAL BATCH SIZE
     export GLOBAL_BATCH="${GLOBAL_BATCH:-${GLOBAL_BATCH_MAX}}"                            # WILL USE MAX IF NOT SET IN ENVIRONMENT
-    # export TRAIN_ITER=${TRAIN_ITER:-317892}             # NUMBER OF TRAIN ITERS
-    if [[ -z "${TRAIN_ITERS:-${TRAIN_ITER:-}}" ]]; then
+    if [[ -n "${TRAIN_TOKENS:-}" ]]; then
+        export TRAIN_TOKENS="${TRAIN_TOKENS}"
+        export TRAIN_ITERS=$((TRAIN_TOKENS / SEQ / GLOBAL_BATCH))
+        printf "TRAIN_TOKENS=%s (=%sB tokens)\n" "${TRAIN_TOKENS}" "$((TRAIN_TOKENS / 10 ** 9))"
+        printf "TRAIN_ITERS=%s\n" "${TRAIN_ITERS}"
+    elif [[ -z "${TRAIN_ITERS:-${TRAIN_ITER:-}}" ]]; then
         export TRAIN_TOKENS=${TRAIN_TOKENS:-2000000000000}
         export TRAIN_ITERS=$((TRAIN_TOKENS / SEQ / GLOBAL_BATCH))
         printf "TRAIN_TOKENS=%s (=%sB tokens)\n" "${TRAIN_TOKENS}" "$((TRAIN_TOKENS / 10 ** 9))"
@@ -694,22 +684,20 @@ setParams() {
     #
     #   For this reason, we only use the default LLAMA_ARGS when SP=0.
     ##########################################################################
+    # # -----[Learning Rate Settings]--------------------------------------------
+    # export LR=${LR:-0.0002}                       # LEARNING_RATE
+    # export LR_WARMUP_FRAC=${LR_WARMUP_FRAC:-0.05} # LEARNING RATE WARMUP
+    # export LR_DECAY_ITERS=${LR_DECAY_ITERS:-}     # LR DECAY ITERS
+    # set_lr_args
     # -----[Learning Rate Settings]--------------------------------------------
-    export LR=${LR:-0.0003}                       # LEARNING_RATE
-    export LR_WARMUP_FRAC=${LR_WARMUP_FRAC:-0.05} # LEARNING RATE WARMUP
-    export LR_DECAY_ITERS=${LR_DECAY_ITERS:-}     # LR DECAY ITERS
-    set_lr_args
-    # -----[Learning Rate Settings]--------------------------------------------
+    # # if [[ "${TIMING_LOG_LEVEL:-1}" -gt 1 ]]; then
     # if [[ "${TIMING_LOG_LEVEL:-1}" -gt 1 ]]; then
-    if [[ "${TIMING_LOG_LEVEL:-1}" -gt 1 ]]; then
-        TIMING_STR="\
-            --timing-log-level ${TIMING_LOG_LEVEL}"
-            # --log-timers-to-tensorboard \
-            # --log-optimizer-states-to-tensorboard \
-        # "
-    else
-        TIMING_STR=""
-    fi
+    #     TIMING_STR="\
+    #         --timing-log-level ${TIMING_LOG_LEVEL}"
+    #     # "
+    # else
+    #     TIMING_STR=""
+    # fi
 }
 
 ##############################################
@@ -741,7 +729,7 @@ set_args() {
     # if [[ "${ZERO_STAGE}" == 3 ]]; then
     #     ds_args="--use-mics ${ds_args}"
     # fi
-    if [[ "$USE_ACTIVATION_CHECKPOINTING" == 1 ]]; then
+    if [[ -n "${USE_ACTIVATION_CHECKPOINTING:-}" ]]; then
         echo "!! Caught USE_ACTIVATION_CHECKPOINTING=${USE_ACTIVATION_CHECKPOINTING} !!"
         ds_args+=("--deepspeed-activation-checkpointing")
         # ds_args=" --deepspeed-activation-checkpointing ${ds_args}"
@@ -904,6 +892,7 @@ buildDSconfig() {
     echo "DS_CONFIG: ${DS_CONFIG}"
     printf "ZS: %s, MB: %s, GB: %s, PP: %s, DTYPE: %s" "${ZERO_STAGE}" "${MICRO_BATCH}" "${GLOBAL_BATCH}" "${PP}" "${DTYPE}"
     generateDSconfig "${DS_CONFIG}"
+    cat "${DS_CONFIG}" | jq .
 }
 
 ###############################################################################
@@ -1176,27 +1165,19 @@ generateDSconfig() {
         \"train_batch_size\": $GLOBAL_BATCH,
         \"train_micro_batch_size_per_gpu\": $MICRO_BATCH,
         \"steps_per_print\": 1,
+        \"gradient_clipping\": 1.0,
         \"gradient_accumulation_steps\": $GRAD_ACC_STEPS,
         \"zero_force_ds_cpu_optimizer\": false,
         \"zero_allow_untested_optimizer\": true,
         \"gradient_clipping\": 1.0,
         \"wall_clock_breakdown\": false,"
-    if [[ "${USE_ACTIVATION_CHECKPOINTING}" == 1 ]]; then
-        activation_checkpointing="\
-            \"activation_checkpointing\": {
-            \"partition_activations\": true,
-            \"contiguous_memory_optimization\": true
-            },"
-    fi
-    flops_profiler="\
-        \"flops_profiler\": {
-          \"enabled\": true,
-          \"profile_step\": 2,
-          \"module_depth\": -1,
-          \"top_modules\": 1,
-          \"detailed\": true,
-          \"output_file\": null
-        }"
+    # if [[ "${USE_ACTIVATION_CHECKPOINTING}" == 1 ]]; then
+    #     activation_checkpointing="\
+    #         \"activation_checkpointing\": {
+    #         \"partition_activations\": true,
+    #         \"contiguous_memory_optimization\": true
+    #         },"
+    # fi
     if [[ $DTYPE == "bf16" ]]; then
         dtype="\
             \"communication_data_type\": \"bf16\",
@@ -1228,7 +1209,7 @@ generateDSconfig() {
     else
         dtype="\"communication_data_type\": \"fp32\","
     fi
-    if [[ "${OPT:-adamw}" == "ds.adamw" ]]; then
+    if [[ "${OPT:-}" == "ds.adamw" ]]; then
         optimizer="\
             \"optimizer\": {
                 \"type\": \"AdamW\",
@@ -1238,6 +1219,24 @@ generateDSconfig() {
                 \"beta2\": 0.95,
                 \"eps\": 1e-5,
                 \"weight_decay\": 1e-1
+            },
+        },"
+    elif [[ "${OPT:-}" == "ds.onebitlamb" ]]; then
+        optimizer="\
+            \"optimizer\": {
+                \"type\": \"OneBitLamb\",
+                \"params\": {
+                    \"lr\": 11e-3,
+                    \"max_coeff\": 0.3,
+                    \"min_coeff\": 0.01,
+                    \"freeze_step\": 1000,
+                    \"cuda_aware\": false,
+                    \"comm_backend_name\": \"${BE}\",
+                    \"coeff_beta\": 0.9,
+                    \"factor_max\": 4.0,
+                    \"factor_min\": 0.5,
+                    \"factor_threshold\": 0.1
+                }
             },"
     else
         optimizer=""
@@ -1267,7 +1266,7 @@ generateDSconfig() {
             },"
     # elif [[ $ZERO_STAGE == 2 ]]; then
     elif [[ "${ZERO_STAGE}" == 2 || "${ZERO_STAGE}" == 1 ]]; then
-        if [[ -z "${CPU_OPTIMIZER:-}" ]]; then
+        if [[ -n "${CPU_OPTIMIZER:-}" ]]; then
             echo "!!!! CAUGHT CPU_OPTIMIZER !!!!"
             zero="\
                 \"zero_optimization\": {
@@ -1304,6 +1303,15 @@ generateDSconfig() {
     else
         echo 'Please add the correct config set!!!'
     fi
+    flops_profiler="\
+        \"flops_profiler\": {
+          \"enabled\": true,
+          \"profile_step\": 2,
+          \"module_depth\": -1,
+          \"top_modules\": 1,
+          \"detailed\": true,
+          \"output_file\": null
+        }"
     cat <<EOT >"$1"
 {
 $common
@@ -1388,6 +1396,7 @@ printWhite() {
 reset_env() {
     custom_vars=(
         NO_FLASH_ATTN
+        USE_FLASH_ATTN
         TP
         PP
         SP
@@ -1415,7 +1424,6 @@ reset_env() {
         TRAIN_TOKENS
         TRAIN_ITERS
         MODEL_TYPE
-        LLAMA_ARGS
         LR
         LR_WARMUP_FRAC
         LR_DECAY_ITERS
@@ -1447,10 +1455,24 @@ reset_env() {
         data_cache_path
         DEFAULTS
     )
+    # LLAMA_ARGS
     printf "Unsetting custom vars: %s\n" "${custom_vars[*]}"
     unset "${custom_vars[@]}"
 }
 
+convert_ckpt_to_universal() {
+    if [[ "$#" -ne 1 ]]; then
+        echo "Usage: convert_ckpt_to_universal ckpt_dir"
+        echo "Expected one argument (ckpt_dir), received: $#"
+        exit 1
+    fi
+    ckptdir=$1
+    gs=$(cat "${ckptdir}/latest_checkpointed_iteration.txt")
+    src="${ckptdir}/global_step${gs}"
+    dst="${ckptdir}/global_step${gs}_universal"
+    convert_script="${PBS_O_WORKDIR}/deps/DeepSpeed/checkpoint/ds_to_universal.py"
+    python3 "${convert_script}" --input_folder "${src}" --output_folder "${dst}"
+}
 
 ###########################
 # call helpers_main()

From 6acc370a41440098031227e881bd9f1f23aa369e Mon Sep 17 00:00:00 2001
From: ranzhejiang <zhejiang.ran@intel.com>
Date: Fri, 18 Oct 2024 18:31:05 +0800
Subject: [PATCH 23/31] fix moe tflops (#445)

---
 megatron/utils.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/megatron/utils.py b/megatron/utils.py
index 15160b16447..68d06f0dae4 100644
--- a/megatron/utils.py
+++ b/megatron/utils.py
@@ -281,6 +281,7 @@ def throughput_calculator(model, args, iteration_time, total_iterations):
     num_layers = args.num_layers
     vocab_size = args.padded_vocab_size
     gqa = args.num_attention_heads // args.num_key_value_heads
+    num_experts_routed_to = args.topk
     ffn_multiplier = 3 if args.swiglu else 2
     macs_per_flops = 2
 
@@ -294,7 +295,7 @@ def throughput_calculator(model, args, iteration_time, total_iterations):
 
     pre_and_post_mha_gemm_macs = batch_size * num_layers * (1 + (2 // gqa) + 1) * (hidden_size**2) * seq_len
     mha_bgemm_macs = batch_size * num_layers * 2 * head_dim * num_attention_heads * (seq_len**2)
-    ffn_gemm_macs = batch_size * num_layers * ffn_multiplier * ffn_hidden_size * hidden_size * seq_len
+    ffn_gemm_macs = batch_size * num_layers * ffn_multiplier * ffn_hidden_size * hidden_size * seq_len * num_experts_routed_to
     logit_lmhead_gemm_macs = batch_size * vocab_size * hidden_size * seq_len
 
     fwd_macs = pre_and_post_mha_gemm_macs + mha_bgemm_macs + ffn_gemm_macs + logit_lmhead_gemm_macs

From 9e015cc164512e463559a2a6846679676aa3b17f Mon Sep 17 00:00:00 2001
From: Sam Foreman <saforem2@gmail.com>
Date: Fri, 18 Oct 2024 10:00:26 -0500
Subject: [PATCH 24/31] Remove duplicate `gradient_accumulation_steps` in DS
 config

---
 ALCF/helpers.sh | 1 -
 1 file changed, 1 deletion(-)

diff --git a/ALCF/helpers.sh b/ALCF/helpers.sh
index bc7753aa718..ade65a8d09b 100644
--- a/ALCF/helpers.sh
+++ b/ALCF/helpers.sh
@@ -1165,7 +1165,6 @@ generateDSconfig() {
         \"train_batch_size\": $GLOBAL_BATCH,
         \"train_micro_batch_size_per_gpu\": $MICRO_BATCH,
         \"steps_per_print\": 1,
-        \"gradient_clipping\": 1.0,
         \"gradient_accumulation_steps\": $GRAD_ACC_STEPS,
         \"zero_force_ds_cpu_optimizer\": false,
         \"zero_allow_untested_optimizer\": true,

From 58dc2d7c86d8c5b562871450ab7e2377d9db022f Mon Sep 17 00:00:00 2001
From: Sam Foreman <saforem2@gmail.com>
Date: Mon, 21 Oct 2024 09:04:19 -0500
Subject: [PATCH 25/31] Update default EVAL args

---
 ALCF/helpers.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/ALCF/helpers.sh b/ALCF/helpers.sh
index ade65a8d09b..cedac99f96d 100644
--- a/ALCF/helpers.sh
+++ b/ALCF/helpers.sh
@@ -247,10 +247,10 @@ setup_run_cmd() {
         "--micro-batch-size ${MICRO_BATCH}"
         "--ds-sequence-parallel-size ${SP}"
         "--global-batch-size ${GLOBAL_BATCH}"
-        "--split ${TRAIN_SPLIT:-950},${VAL_SPLIT:-50},${TEST_SPLIT:-0}"
+        "--split ${TRAIN_SPLIT:-990},${VAL_SPLIT:-10},${TEST_SPLIT:-0}"
         "--timing-log-level ${TIMING_LOG_LEVEL:-1}"
-        "--eval-interval ${EVAL_INTERVAL:-50}"
-        "--eval-iters ${EVAL_ITERS:-40}"
+        "--eval-interval ${EVAL_INTERVAL:-100}"
+        "--eval-iters ${EVAL_ITERS:-20}"
         "--save-interval ${SAVE_INTERVAL:-50}"
         "--log-interval ${LOG_INTERVAL:-1}"
         "--save ${SAVE:-${CKPT_DIR}}"

From 277d308db336a9c5000c098f265d8d193f4b8bae Mon Sep 17 00:00:00 2001
From: Sam Foreman <saforem2@gmail.com>
Date: Mon, 21 Oct 2024 09:04:37 -0500
Subject: [PATCH 26/31] Catch eval metrics in `megatron/training.py`

---
 megatron/training.py | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/megatron/training.py b/megatron/training.py
index 8ffac6cb9c2..d39f21c1289 100644
--- a/megatron/training.py
+++ b/megatron/training.py
@@ -26,6 +26,7 @@
 import torch.distributed as tdist
 from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP
 
+import wandb
 from megatron import (
     get_args,
     get_current_global_batch_size,
@@ -316,7 +317,7 @@ def pretrain(
     config = core_transformer_config_from_args(args)
     if args.do_valid:
         prefix = f"iteration {iteration} on {args.eval_iters * args.global_batch_size}-sample draw from validation set"
-        evaluate_and_print_results(
+        _ = evaluate_and_print_results(
             prefix,
             forward_step_func,
             valid_data_iterator,
@@ -329,7 +330,7 @@ def pretrain(
         )
     if args.do_test:
         prefix = f"iteration {iteration} on {args.eval_iters * args.global_batch_size}-sample draw from test set"
-        evaluate_and_print_results(
+        _ = evaluate_and_print_results(
             prefix,
             forward_step_func,
             test_data_iterator,
@@ -924,7 +925,6 @@ def train_step(
         # Empty unused memory.
         if args.empty_unused_memory_level >= 2 and accelerator is not None:
             accelerator.empty_cache()
-
         # XXX: [saforem2]: ----------------------------------------------------
         # Is `num_zeros_in_grad` worth calculating (/ implementing) ??
         # the `Megatron`-specific implementation is at:
@@ -1406,6 +1406,16 @@ def evaluate_and_print_results(
         config,
         verbose,
     )
+    key = "test" if test else "val"
+    if wandb is not None and wandb.run is not None:
+        wandb.log({
+            f"{key}/iteration": iteration,
+            **{f"{key}/{k}": v for k, v in total_loss_dict.items()},
+            **{
+                f"{key}/ppl_{k}": math.exp(min(20, v.item()))
+                for k, v in total_loss_dict.items()
+            },
+        })
     string = " validation loss at {} | ".format(prefix)
     for key in total_loss_dict:
         string += f"{key} value={total_loss_dict[key].item():.6f}"
@@ -1451,6 +1461,7 @@ def evaluate_and_print_results(
     log.info("-" * length)
     log.info(string)
     log.info("-" * length)
+    return total_loss_dict
 
 
 def cyclic_iter(iter):

From af4cba12b810e420b8c0039e384d0b06c8842861 Mon Sep 17 00:00:00 2001
From: Sam Foreman <saforem2@gmail.com>
Date: Mon, 21 Oct 2024 09:05:06 -0500
Subject: [PATCH 27/31] Save git branch to env in `train_aGPT_7B.sh`

---
 train_aGPT_7B.sh | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/train_aGPT_7B.sh b/train_aGPT_7B.sh
index 286740fc89f..1350ea0f2a4 100644
--- a/train_aGPT_7B.sh
+++ b/train_aGPT_7B.sh
@@ -1,4 +1,7 @@
 #!/bin/bash --login
+#PBS -q lustre_scaling
+#PBS -A Aurora_Deployment
+#PBS -j oe
 
 #####################################
 # AuroraGPT-7B
@@ -10,6 +13,8 @@
 # 1. Navigate into `$PBS_O_WORKDIR`
 cd "${PBS_O_WORKDIR}" || exit
 HERE=$(python3 -c 'import os; print(os.getcwd())') && export HERE
+GIT_BRANCH=$(git branch --show-current) && export GIT_BRANCH
+
 
 # 2. source `ALCF/helpers.sh`
 source "${HERE}/ALCF/helpers.sh" || exit

From 8a8472c7bd83a3c9bc13f27b54c099ab1cde98b6 Mon Sep 17 00:00:00 2001
From: Huihuo Zheng <zhenghh04@gmail.com>
Date: Mon, 21 Oct 2024 19:35:57 +0000
Subject: [PATCH 28/31] Fix missing f-string prefix in gpt_dataset debug log

---
 megatron/data/gpt_dataset.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/megatron/data/gpt_dataset.py b/megatron/data/gpt_dataset.py
index 38df5562675..9ff27032772 100755
--- a/megatron/data/gpt_dataset.py
+++ b/megatron/data/gpt_dataset.py
@@ -154,8 +154,8 @@ def _build_indices_blended():
                         dataset_index, dataset_sample_index,
                         weights, self.num_datasets, self.num_samples,
                         torch.distributed.get_rank() == 0)
-                    log.debug('> elapsed time for building blendable dataset indices for corpus {self.dataset_builders[0].corpus}: '
-                             '{:.2f} (sec)'.format(time.time() - start_time))
+                    log.debug(f"> elapsed time for building blendable dataset indices for corpus {self.dataset_builders[0].corpus}: "
+                             "{:.2f} (sec)".format(time.time() - start_time))
                     return dataset_index, dataset_sample_index
 
 

From 6cb727dde1ef92eedfafc24882cdf51a73a7203b Mon Sep 17 00:00:00 2001
From: Sam Foreman <saforem2@gmail.com>
Date: Mon, 21 Oct 2024 15:43:58 -0500
Subject: [PATCH 29/31] Fix `args.shuffle` in `megatron/data/gpt_dataset.py`

---
 megatron/data/gpt_dataset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/megatron/data/gpt_dataset.py b/megatron/data/gpt_dataset.py
index 8872d709c7b..d09f08d63a4 100644
--- a/megatron/data/gpt_dataset.py
+++ b/megatron/data/gpt_dataset.py
@@ -189,7 +189,7 @@ def _build_indices_concat():
                     self.desc += dataset_builders[i].prefix + ","
 
                 log.info(
-                    f"[BuildConcatDataset] Caught {shuffle=} across"
+                    f"[BuildConcatDataset] Caught {args.shuffle_sample_in_corpus=} across"
                     f" {self.num_samples} samples"
                 )
                 self.desc += (

From 5d10179125c884516a70e76d0a1b36f59b4e1e4c Mon Sep 17 00:00:00 2001
From: Sam Foreman <saforem2@gmail.com>
Date: Wed, 23 Oct 2024 22:36:50 -0500
Subject: [PATCH 30/31] Update `--{shuffle,blend}-sample-in-corpus` arg in
 `ALCF/helpers.sh`

---
 ALCF/helpers.sh | 98 +++++++++++++++++++++++++------------------------
 1 file changed, 50 insertions(+), 48 deletions(-)

diff --git a/ALCF/helpers.sh b/ALCF/helpers.sh
index cedac99f96d..603bc88fdb2 100644
--- a/ALCF/helpers.sh
+++ b/ALCF/helpers.sh
@@ -141,7 +141,6 @@ setup() {
     setup_run_cmd "$@" || exit
 }
 
-
 #####################################################
 # setup_run_cmd
 #
@@ -180,24 +179,24 @@ setup_run_cmd() {
     ##################################################################
     if [[ -z "${NO_LLAMA:-}" ]]; then
         llama_flags=(
-			"--swiglu"
+            "--swiglu"
             "--hidden-dropout 0"
             "--attention-dropout 0"
-			"--normalization rmsnorm"
-			"--disable-bias-linear"
+            "--normalization rmsnorm"
+            "--disable-bias-linear"
             "--no-query-key-layer-scaling"
-			"--use-rotary-position-embeddings"
-			"--untie-embeddings-and-output-weights"
+            "--use-rotary-position-embeddings"
+            "--untie-embeddings-and-output-weights"
             "--num-key-value-heads ${NUM_KV_HEAD}"
             "--ffn-hidden-size ${FFN_HIDDEN_SIZE}"
         )
     fi
     # min_lr=$(python3 -c 'print(f"{2 / (10 ** 5):.8f}")')
     # "--min-lr ${LR:-${min_lr}}"  # 2e-5
+    # "--min-lr ${MIN_LR:-"2e-6"}"  # 2e-5
     lr_flags=(
         "--lr ${LR:-0.0002}"
         "--lr-decay-style ${LR_DECAY_STYLE:-cosine}"
-        "--min-lr ${MIN_LR:-"2e-6"}"  # 2e-5
         "--lr-warmup-fraction ${LR_WARMUP_FRAC:-0.05}"
     )
     if [[ -n "${LR_DECAY_ITERS:-}" ]]; then
@@ -221,8 +220,7 @@ setup_run_cmd() {
         train_args+=("--use-checkpoint-opt_param-scheduler")
     fi
     # "--init-method-std ${INIT_METHOD_STD:-0.0006}"
-    # "--weight-decay ${WEIGHT_DECAY:-0.1}"
-    #     --accumulate-allreduce-grads-in-fp32"
+    # "--shuffle-sample"
     train_args+=(
         "${lr_flags[@]}"
         "${custom_args[@]}"
@@ -235,44 +233,48 @@ setup_run_cmd() {
         "${ds_args[@]}"
         "${gpt_args[@]}"
         "--${DTYPE}"
-        "--shuffle-sample"
+        "--shuffle-sample-in-corpus"
+        "--blend-sample-in-corpus"
+        "--accumulate-allreduce-grads-in-fp32"
         "--no-bias-gelu-fusion"
         "--no-bias-dropout-fusion"
         "--no-masked-softmax-fusion"
         "--no-gradient-accumulation-fusion"
-        "--optimizer ${OPT}"
-        "--tensor-model-parallel-size ${TP}"
-        "--pipeline-model-parallel-size ${PP}"
-        "--max-position-embeddings ${SEQ}"
-        "--micro-batch-size ${MICRO_BATCH}"
-        "--ds-sequence-parallel-size ${SP}"
-        "--global-batch-size ${GLOBAL_BATCH}"
-        "--split ${TRAIN_SPLIT:-990},${VAL_SPLIT:-10},${TEST_SPLIT:-0}"
-        "--timing-log-level ${TIMING_LOG_LEVEL:-1}"
-        "--eval-interval ${EVAL_INTERVAL:-100}"
-        "--eval-iters ${EVAL_ITERS:-20}"
-        "--save-interval ${SAVE_INTERVAL:-50}"
-        "--log-interval ${LOG_INTERVAL:-1}"
-        "--save ${SAVE:-${CKPT_DIR}}"
-        "--load ${LOAD:-${CKPT_DIR}}"
-        "--seq-length ${SEQ}"
-        "--num-layers ${NLAYERS}"
-        "--hidden-size ${HIDDEN}"
-        "--train-iters ${TRAIN_ITERS}"
-        "--distributed-backend ${BE}"
-        "--adam-beta1 ${ADAM_BETA1:-0.9}"
-        "--adam-beta2 ${ADAM_BETA2:-0.95}"
-        "--adam-eps ${ADAM_EPS:-0.00001}"
-        "--clip-grad ${CLIP_GRAD:-1.0}"
-        "--num-attention-heads ${HEADS}"
-        "--data-cache-path ${data_cache_path}"
-        "--data-file-list ${DATA_FILE_LIST:-${dfl_fallback}}"
+        "--optimizer=${OPT}"
+        "--tensor-model-parallel-size=${TP}"
+        "--pipeline-model-parallel-size=${PP}"
+        "--max-position-embeddings=${SEQ}"
+        "--micro-batch-size=${MICRO_BATCH}"
+        "--ds-sequence-parallel-size=${SP}"
+        "--global-batch-size=${GLOBAL_BATCH}"
+        "--split=${TRAIN_SPLIT:-990},${VAL_SPLIT:-10},${TEST_SPLIT:-0}"
+        "--timing-log-level=${TIMING_LOG_LEVEL:-1}"
+        "--eval-interval=${EVAL_INTERVAL:-100}"
+        "--eval-iters=${EVAL_ITERS:-20}"
+        "--save-interval=${SAVE_INTERVAL:-50}"
+        "--log-interval=${LOG_INTERVAL:-1}"
+        "--save=${SAVE:-${CKPT_DIR}}"
+        "--load=${LOAD:-${CKPT_DIR}}"
+        "--seq-length=${SEQ}"
+        "--num-layers=${NLAYERS}"
+        "--hidden-size=${HIDDEN}"
+        "--train-iters=${TRAIN_ITERS}"
+        "--distributed-backend=${BE}"
+        "--weight-decay=${WEIGHT_DECAY:-0.1}"
+        "--adam-beta1=${ADAM_BETA1:-0.9}"
+        "--adam-beta2=${ADAM_BETA2:-0.95}"
+        "--adam-eps=${ADAM_EPS:-0.00001}"
+        "--clip-grad=${CLIP_GRAD:-1.0}"
+        "--num-attention-heads=${HEADS}"
+        "--data-cache-path=${data_cache_path}"
+        "--data-file-list=${DATA_FILE_LIST:-${dfl_fallback}}"
     )
+    # "--adam-eps ${ADAM_EPS:-0.00001}"
     cache_dir="${PBS_O_WORKDIR}/.cache/"
     mkdir -p "${cache_dir}"
     targs_cache="${cache_dir}/train_args.txt"
-    for arg in "${train_args[@]}"; do echo "${arg}" >> "${targs_cache}" ; done
-    export TRAIN_ARGS=("$(printf '%s\n' "${train_args[@]}"|sort)")
+    for arg in "${train_args[@]}"; do echo "${arg}" >>"${targs_cache}"; done
+    export TRAIN_ARGS=("$(printf '%s\n' "${train_args[@]}" | sort)")
     printf "Training Arguments: %s\n" "${TRAIN_ARGS[@]}"
     export run_cmd=("${LAUNCHER}" "${train_args[@]}")
 }
@@ -506,7 +508,7 @@ get_grad_acc_steps_on_aurora() {
         exit 1
     fi
     nhosts=$(wc -l <"${hf}")
-    if [[ "${nhosts}" -gt 256 ]]; then
+    if [[ "${nhosts}" -ge 256 ]]; then
         gas=1
     elif [[ 128 -le "${nhosts}" && "${nhosts}" -lt 256 ]]; then
         gas=2
@@ -567,7 +569,7 @@ setParams() {
     mn=$(get_machine_name)
     if [[ "${mn}" == "aurora" || "${mn}" == "sunspot" ]]; then
         TP=${TP:-1} # TP = 1
-        export SAVE_INTERVAL="${SAVE_INTERVAL:-20}"
+        export SAVE_INTERVAL="${SAVE_INTERVAL:-50}"
         export CCL=${CCL:-ccl}      # CCL
         export BE="${CCL}"          # COMMUNICATION BACKEND = CCL
         export DTYPE=${DTYPE:-bf16} # DTYPE: bf16
@@ -845,7 +847,7 @@ get_output_prefix() {
     pre="${pre}_sp${SP}_pp${PP}_tp${TP}_${DTYPE}_opt${OPT}"
     pre="${pre}_lr${LR}_lwf${LR_WARMUP_FRAC}"
     if [[ -n "${TOKENIZER_TYPE:-}" ]]; then
-        _tok=$(echo "${TOKENIZER_TYPE}" | sed 's/Tokenizer//g')  # noqa
+        _tok=$(echo "${TOKENIZER_TYPE}" | sed 's/Tokenizer//g') # noqa
         pre="${pre}_tok${_tok}"
     fi
     if [[ -n "${LR_DECAY_ITERS}" ]]; then
@@ -1111,7 +1113,7 @@ setData() { # ------------------------[dfl: abbrv. for DATA_FILE_LIST]
 }
 
 generateDSconfig_new() {
-    cat <<EOT > "${CONFIG_JSON}"
+    cat <<EOT >"${CONFIG_JSON}"
     {
     "train_batch_size" : $GLOBAL_BATCH,
     "train_micro_batch_size_per_gpu": $MICRO_BATCH,
@@ -1164,11 +1166,11 @@ generateDSconfig() {
     common="\
         \"train_batch_size\": $GLOBAL_BATCH,
         \"train_micro_batch_size_per_gpu\": $MICRO_BATCH,
+        \"gradient_clipping\": 1.0,
         \"steps_per_print\": 1,
         \"gradient_accumulation_steps\": $GRAD_ACC_STEPS,
         \"zero_force_ds_cpu_optimizer\": false,
         \"zero_allow_untested_optimizer\": true,
-        \"gradient_clipping\": 1.0,
         \"wall_clock_breakdown\": false,"
     # if [[ "${USE_ACTIVATION_CHECKPOINTING}" == 1 ]]; then
     #     activation_checkpointing="\
@@ -1178,8 +1180,8 @@ generateDSconfig() {
     #         },"
     # fi
     if [[ $DTYPE == "bf16" ]]; then
+        # \"communication_data_type\": \"bf16\",
         dtype="\
-            \"communication_data_type\": \"bf16\",
             \"fp16\": {
               \"enabled\": false,
               \"loss_scale\": 0,
@@ -1214,9 +1216,9 @@ generateDSconfig() {
                 \"type\": \"AdamW\",
                 \"params\": {
                 \"lr\": ${LR},
-                \"beta1\": 0.9,
-                \"beta2\": 0.95,
-                \"eps\": 1e-5,
+                \"beta1\": ${ADAM_BETA1},
+                \"beta2\": ${ADAM_BETA2},
+                \"eps\": ${ADAM_EPS},
                 \"weight_decay\": 1e-1
             },
         },"

From 160d6a69717689226798ae8e378ea4c7b2866ad4 Mon Sep 17 00:00:00 2001
From: Sam Foreman <saforem2@gmail.com>
Date: Wed, 30 Oct 2024 21:25:53 -0500
Subject: [PATCH 31/31] fix: `GRAD_ACC_STEPS` when `NHOSTS == 256`

---
 ALCF/helpers.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ALCF/helpers.sh b/ALCF/helpers.sh
index 603bc88fdb2..fac2f4d8ecb 100644
--- a/ALCF/helpers.sh
+++ b/ALCF/helpers.sh
@@ -508,7 +508,7 @@ get_grad_acc_steps_on_aurora() {
         exit 1
     fi
     nhosts=$(wc -l <"${hf}")
-    if [[ "${nhosts}" -ge 256 ]]; then
+    if [[ "${nhosts}" -gt 256 ]]; then
         gas=1
     elif [[ 128 -le "${nhosts}" && "${nhosts}" -lt 256 ]]; then
         gas=2