From efa7d4a28046124d024c71b3789cc4e335003bf8 Mon Sep 17 00:00:00 2001
From: Alexandros Koumparoulis <akoumparouli@nvidia.com>
Date: Fri, 17 Apr 2026 10:25:07 -0700
Subject: [PATCH 1/5] =?UTF-8?q?The=204=20failing=20tests=20in=20tests/unit?=
 =?UTF-8?q?=5Ftests/distributed/test=5Fmesh=5Futils.py=20were=20over-speci?=
 =?UTF-8?q?fying=20implementation=20details.=20The=20implementation=20of?=
 =?UTF-8?q?=20get=5Ffsdp=5Fdp=5Fmesh=20=20=20(nemo=5Fautomodel/components/?=
 =?UTF-8?q?distributed/mesh=5Futils.py:414)=20correctly=20probes=20axis=20?=
 =?UTF-8?q?sizes=20via=20device=5Fmesh[cp=5Fname].size()=20and=20device=5F?=
 =?UTF-8?q?mesh[dp=5Freplicate=5Fname].size()=20before=20slicing,=20which?=
 =?UTF-8?q?=20makes=202=E2=80=933=20=5F=5Fgetitem=5F=5F=20calls=20=20=20?=
 =?UTF-8?q?=20total.=20The=20tests=20asserted=20assert=5Fcalled=5Fonce=5Fw?=
 =?UTF-8?q?ith(...)=20/=20assert=5Fnot=5Fcalled(),=20which=20conflicts=20w?=
 =?UTF-8?q?ith=20those=20size=20probes.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix in tests/unit_tests/distributed/test_mesh_utils.py:
  - Lines 364, 379, 393: assert_called_once_with → assert_any_call. The existing result._key and result._mesh is mesh assertions already verify the real guarantees (correct slice key + shared root mesh).
  - Lines 414–415: replaced __getitem__.assert_not_called() with a filter that only forbids tuple-key direct slices (the actual guarantee), allowing bare-name size probes.

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
---
 tests/unit_tests/distributed/test_mesh_utils.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/tests/unit_tests/distributed/test_mesh_utils.py b/tests/unit_tests/distributed/test_mesh_utils.py
index d2890db1d5..dfc6abf79a 100644
--- a/tests/unit_tests/distributed/test_mesh_utils.py
+++ b/tests/unit_tests/distributed/test_mesh_utils.py
@@ -361,7 +361,7 @@ def test_no_replicate_no_cp_returns_dp_shard_slice(self):
 
         result = get_fsdp_dp_mesh(mesh)
 
-        mesh.__getitem__.assert_called_once_with("dp_shard")
+        mesh.__getitem__.assert_any_call("dp_shard")
         assert result._key == "dp_shard"
         # Returned mesh is a slice of the original, not a freshly built one.
         assert result._mesh is mesh
@@ -376,7 +376,7 @@ def test_replicate_only_returns_replicate_shard_tuple_slice(self):
 
         result = get_fsdp_dp_mesh(mesh)
 
-        mesh.__getitem__.assert_called_once_with(("dp_replicate", "dp_shard"))
+        mesh.__getitem__.assert_any_call(("dp_replicate", "dp_shard"))
         assert result._key == ("dp_replicate", "dp_shard")
         assert result._mesh is mesh
 
@@ -390,7 +390,7 @@ def test_cp_only_returns_shard_cp_tuple_slice(self):
 
         result = get_fsdp_dp_mesh(mesh)
 
-        mesh.__getitem__.assert_called_once_with(("dp_shard", "cp"))
+        mesh.__getitem__.assert_any_call(("dp_shard", "cp"))
         assert result._key == ("dp_shard", "cp")
         assert result._mesh is mesh
 
@@ -411,8 +411,10 @@ def test_replicate_and_cp_falls_back_to_get_submesh(self):
 
         mock_get_submesh.assert_called_once_with(mesh, ("dp_replicate", "dp_shard_cp"))
         assert result is submesh_sentinel
-        # __getitem__ must NOT have been called directly.
-        mesh.__getitem__.assert_not_called()
+        # The returned mesh must come from get_submesh, not from a direct
+        # __getitem__ slice. Size probes via __getitem__ are allowed.
+        direct_slice_calls = [c for c in mesh.__getitem__.call_args_list if isinstance(c.args[0], tuple)]
+        assert direct_slice_calls == []
 
     # ------------------------------------------------------------------
     # Branch 5 – native dims not available → get_submesh fallback

From 02868894b3c9b9d140cc0b45822506c0fb133eb9 Mon Sep 17 00:00:00 2001
From: Alexandros Koumparoulis <akoumparouli@nvidia.com>
Date: Fri, 17 Apr 2026 10:29:55 -0700
Subject: [PATCH 2/5] patch get_fsdp_dp_mesh instead of get_submesh

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
---
 .../distributed/test_parallelization_strategies.py          | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/unit_tests/distributed/test_parallelization_strategies.py b/tests/unit_tests/distributed/test_parallelization_strategies.py
index 09b2fd13d3..91c4aecd15 100644
--- a/tests/unit_tests/distributed/test_parallelization_strategies.py
+++ b/tests/unit_tests/distributed/test_parallelization_strategies.py
@@ -583,12 +583,12 @@ def mesh_tp2(self):
         return mesh, dp_mesh, tp_mesh
 
     def _mock_env(self, monkeypatch, dp_mesh_sentinel=None):
-        # Mock get_submesh to return the known dp_mesh sentinel from the fixture,
+        # Mock get_fsdp_dp_mesh to return the known dp_mesh sentinel from the fixture,
         # so we can assert the correct mesh is forwarded to apply_fsdp.
         if dp_mesh_sentinel is not None:
             monkeypatch.setattr(
-                "nemo_automodel.components.distributed.parallelizer.get_submesh",
-                lambda mesh, names: dp_mesh_sentinel,
+                "nemo_automodel.components.distributed.parallelizer.get_fsdp_dp_mesh",
+                lambda mesh, *a, **kw: dp_mesh_sentinel,
             )
 
         fully_shard_mock = MagicMock(side_effect=lambda model, **kwargs: model)

From 7e8807d906798140f9bd59084da5f481b31643f7 Mon Sep 17 00:00:00 2001
From: Alexandros Koumparoulis <akoumparouli@nvidia.com>
Date: Fri, 17 Apr 2026 10:32:01 -0700
Subject: [PATCH 3/5] update

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
---
 .../distributed/test_parallelizer.py          | 28 ++++++++++---------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/tests/unit_tests/distributed/test_parallelizer.py b/tests/unit_tests/distributed/test_parallelizer.py
index 86b09ae976..bf47118aa3 100644
--- a/tests/unit_tests/distributed/test_parallelizer.py
+++ b/tests/unit_tests/distributed/test_parallelizer.py
@@ -727,6 +727,7 @@ def test_apply_fsdp_sharding_module_list(
         self, mock_fully_shard, mock_module_list, mock_mesh, mock_mp_policy, mock_offload_policy
     ):
         """Test apply_fsdp2_sharding_recursively with a ModuleList."""
+
         # Set up mock return values - add FSDP2 prefetch methods that fully_shard normally provides
         def mock_shard(x, **kwargs):
             x.set_modules_to_forward_prefetch = MagicMock()
@@ -761,6 +762,7 @@ def test_apply_fsdp_sharding_module_list_without_offload_policy(
         self, mock_fully_shard, mock_module_list, mock_mesh, mock_mp_policy
     ):
         """Test apply_fsdp2_sharding_recursively with a ModuleList and no offload policy."""
+
         # Set up mock return values - add FSDP2 prefetch methods that fully_shard normally provides
         def mock_shard(x, **kwargs):
             x.set_modules_to_forward_prefetch = MagicMock()
@@ -969,11 +971,15 @@ class via type(...) that preserves __module__ and __qualname__ from the original
 
         # Simulate _get_mixin_wrapped_class: create a *new* class object that copies
         # __module__ and __qualname__ from the original (same qualname, different object)
-        WrappedCls = type(original_cls.__name__, (nn.Module,), {
-            "forward": lambda self, x: x,
-            "__module__": original_cls.__module__,
-            "__qualname__": original_cls.__qualname__,
-        })
+        WrappedCls = type(
+            original_cls.__name__,
+            (nn.Module,),
+            {
+                "forward": lambda self, x: x,
+                "__module__": original_cls.__module__,
+                "__qualname__": original_cls.__qualname__,
+            },
+        )
         assert WrappedCls is not original_cls
         assert _get_class_qualname(WrappedCls) == _get_class_qualname(original_cls)
 
@@ -1276,8 +1282,8 @@ def forward(self, x):
             lambda *a, **kw: None,
         )
         monkeypatch.setattr(
-            "nemo_automodel.components.distributed.parallelizer.get_submesh",
-            lambda mesh, names: MagicMock(),
+            "nemo_automodel.components.distributed.parallelizer.get_fsdp_dp_mesh",
+            lambda mesh, *a, **kw: MagicMock(),
         )
 
     def _run_parallelize(self, model, activation_checkpointing=True):
@@ -1313,17 +1319,13 @@ def test_use_cache_disabled_without_kv_sharing(self):
 
     def test_use_cache_preserved_flat_config(self):
         """KV-sharing detected through a flat config (no text_config nesting)."""
-        model = _make_model_for_ac(
-            use_cache=True, num_kv_shared_layers=10, text_config_nested=False
-        )
+        model = _make_model_for_ac(use_cache=True, num_kv_shared_layers=10, text_config_nested=False)
         self._run_parallelize(model)
         assert model.config.use_cache is True
 
     def test_use_cache_disabled_flat_config_no_sharing(self):
         """Flat config without KV sharing still disables cache."""
-        model = _make_model_for_ac(
-            use_cache=True, num_kv_shared_layers=0, text_config_nested=False
-        )
+        model = _make_model_for_ac(use_cache=True, num_kv_shared_layers=0, text_config_nested=False)
         self._run_parallelize(model)
         assert model.config.use_cache is False
 

From 50e5dfb1c97d3a6a3db9cbc4383ff936464b9146 Mon Sep 17 00:00:00 2001
From: Alexandros Koumparoulis <akoumparouli@nvidia.com>
Date: Fri, 17 Apr 2026 12:27:39 -0700
Subject: [PATCH 4/5] update path

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
---
 .../L2_HF_Transformer_PEFT_Benchmark_Llama_custom.sh          | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/functional_tests/hf_transformer_finetune/L2_HF_Transformer_PEFT_Benchmark_Llama_custom.sh b/tests/functional_tests/hf_transformer_finetune/L2_HF_Transformer_PEFT_Benchmark_Llama_custom.sh
index 06076d79e3..a4a4b6879a 100644
--- a/tests/functional_tests/hf_transformer_finetune/L2_HF_Transformer_PEFT_Benchmark_Llama_custom.sh
+++ b/tests/functional_tests/hf_transformer_finetune/L2_HF_Transformer_PEFT_Benchmark_Llama_custom.sh
@@ -24,11 +24,11 @@ MODEL_PATH=/home/TestData/HF_HOME/hub/models--meta-llama--Llama-3.2-1B/snapshots
 # override with a smaller model meta-llama/Llama-3.2-1B for testing
 TRANSFORMERS_OFFLINE=1 python -m torch.distributed.run --nproc_per_node=2 --nnodes=1 -m coverage run \
 nemo_automodel/recipes/llm/benchmark.py \
-    --config examples/llm_finetune/llama3_3/custom_llama3_3_70b_instruct_peft_benchmark.yaml \
+    --config examples/llm_benchmark/llama3_3/custom_llama3_3_70b_instruct_peft_benchmark.yaml \
     --model.pretrained_model_name_or_path=${MODEL_PATH} \
     --model.num_hidden_layers=2 \
     --distributed.tp_size=2 \
     --distributed.pp_size=1 \
     --distributed_config.sequence_parallel=False \
     --benchmark.warmup_steps=2 \
-    --step_scheduler.max_steps=4
\ No newline at end of file
+    --step_scheduler.max_steps=4

From c921559d19ff18d4342b454f0c039c1fb72dda8e Mon Sep 17 00:00:00 2001
From: Alexandros Koumparoulis <akoumparouli@nvidia.com>
Date: Fri, 17 Apr 2026 13:53:36 -0700
Subject: [PATCH 5/5] fix

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
---
 .../L2_HF_Transformer_PEFT_Benchmark_qwen2_custom.sh          | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/functional_tests/hf_transformer_finetune/L2_HF_Transformer_PEFT_Benchmark_qwen2_custom.sh b/tests/functional_tests/hf_transformer_finetune/L2_HF_Transformer_PEFT_Benchmark_qwen2_custom.sh
index 6bb3626048..560bbfd68f 100644
--- a/tests/functional_tests/hf_transformer_finetune/L2_HF_Transformer_PEFT_Benchmark_qwen2_custom.sh
+++ b/tests/functional_tests/hf_transformer_finetune/L2_HF_Transformer_PEFT_Benchmark_qwen2_custom.sh
@@ -24,7 +24,7 @@ MODEL_PATH=/home/TestData/HF_HOME/hub/models--Qwen--Qwen2.5-1.5B/snapshots/8faed
 # override with a smaller model Qwen/Qwen2.5-1.5B for testing
 TRANSFORMERS_OFFLINE=1 python -m torch.distributed.run --master-port=29513 --nproc_per_node=2 --nnodes=1 -m coverage run \
 nemo_automodel/recipes/llm/benchmark.py \
-    --config examples/llm_finetune/qwen/custom_qwen2_5_32b_peft_benchmark.yaml \
+    --config examples/llm_benchmark/qwen/custom_qwen2_5_32b_peft_benchmark.yaml \
     --model.pretrained_model_name_or_path=${MODEL_PATH} \
     --model.num_hidden_layers=2 \
     --distributed.tp_size=2 \
@@ -34,4 +34,4 @@ nemo_automodel/recipes/llm/benchmark.py \
     --step_scheduler.max_steps=4 \
     --step_scheduler.global_batch_size=2 \
     --step_scheduler.local_batch_size=1 \
-    --dataset.seq_len=256
\ No newline at end of file
+    --dataset.seq_len=256