From efa7d4a28046124d024c71b3789cc4e335003bf8 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Fri, 17 Apr 2026 10:25:07 -0700 Subject: [PATCH 1/5] =?UTF-8?q?The=204=20failing=20tests=20in=20tests/unit?= =?UTF-8?q?=5Ftests/distributed/test=5Fmesh=5Futils.py=20were=20over-speci?= =?UTF-8?q?fying=20implementation=20details.=20The=20implementation=20of?= =?UTF-8?q?=20get=5Ffsdp=5Fdp=5Fmesh=20=20=20(nemo=5Fautomodel/components/?= =?UTF-8?q?distributed/mesh=5Futils.py:414)=20correctly=20probes=20axis=20?= =?UTF-8?q?sizes=20via=20device=5Fmesh[cp=5Fname].size()=20and=20device=5F?= =?UTF-8?q?mesh[dp=5Freplicate=5Fname].size()=20before=20slicing,=20which?= =?UTF-8?q?=20makes=202=E2=80=933=20=5F=5Fgetitem=5F=5F=20calls=20=20=20?= =?UTF-8?q?=20total.=20The=20tests=20asserted=20assert=5Fcalled=5Fonce=5Fw?= =?UTF-8?q?ith(...)=20/=20assert=5Fnot=5Fcalled(),=20which=20conflicts=20w?= =?UTF-8?q?ith=20those=20size=20probes.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix in tests/unit_tests/distributed/test_mesh_utils.py: - Lines 364, 379, 393: assert_called_once_with → assert_any_call. The existing result._key and result._mesh is mesh assertions already verify the real guarantees (correct slice key + shared root mesh). - Lines 414–415: replaced __getitem__.assert_not_called() with a filter that only forbids tuple-key direct slices (the actual guarantee), allowing bare-name size probes. Signed-off-by: Alexandros Koumparoulis --- tests/unit_tests/distributed/test_mesh_utils.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tests/unit_tests/distributed/test_mesh_utils.py b/tests/unit_tests/distributed/test_mesh_utils.py index d2890db1d5..dfc6abf79a 100644 --- a/tests/unit_tests/distributed/test_mesh_utils.py +++ b/tests/unit_tests/distributed/test_mesh_utils.py @@ -361,7 +361,7 @@ def test_no_replicate_no_cp_returns_dp_shard_slice(self): result = get_fsdp_dp_mesh(mesh) - mesh.__getitem__.assert_called_once_with("dp_shard") + mesh.__getitem__.assert_any_call("dp_shard") assert result._key == "dp_shard" # Returned mesh is a slice of the original, not a freshly built one. assert result._mesh is mesh @@ -376,7 +376,7 @@ def test_replicate_only_returns_replicate_shard_tuple_slice(self): result = get_fsdp_dp_mesh(mesh) - mesh.__getitem__.assert_called_once_with(("dp_replicate", "dp_shard")) + mesh.__getitem__.assert_any_call(("dp_replicate", "dp_shard")) assert result._key == ("dp_replicate", "dp_shard") assert result._mesh is mesh @@ -390,7 +390,7 @@ def test_cp_only_returns_shard_cp_tuple_slice(self): result = get_fsdp_dp_mesh(mesh) - mesh.__getitem__.assert_called_once_with(("dp_shard", "cp")) + mesh.__getitem__.assert_any_call(("dp_shard", "cp")) assert result._key == ("dp_shard", "cp") assert result._mesh is mesh @@ -411,8 +411,10 @@ def test_replicate_and_cp_falls_back_to_get_submesh(self): mock_get_submesh.assert_called_once_with(mesh, ("dp_replicate", "dp_shard_cp")) assert result is submesh_sentinel - # __getitem__ must NOT have been called directly. - mesh.__getitem__.assert_not_called() + # The returned mesh must come from get_submesh, not from a direct + # __getitem__ slice. Size probes via __getitem__ are allowed. + direct_slice_calls = [c for c in mesh.__getitem__.call_args_list if isinstance(c.args[0], tuple)] + assert direct_slice_calls == [] # ------------------------------------------------------------------ # Branch 5 – native dims not available → get_submesh fallback From 02868894b3c9b9d140cc0b45822506c0fb133eb9 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Fri, 17 Apr 2026 10:29:55 -0700 Subject: [PATCH 2/5] patch get_fsdp_dp_mesh instead of get_submesh Signed-off-by: Alexandros Koumparoulis --- .../distributed/test_parallelization_strategies.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/unit_tests/distributed/test_parallelization_strategies.py b/tests/unit_tests/distributed/test_parallelization_strategies.py index 09b2fd13d3..91c4aecd15 100644 --- a/tests/unit_tests/distributed/test_parallelization_strategies.py +++ b/tests/unit_tests/distributed/test_parallelization_strategies.py @@ -583,12 +583,12 @@ def mesh_tp2(self): return mesh, dp_mesh, tp_mesh def _mock_env(self, monkeypatch, dp_mesh_sentinel=None): - # Mock get_submesh to return the known dp_mesh sentinel from the fixture, + # Mock get_fsdp_dp_mesh to return the known dp_mesh sentinel from the fixture, # so we can assert the correct mesh is forwarded to apply_fsdp. if dp_mesh_sentinel is not None: monkeypatch.setattr( - "nemo_automodel.components.distributed.parallelizer.get_submesh", - lambda mesh, names: dp_mesh_sentinel, + "nemo_automodel.components.distributed.parallelizer.get_fsdp_dp_mesh", + lambda mesh, *a, **kw: dp_mesh_sentinel, ) fully_shard_mock = MagicMock(side_effect=lambda model, **kwargs: model) From 7e8807d906798140f9bd59084da5f481b31643f7 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Fri, 17 Apr 2026 10:32:01 -0700 Subject: [PATCH 3/5] update Signed-off-by: Alexandros Koumparoulis --- .../distributed/test_parallelizer.py | 28 ++++++++++--------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/tests/unit_tests/distributed/test_parallelizer.py b/tests/unit_tests/distributed/test_parallelizer.py index 86b09ae976..bf47118aa3 100644 --- a/tests/unit_tests/distributed/test_parallelizer.py +++ b/tests/unit_tests/distributed/test_parallelizer.py @@ -727,6 +727,7 @@ def test_apply_fsdp_sharding_module_list( self, mock_fully_shard, mock_module_list, mock_mesh, mock_mp_policy, mock_offload_policy ): """Test apply_fsdp2_sharding_recursively with a ModuleList.""" + # Set up mock return values - add FSDP2 prefetch methods that fully_shard normally provides def mock_shard(x, **kwargs): x.set_modules_to_forward_prefetch = MagicMock() @@ -761,6 +762,7 @@ def test_apply_fsdp_sharding_module_list_without_offload_policy( self, mock_fully_shard, mock_module_list, mock_mesh, mock_mp_policy ): """Test apply_fsdp2_sharding_recursively with a ModuleList and no offload policy.""" + # Set up mock return values - add FSDP2 prefetch methods that fully_shard normally provides def mock_shard(x, **kwargs): x.set_modules_to_forward_prefetch = MagicMock() @@ -969,11 +971,15 @@ class via type(...) that preserves __module__ and __qualname__ from the original # Simulate _get_mixin_wrapped_class: create a *new* class object that copies # __module__ and __qualname__ from the original (same qualname, different object) - WrappedCls = type(original_cls.__name__, (nn.Module,), { - "forward": lambda self, x: x, - "__module__": original_cls.__module__, - "__qualname__": original_cls.__qualname__, - }) + WrappedCls = type( + original_cls.__name__, + (nn.Module,), + { + "forward": lambda self, x: x, + "__module__": original_cls.__module__, + "__qualname__": original_cls.__qualname__, + }, + ) assert WrappedCls is not original_cls assert _get_class_qualname(WrappedCls) == _get_class_qualname(original_cls) @@ -1276,8 +1282,8 @@ def forward(self, x): lambda *a, **kw: None, ) monkeypatch.setattr( - "nemo_automodel.components.distributed.parallelizer.get_submesh", - lambda mesh, names: MagicMock(), + "nemo_automodel.components.distributed.parallelizer.get_fsdp_dp_mesh", + lambda mesh, *a, **kw: MagicMock(), ) def _run_parallelize(self, model, activation_checkpointing=True): @@ -1313,17 +1319,13 @@ def test_use_cache_disabled_without_kv_sharing(self): def test_use_cache_preserved_flat_config(self): """KV-sharing detected through a flat config (no text_config nesting).""" - model = _make_model_for_ac( - use_cache=True, num_kv_shared_layers=10, text_config_nested=False - ) + model = _make_model_for_ac(use_cache=True, num_kv_shared_layers=10, text_config_nested=False) self._run_parallelize(model) assert model.config.use_cache is True def test_use_cache_disabled_flat_config_no_sharing(self): """Flat config without KV sharing still disables cache.""" - model = _make_model_for_ac( - use_cache=True, num_kv_shared_layers=0, text_config_nested=False - ) + model = _make_model_for_ac(use_cache=True, num_kv_shared_layers=0, text_config_nested=False) self._run_parallelize(model) assert model.config.use_cache is False From 50e5dfb1c97d3a6a3db9cbc4383ff936464b9146 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Fri, 17 Apr 2026 12:27:39 -0700 Subject: [PATCH 4/5] update path Signed-off-by: Alexandros Koumparoulis --- .../L2_HF_Transformer_PEFT_Benchmark_Llama_custom.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/functional_tests/hf_transformer_finetune/L2_HF_Transformer_PEFT_Benchmark_Llama_custom.sh b/tests/functional_tests/hf_transformer_finetune/L2_HF_Transformer_PEFT_Benchmark_Llama_custom.sh index 06076d79e3..a4a4b6879a 100644 --- a/tests/functional_tests/hf_transformer_finetune/L2_HF_Transformer_PEFT_Benchmark_Llama_custom.sh +++ b/tests/functional_tests/hf_transformer_finetune/L2_HF_Transformer_PEFT_Benchmark_Llama_custom.sh @@ -24,11 +24,11 @@ MODEL_PATH=/home/TestData/HF_HOME/hub/models--meta-llama--Llama-3.2-1B/snapshots # override with a smaller model meta-llama/Llama-3.2-1B for testing TRANSFORMERS_OFFLINE=1 python -m torch.distributed.run --nproc_per_node=2 --nnodes=1 -m coverage run \ nemo_automodel/recipes/llm/benchmark.py \ - --config examples/llm_finetune/llama3_3/custom_llama3_3_70b_instruct_peft_benchmark.yaml \ + --config examples/llm_benchmark/llama3_3/custom_llama3_3_70b_instruct_peft_benchmark.yaml \ --model.pretrained_model_name_or_path=${MODEL_PATH} \ --model.num_hidden_layers=2 \ --distributed.tp_size=2 \ --distributed.pp_size=1 \ --distributed_config.sequence_parallel=False \ --benchmark.warmup_steps=2 \ - --step_scheduler.max_steps=4 \ No newline at end of file + --step_scheduler.max_steps=4 From c921559d19ff18d4342b454f0c039c1fb72dda8e Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Fri, 17 Apr 2026 13:53:36 -0700 Subject: [PATCH 5/5] fix Signed-off-by: Alexandros Koumparoulis --- .../L2_HF_Transformer_PEFT_Benchmark_qwen2_custom.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/functional_tests/hf_transformer_finetune/L2_HF_Transformer_PEFT_Benchmark_qwen2_custom.sh b/tests/functional_tests/hf_transformer_finetune/L2_HF_Transformer_PEFT_Benchmark_qwen2_custom.sh index 6bb3626048..560bbfd68f 100644 --- a/tests/functional_tests/hf_transformer_finetune/L2_HF_Transformer_PEFT_Benchmark_qwen2_custom.sh +++ b/tests/functional_tests/hf_transformer_finetune/L2_HF_Transformer_PEFT_Benchmark_qwen2_custom.sh @@ -24,7 +24,7 @@ MODEL_PATH=/home/TestData/HF_HOME/hub/models--Qwen--Qwen2.5-1.5B/snapshots/8faed # override with a smaller model Qwen/Qwen2.5-1.5B for testing TRANSFORMERS_OFFLINE=1 python -m torch.distributed.run --master-port=29513 --nproc_per_node=2 --nnodes=1 -m coverage run \ nemo_automodel/recipes/llm/benchmark.py \ - --config examples/llm_finetune/qwen/custom_qwen2_5_32b_peft_benchmark.yaml \ + --config examples/llm_benchmark/qwen/custom_qwen2_5_32b_peft_benchmark.yaml \ --model.pretrained_model_name_or_path=${MODEL_PATH} \ --model.num_hidden_layers=2 \ --distributed.tp_size=2 \ @@ -34,4 +34,4 @@ nemo_automodel/recipes/llm/benchmark.py \ --step_scheduler.max_steps=4 \ --step_scheduler.global_batch_size=2 \ --step_scheduler.local_batch_size=1 \ - --dataset.seq_len=256 \ No newline at end of file + --dataset.seq_len=256