From f326b0b19185c7c80241a477ba0d58f12db23077 Mon Sep 17 00:00:00 2001 From: Parth Chadha Date: Mon, 28 Apr 2025 12:43:44 -0700 Subject: [PATCH 1/3] fix: fixes #264 where tied weights check didn't work on fsdp1 Signed-off-by: Parth Chadha --- nemo_rl/models/policy/dtensor_policy_worker.py | 6 +++--- nemo_rl/models/policy/fsdp1_policy_worker.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/nemo_rl/models/policy/dtensor_policy_worker.py b/nemo_rl/models/policy/dtensor_policy_worker.py index 39e0dd4974..7ce9cbdadc 100644 --- a/nemo_rl/models/policy/dtensor_policy_worker.py +++ b/nemo_rl/models/policy/dtensor_policy_worker.py @@ -142,6 +142,7 @@ def __init__( device_map="cpu", # load weights onto CPU initially torch_dtype=torch.float32, # use full precision in sft until https://github.com/NVIDIA/nemo-rl/issues/13 is fixed ) + self.num_tied_weights = len(find_tied_parameters(self.model)) self.tokenizer = tokenizer # ------------------------------------------------ @@ -256,15 +257,14 @@ def train( mbs: Optional[int] = None, ) -> Dict[str, Any]: """Train the policy on a batch of data with a given loss function.""" - num_tied_weights = len(find_tied_parameters(self.model)) skip_tie_check = os.environ.get("NRL_SKIP_TIED_WEIGHT_CHECK") if ( - num_tied_weights != 0 + self.num_tied_weights != 0 and self.cfg["dtensor_cfg"]["tensor_parallel_size"] > 1 and not skip_tie_check ): raise ValueError( - f"Using dtensor policy with tp size {self.cfg['dtensor_cfg']['tensor_parallel_size']} for model ({self.cfg['model_name']}) that has tied weights (num_tied_weights={num_tied_weights}) is not supported (https://github.com/NVIDIA/nemo-rl/issues/227). Please use dtensor policy with tensor parallel == 1 instead." + f"Using dtensor policy with tp size {self.cfg['dtensor_cfg']['tensor_parallel_size']} for model ({self.cfg['model_name']}) that has tied weights (num_tied_weights={self.num_tied_weights}) is not supported (https://github.com/NVIDIA/nemo-rl/issues/227). 
Please use dtensor policy with tensor parallel == 1 instead." ) if gbs is None: diff --git a/nemo_rl/models/policy/fsdp1_policy_worker.py b/nemo_rl/models/policy/fsdp1_policy_worker.py index b49c2748cf..686be6a53a 100644 --- a/nemo_rl/models/policy/fsdp1_policy_worker.py +++ b/nemo_rl/models/policy/fsdp1_policy_worker.py @@ -94,6 +94,7 @@ def __init__( device_map="cpu", # load weights onto CPU initially torch_dtype=torch.float32, # use full precision in sft until https://github.com/NVIDIA/nemo-rl/issues/13 is fixed ) + self.num_tied_weights = len(find_tied_parameters(self.model)) if init_reference_model: self.reference_model = AutoModelForCausalLM.from_pretrained( @@ -229,11 +230,10 @@ def train( ) -> Dict[str, Any]: """Train the policy on a batch of data with a given loss function.""" # Check if the model has tied weights - num_tied_weights = len(find_tied_parameters(self.model)) skip_tie_check = os.environ.get("NRL_SKIP_TIED_WEIGHT_CHECK") - if num_tied_weights != 0 and not skip_tie_check: + if self.num_tied_weights != 0 and not skip_tie_check: raise ValueError( - f"Using FSP1 with a model ({self.cfg['model_name']}) that has tied weights (num_tied_weights={num_tied_weights}) is not supported (https://github.com/NVIDIA/nemo-rl/issues/227). Please use dtensor policy with tensor parallel == 1 instead." + f"Using FSDP1 with a model ({self.cfg['model_name']}) that has tied weights (num_tied_weights={self.num_tied_weights}) is not supported (https://github.com/NVIDIA/nemo-rl/issues/227). Please use dtensor policy with tensor parallel == 1 instead." 
) if gbs is None: From 33b106cf40c9ef1e3bec972404f549b986a7d490 Mon Sep 17 00:00:00 2001 From: Parth Chadha Date: Mon, 28 Apr 2025 12:54:41 -0700 Subject: [PATCH 2/3] Update nemo_rl/models/policy/dtensor_policy_worker.py Co-authored-by: Sahil Jain <48468750+SahilJain314@users.noreply.github.com> Signed-off-by: Parth Chadha Signed-off-by: Parth Chadha --- nemo_rl/models/policy/dtensor_policy_worker.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nemo_rl/models/policy/dtensor_policy_worker.py b/nemo_rl/models/policy/dtensor_policy_worker.py index 7ce9cbdadc..27dfb4ac77 100644 --- a/nemo_rl/models/policy/dtensor_policy_worker.py +++ b/nemo_rl/models/policy/dtensor_policy_worker.py @@ -142,6 +142,7 @@ def __init__( device_map="cpu", # load weights onto CPU initially torch_dtype=torch.float32, # use full precision in sft until https://github.com/NVIDIA/nemo-rl/issues/13 is fixed ) + # caching since this property is not always preserved after FSDP self.num_tied_weights = len(find_tied_parameters(self.model)) self.tokenizer = tokenizer From ed945a89cead344dd4c729736acfb673c6d1f397 Mon Sep 17 00:00:00 2001 From: Parth Chadha Date: Mon, 28 Apr 2025 12:54:47 -0700 Subject: [PATCH 3/3] Update nemo_rl/models/policy/fsdp1_policy_worker.py Co-authored-by: Sahil Jain <48468750+SahilJain314@users.noreply.github.com> Signed-off-by: Parth Chadha Signed-off-by: Parth Chadha --- nemo_rl/models/policy/fsdp1_policy_worker.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nemo_rl/models/policy/fsdp1_policy_worker.py b/nemo_rl/models/policy/fsdp1_policy_worker.py index 686be6a53a..b25e930b6f 100644 --- a/nemo_rl/models/policy/fsdp1_policy_worker.py +++ b/nemo_rl/models/policy/fsdp1_policy_worker.py @@ -94,6 +94,7 @@ def __init__( device_map="cpu", # load weights onto CPU initially torch_dtype=torch.float32, # use full precision in sft until https://github.com/NVIDIA/nemo-rl/issues/13 is fixed ) + # caching since this property is not always preserved after FSDP 
self.num_tied_weights = len(find_tied_parameters(self.model)) if init_reference_model: