From 6822cc0b95055133e7c58ab52eb67b0d4af57bb6 Mon Sep 17 00:00:00 2001 From: Jun Ru Anderson Date: Thu, 20 Aug 2020 17:25:23 -0700 Subject: [PATCH 1/5] make tests deterministic and add TODO to fix state dict --- fairscale/optim/adam.py | 4 ++++ tests/optim/test_adam.py | 12 ++++++++++++ 2 files changed, 16 insertions(+) diff --git a/fairscale/optim/adam.py b/fairscale/optim/adam.py index 905eec5c4..2fd82d371 100644 --- a/fairscale/optim/adam.py +++ b/fairscale/optim/adam.py @@ -147,6 +147,10 @@ def mixed_precision(self) -> bool: def load_state_dict(self, state_dict: Dict[str, Any]) -> None: super().load_state_dict(state_dict) + + # TODO: Optimizer state gets cast to FP16 and back to FP32 for + # mixed-precision and memory-efficient mixed-precision. Eventually + # we want to fix this so that precision is not lost for group in self.param_groups: for p in group["params"]: self.state[p]["exp_avg"] = self.state[p]["exp_avg"].type(self.optim_type) diff --git a/tests/optim/test_adam.py b/tests/optim/test_adam.py index 54ff8a207..6ea154881 100644 --- a/tests/optim/test_adam.py +++ b/tests/optim/test_adam.py @@ -20,6 +20,12 @@ skip_if_no_adam = pytest.mark.skipif(not imported_adam, reason="Fairscale Adam not available") +@pytest.fixture(autouse=True) +def set_torch_seed(): + torch.manual_seed(1) + yield + + def make_full_precision_params(): weight = torch.randn(2, 1).cuda().requires_grad_() bias = torch.randn(2).cuda().requires_grad_() @@ -79,6 +85,12 @@ def fn_base(optimizer, weight, bias, input): for _i in range(5): optimizer.step(fn) optimizer_c.step(fn_c) + + # TODO: Optimizer state gets cast to FP16 and back to FP32 for + # mixed-precision and memory-efficient mixed-precision, resulting + # in a loss of precision, so we don't expect the parameters to + # remain the exact same. This therefore does NOT pass for all + # torch manual seeds (weight - weight_c).to("cpu").detach().apply_(assert_almost_zero) (bias - bias_c).to("cpu").detach().apply_(assert_almost_zero) From 04186df885a01c769edfbf19cd99a3920d9a3418 Mon Sep 17 00:00:00 2001 From: Jun Ru Anderson Date: Thu, 20 Aug 2020 17:30:10 -0700 Subject: [PATCH 2/5] precommit --- fairscale/optim/adam.py | 2 +- tests/optim/test_adam.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fairscale/optim/adam.py b/fairscale/optim/adam.py index 2fd82d371..91936c05e 100644 --- a/fairscale/optim/adam.py +++ b/fairscale/optim/adam.py @@ -148,7 +148,7 @@ def mixed_precision(self) -> bool: def load_state_dict(self, state_dict: Dict[str, Any]) -> None: super().load_state_dict(state_dict) - # TODO: Optimizer state gets cast to FP16 and back to FP32 for + # TODO: Optimizer state gets cast to FP16 and back to FP32 for # mixed-precision and memory-efficient mixed-precision. Eventually # we want to fix this so that precision is not lost for group in self.param_groups: diff --git a/tests/optim/test_adam.py b/tests/optim/test_adam.py index 6ea154881..74d6f57ea 100644 --- a/tests/optim/test_adam.py +++ b/tests/optim/test_adam.py @@ -86,7 +86,7 @@ def fn_base(optimizer, weight, bias, input): optimizer.step(fn) optimizer_c.step(fn_c) - # TODO: Optimizer state gets cast to FP16 and back to FP32 for + # TODO: Optimizer state gets cast to FP16 and back to FP32 for # mixed-precision and memory-efficient mixed-precision, resulting # in a loss of precision, so we don't expect the parameters to # remain the exact same. This therefore does NOT pass for all From 935ea966d16032e629229c56dcbe567d50d080b6 Mon Sep 17 00:00:00 2001 From: Jun Ru Anderson Date: Fri, 21 Aug 2020 09:21:13 -0700 Subject: [PATCH 3/5] so benchmarks don't error if using torch.optim.Adam --- benchmarks/transformer.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/benchmarks/transformer.py b/benchmarks/transformer.py index 962871668..7c2cbfce8 100644 --- a/benchmarks/transformer.py +++ b/benchmarks/transformer.py @@ -135,7 +135,11 @@ def make_model(device, ntokens): criterion = nn.CrossEntropyLoss() lr = 0.01 # learning rate - optimizer = Adam(p.parameters(), lr=lr, precision=Precision.MIXED_PRECISION) + + try: + optimizer = Adam(p.parameters(), lr=lr, precision=Precision.MIXED_PRECISION) + except NameError: + optimizer = Adam(p.parameters(), lr=lr) return p, criterion, optimizer From 0be67415e522f27ecf85b22e07fe1f298d83f89d Mon Sep 17 00:00:00 2001 From: Jun Ru Anderson Date: Fri, 21 Aug 2020 09:37:02 -0700 Subject: [PATCH 4/5] switch to requiring state_dict to remain the exact same and xfailing the cases that are known to be broken --- tests/optim/test_adam.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/tests/optim/test_adam.py b/tests/optim/test_adam.py index 74d6f57ea..2beda2c59 100644 --- a/tests/optim/test_adam.py +++ b/tests/optim/test_adam.py @@ -86,13 +86,8 @@ def fn_base(optimizer, weight, bias, input): optimizer.step(fn) optimizer_c.step(fn_c) - # TODO: Optimizer state gets cast to FP16 and back to FP32 for - # mixed-precision and memory-efficient mixed-precision, resulting - # in a loss of precision, so we don't expect the parameters to - # remain the exact same. This therefore does NOT pass for all - # torch manual seeds - (weight - weight_c).to("cpu").detach().apply_(assert_almost_zero) - (bias - bias_c).to("cpu").detach().apply_(assert_almost_zero) + assert torch.equal(weight, weight_c) + assert torch.equal(bias, bias_c) def assert_almost_zero(x): @@ -242,7 +237,12 @@ def test_state_dict_full_precision(): @skip_if_no_cuda @skip_if_no_adam +@pytest.mark.xfail def test_state_dict_mixed_precision(): + # TODO: Optimizer state gets cast to FP16 and back to FP32 for + # mixed-precision and memory-efficient mixed-precision, resulting + # in a potential loss of precision. Thus, as training proceeds, we don't + # necessarily expect the parameters to remain the exact same. weight, bias, input = make_half_precision_params() optimizer = Adam([weight, bias], lr=1e-3, precision=Precision.MIXED_PRECISION) @@ -251,7 +251,12 @@ def test_state_dict_mixed_precision(): @skip_if_no_cuda @skip_if_no_adam +@pytest.mark.xfail def test_state_dict_memory_efficient(): + # TODO: Optimizer state gets cast to FP16 and back to FP32 for + # mixed-precision and memory-efficient mixed-precision, resulting + # in a potential loss of precision. Thus, as training proceeds, we don't + # necessarily expect the parameters to remain the exact same. weight, bias, input = make_half_precision_params() optimizer = Adam([weight, bias], lr=1e-3, precision=Precision.MEMORY_EFFICIENT_MIXED_PRECISION) From 116c36bcfd12a3a5322c44dbd137477391b417bd Mon Sep 17 00:00:00 2001 From: Jun Ru Anderson Date: Fri, 21 Aug 2020 10:10:16 -0700 Subject: [PATCH 5/5] more fiddling --- fairscale/optim/adam.py | 2 +- tests/optim/test_adam.py | 13 +++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/fairscale/optim/adam.py b/fairscale/optim/adam.py index 91936c05e..531ea7dce 100644 --- a/fairscale/optim/adam.py +++ b/fairscale/optim/adam.py @@ -150,7 +150,7 @@ def load_state_dict(self, state_dict: Dict[str, Any]) -> None: # TODO: Optimizer state gets cast to FP16 and back to FP32 for # mixed-precision and memory-efficient mixed-precision. Eventually - # we want to fix this so that precision is not lost + # we want to fix this, as some precision may be lost for group in self.param_groups: for p in group["params"]: self.state[p]["exp_avg"] = self.state[p]["exp_avg"].type(self.optim_type) diff --git a/tests/optim/test_adam.py b/tests/optim/test_adam.py index 2beda2c59..cd047bc35 100644 --- a/tests/optim/test_adam.py +++ b/tests/optim/test_adam.py @@ -81,6 +81,19 @@ def fn_base(optimizer, weight, bias, input): # Load state dict state_dict = deepcopy(optimizer.state_dict()) optimizer_c.load_state_dict(state_dict) + + for group, group_c in zip(optimizer.param_groups, optimizer_c.param_groups): + for p, p_c in zip(group["params"], group_c["params"]): + assert torch.equal(optimizer.state[p]["exp_avg"], optimizer_c.state[p_c]["exp_avg"]) + assert torch.equal(optimizer.state[p]["exp_avg_sq"], optimizer_c.state[p_c]["exp_avg_sq"]) + + if optimizer.fp32_param_groups: + # When using mixed precision, fp32_param_groups are made from FP16 params rather than + # copied via state_dict, introducing differences between the original optimizer and + # the copy. Because this test requires that they be the exact same, we copy the + # fp32 params from the original optimizer to the copy + optimizer_c.fp32_param_groups = deepcopy(optimizer.fp32_param_groups) + # Run both optimizations in parallel for _i in range(5): optimizer.step(fn)