From 9de6d6c886bf8d6c36678e17e41aa677257c63d6 Mon Sep 17 00:00:00 2001
From: Josh <1113285+jsrozner@users.noreply.github.com>
Date: Thu, 4 Mar 2021 15:02:07 -0800
Subject: [PATCH 1/5] Update optimization.py

Fix documentation to reflect optimal settings for Adafactor
---
 src/transformers/optimization.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/transformers/optimization.py b/src/transformers/optimization.py
index e9fee7fda4ac..a5b13315fb44 100644
--- a/src/transformers/optimization.py
+++ b/src/transformers/optimization.py
@@ -410,9 +410,8 @@ class Adafactor(Optimizer):

     Example::

-        Adafactor(model.parameters(), lr=1e-3, relative_step=False, warmup_init=True)
+        Adafactor(model.parameters(), relative_step=True, warmup_init=True)  # lr is by default None

-    - Alternatively, relative_step with warmup_init can be used.
     - Training without LR warmup or clip threshold is not recommended. Additional optimizer operations like
       gradient clipping should not be used alongside Adafactor.

From 49bfa7e8d212c3a22264e6d9037776e0a3052b39 Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Mon, 22 Mar 2021 14:02:52 -0700
Subject: [PATCH 2/5] update and expand on the recommendations

---
 src/transformers/optimization.py | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/src/transformers/optimization.py b/src/transformers/optimization.py
index a5b13315fb44..64e34b54ccb6 100644
--- a/src/transformers/optimization.py
+++ b/src/transformers/optimization.py
@@ -402,18 +402,23 @@ class Adafactor(Optimizer):

     This implementation handles low-precision (FP16, bfloat) values, but we have not thoroughly tested.

-    Recommended T5 finetuning settings:
+    Recommended T5 finetuning settings (https://discuss.huggingface.co/t/t5-finetuning-tips/684/3):

-    - Scheduled LR warm-up to fixed LR
-    - disable relative updates
-    - use clip threshold: https://arxiv.org/abs/2004.14546
+    - Training without LR warmup or clip_threshold is not recommended.
+       * use scheduled LR warm-up to fixed LR
+       * use clip_threshold=1.0 (https://arxiv.org/abs/1804.04235)
+    - Disable relative updates
+    - Use scale_parameter=False
+    - Additional optimizer operations like gradient clipping should not be used alongside Adafactor

     Example::

-        Adafactor(model.parameters(), relative_step=True, warmup_init=True)  # lr is by default None
+        Adafactor(model.parameters(), scale_parameter=False, relative_step=False, warmup_init=False, lr=1e-3)
+
+    Others reported the following combination to work well::
+
+        Adafactor(model.parameters(), scale_parameter=False, relative_step=True, warmup_init=True, lr=None)

-    - Training without LR warmup or clip threshold is not recommended. Additional optimizer operations like
-      gradient clipping should not be used alongside Adafactor.
     Usage::

@@ -446,9 +451,9 @@ def __init__(
         warmup_init=False,
     ):
         if lr is not None and relative_step:
-            raise ValueError("Cannot combine manual lr and relative_step options")
+            raise ValueError("Cannot combine manual lr and relative_step=True options")
         if warmup_init and not relative_step:
-            raise ValueError("warmup_init requires relative_step=True")
+            raise ValueError("warmup_init=True requires relative_step=True")

         defaults = dict(
             lr=lr,

From 41e25280d8dc29e305c014cdefb5b3821ec58ce1 Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Mon, 22 Mar 2021 14:03:20 -0700
Subject: [PATCH 3/5] style

---
 src/transformers/optimization.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/transformers/optimization.py b/src/transformers/optimization.py
index 64e34b54ccb6..6ff586788c24 100644
--- a/src/transformers/optimization.py
+++ b/src/transformers/optimization.py
@@ -405,6 +405,7 @@ class Adafactor(Optimizer):
     Recommended T5 finetuning settings (https://discuss.huggingface.co/t/t5-finetuning-tips/684/3):

     - Training without LR warmup or clip_threshold is not recommended.
+
        * use scheduled LR warm-up to fixed LR
        * use clip_threshold=1.0 (https://arxiv.org/abs/1804.04235)
     - Disable relative updates

From 1469ee5b89bfd9672b34aa625f2a96f3a72b8a92 Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Mon, 22 Mar 2021 14:14:32 -0700
Subject: [PATCH 4/5] Apply suggestions from code review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
---
 src/transformers/optimization.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/transformers/optimization.py b/src/transformers/optimization.py
index 6ff586788c24..d6165bf348de 100644
--- a/src/transformers/optimization.py
+++ b/src/transformers/optimization.py
@@ -452,9 +452,9 @@ def __init__(
         warmup_init=False,
     ):
         if lr is not None and relative_step:
-            raise ValueError("Cannot combine manual lr and relative_step=True options")
+            raise ValueError("Cannot combine manual `lr` and `relative_step=True` options")
         if warmup_init and not relative_step:
-            raise ValueError("warmup_init=True requires relative_step=True")
+            raise ValueError("`warmup_init=True` requires `relative_step=True`")

         defaults = dict(
             lr=lr,

From 19270a6b01646fb9e83289df708e1a823c2c126f Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Tue, 23 Mar 2021 10:14:02 -0700
Subject: [PATCH 5/5] flip scale_parameter to True for the 2nd recommendation

---
 src/transformers/optimization.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/optimization.py b/src/transformers/optimization.py
index 6ff586788c24..667bf1a400d2 100644
--- a/src/transformers/optimization.py
+++ b/src/transformers/optimization.py
@@ -418,7 +418,7 @@ class Adafactor(Optimizer):
     Others reported the following combination to work well::

-        Adafactor(model.parameters(), scale_parameter=False, relative_step=True, warmup_init=True, lr=None)
+        Adafactor(model.parameters(), scale_parameter=True, relative_step=True, warmup_init=True, lr=None)

     Usage::
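For reference, a minimal end-to-end sketch of the first recommended combination (fixed external LR, relative updates and parameter scaling disabled). The toy model, data, and loss are illustrative placeholders, not part of the patch::

    import torch

    from transformers.optimization import Adafactor

    model = torch.nn.Linear(10, 2)  # stand-in for a real model such as T5

    # First recommended combination: supply the LR yourself and turn off
    # Adafactor's internal schedule and parameter scaling.
    optimizer = Adafactor(
        model.parameters(),
        scale_parameter=False,
        relative_step=False,
        warmup_init=False,
        lr=1e-3,  # fixed LR; pair it with an external scheduled warm-up
    )

    for step in range(10):
        inputs = torch.randn(8, 10)
        loss = model(inputs).pow(2).mean()  # dummy loss for illustration
        loss.backward()
        optimizer.step()  # no gradient clipping alongside Adafactor, per the notes above
        optimizer.zero_grad()

The second combination (scale_parameter=True, relative_step=True, warmup_init=True, lr=None) drops into the same loop unchanged; there Adafactor computes a relative, warmed-up step size internally, so no external scheduler is needed.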