From 3ee5f671541a811229217c947dda631baa089ffc Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <sylvain.gugger@gmail.com>
Date: Mon, 3 May 2021 19:52:17 -0400
Subject: [PATCH 01/18] Set generator in dataloader

---
 src/transformers/trainer.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index eebea8b4a2dd..de6847319411 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -553,7 +553,13 @@ def _get_train_sampler(self) -> Optional[torch.utils.data.sampler.Sampler]:
 
         else:
             if self.args.world_size <= 1:
-                return RandomSampler(self.train_dataset)
+                if version.parse(torch.__version__) < version.parse("1.6.0"):
+                    return RandomSampler(self.train_dataset)
+
+                # Torch generator were introduced in PyTorch 1.6.0.
+                generator = torch.Generator()
+                generator.manual_seed(int(torch.empty((), dtype=torch.int64).random_().item()))
+                return PrintRandomSampler(self.train_dataset, generator=generator)
             elif (
                 self.args.parallel_mode in [ParallelMode.TPU, ParallelMode.SAGEMAKER_MODEL_PARALLEL]
                 and not self.args.dataloader_drop_last

From ee424d7066a3d900910685ad7beff4c86e8726a1 Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <sylvain.gugger@gmail.com>
Date: Mon, 3 May 2021 20:24:37 -0400
Subject: [PATCH 02/18] Use generator in all random samplers

---
 src/transformers/trainer.py          | 13 ++++++++++++-
 src/transformers/trainer_pt_utils.py |  4 +++-
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index de6847319411..610a17f05d20 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -537,8 +537,19 @@ def _get_train_sampler(self) -> Optional[torch.utils.data.sampler.Sampler]:
                 lengths = None
             model_input_name = self.tokenizer.model_input_names[0] if self.tokenizer is not None else None
             if self.args.world_size <= 1:
+                if version.parse(torch.__version__) < version.parse("1.6.0"):
+                    generator = None
+                else:
+                    # Torch generator were introduced in PyTorch 1.6.0.
+                    generator = torch.Generator()
+                    generator.manual_seed(int(torch.empty((), dtype=torch.int64).random_().item()))
+
                 return LengthGroupedSampler(
-                    self.train_dataset, self.args.train_batch_size, lengths=lengths, model_input_name=model_input_name
+                    self.train_dataset,
+                    self.args.train_batch_size,
+                    lengths=lengths,
+                    model_input_name=model_input_name,
+                    generator=generator,
                 )
             else:
                 return DistributedLengthGroupedSampler(
diff --git a/src/transformers/trainer_pt_utils.py b/src/transformers/trainer_pt_utils.py
index 62cc1aa480d3..66cc3735a520 100644
--- a/src/transformers/trainer_pt_utils.py
+++ b/src/transformers/trainer_pt_utils.py
@@ -510,6 +510,7 @@ def __init__(
         batch_size: int,
         lengths: Optional[List[int]] = None,
         model_input_name: Optional[str] = None,
+        generator=None,
     ):
         self.dataset = dataset
         self.batch_size = batch_size
@@ -525,12 +526,13 @@ def __init__(
                 )
             lengths = [len(feature[self.model_input_name]) for feature in dataset]
         self.lengths = lengths
+        self.generator = generator
 
     def __len__(self):
         return len(self.lengths)
 
     def __iter__(self):
-        indices = get_length_grouped_indices(self.lengths, self.batch_size)
+        indices = get_length_grouped_indices(self.lengths, self.batch_size, generator=self.generator)
         return iter(indices)
 
 

From 4c2220e9df200e01ed6c37a22f62e1642d01e404 Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <sylvain.gugger@gmail.com>
Date: Tue, 4 May 2021 09:04:12 -0400
Subject: [PATCH 03/18] Checkpoint all RNG states

---
 src/transformers/trainer.py | 32 +++++++++++++++++++++++++++++---
 1 file changed, 29 insertions(+), 3 deletions(-)

diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index 610a17f05d20..3037bc2b293e 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -182,6 +182,16 @@
 logger = logging.get_logger(__name__)
 
 
+def recursive_print(state_dict, prefix=""):
+    for key, value in state_dict.items():
+        if isinstance(value, dict):
+            recursive_print(value, prefix=key)
+        elif isinstance(value, torch.Tensor):
+            print(f"{prefix}/{key}: {value.shape}, {value.view(-1,).tolist()[:10]}")
+        else:
+            print(f"{prefix}/{key}: {value}")
+
+
 class Trainer:
     """
     Trainer is a simple but feature-complete training and eval loop for PyTorch, optimized for 🤗 Transformers.
@@ -570,7 +580,7 @@ def _get_train_sampler(self) -> Optional[torch.utils.data.sampler.Sampler]:
                 # Torch generator were introduced in PyTorch 1.6.0.
                 generator = torch.Generator()
                 generator.manual_seed(int(torch.empty((), dtype=torch.int64).random_().item()))
-                return PrintRandomSampler(self.train_dataset, generator=generator)
+                return RandomSampler(self.train_dataset, generator=generator)
             elif (
                 self.args.parallel_mode in [ParallelMode.TPU, ParallelMode.SAGEMAKER_MODEL_PARALLEL]
                 and not self.args.dataloader_drop_last
@@ -1182,6 +1192,11 @@ def train(
                 if self.is_local_process_zero() and not args.disable_tqdm:
                     steps_trained_progress_bar = tqdm(total=steps_trained_in_current_epoch)
                     steps_trained_progress_bar.set_description("Skipping the first batches")
+        
+        # RNG states
+        checkpoint_rng_state = None
+        if resume_from_checkpoint is not None and os.path.isfile(os.path.join(resume_from_checkpoint, "rng_state.pth")):
+            checkpoint_rng_state = torch.load(os.path.join(resume_from_checkpoint, "rng_state.pth"))
 
         # Update the references
         self.callback_handler.model = self.model
@@ -1212,7 +1227,7 @@ def train(
                 # We just need to begin an iteration to create the randomization of the sampler.
                 for _ in train_dataloader:
                     break
-
+        
         for epoch in range(epochs_trained, num_train_epochs):
             if isinstance(train_dataloader, DataLoader) and isinstance(train_dataloader.sampler, DistributedSampler):
                 train_dataloader.sampler.set_epoch(epoch)
@@ -1241,6 +1256,11 @@ def train(
                     steps_trained_in_current_epoch -= 1
                     if steps_trained_progress_bar is not None:
                         steps_trained_progress_bar.update(1)
+                    if steps_trained_in_current_epoch == 0 and checkpoint_rng_state is not None:
+                        # We're finished skipping so set the RNG states to be exactly as they were at the
+                        # checkpoint time.
+                        torch.random.set_rng_state(checkpoint_rng_state["cpu"])
+                        torch.cuda.random.set_rng_state(checkpoint_rng_state["cuda"])
                     continue
                 elif steps_trained_progress_bar is not None:
                     steps_trained_progress_bar.close()
@@ -1475,6 +1495,12 @@ def _save_checkpoint(self, model, trial, metrics=None):
         # Maybe delete some older checkpoints.
         if self.is_world_process_zero():
             self._rotate_checkpoints(use_mtime=True, output_dir=run_dir)
+        
+        # Save RNG state in non-distributed training
+        if self.args.world_size <= 1:
+            rng_states = {"cpu": torch.random.get_rng_state(), "cuda": torch.cuda.random.get_rng_state()}
+            torch.save(rng_states, os.path.join(output_dir, "rng_state.pth"))
+        
 
     def _load_optimizer_and_scheduler(self, checkpoint):
         """If optimizer and scheduler states exist, load them."""
@@ -2367,7 +2393,7 @@ def push_to_hub(
         with tempfile.TemporaryDirectory() as tmp_dir:
             for f in os.listdir(save_directory):
                 fname = os.path.join(save_directory, f)
-                if os.path.isfile(fname):
+                if os.path.isfile(fname) and fname != "rng_state.pth":
                     shutil.copy(fname, os.path.join(tmp_dir, f))
 
             return unwrap_model(self.model)._push_to_hub(

From d915bbc89b7215ddb8c2cb0cdc462d436c9d2a24 Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <sylvain.gugger@gmail.com>
Date: Tue, 4 May 2021 10:49:26 -0400
Subject: [PATCH 04/18] Final version

---
 src/transformers/trainer.py | 42 ++++++++++++++++++++++++++++++-------
 1 file changed, 34 insertions(+), 8 deletions(-)

diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index 3037bc2b293e..f2ae1978e9b6 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -1192,10 +1192,12 @@ def train(
                 if self.is_local_process_zero() and not args.disable_tqdm:
                     steps_trained_progress_bar = tqdm(total=steps_trained_in_current_epoch)
                     steps_trained_progress_bar.set_description("Skipping the first batches")
-        
+
         # RNG states
         checkpoint_rng_state = None
-        if resume_from_checkpoint is not None and os.path.isfile(os.path.join(resume_from_checkpoint, "rng_state.pth")):
+        if resume_from_checkpoint is not None and os.path.isfile(
+            os.path.join(resume_from_checkpoint, "rng_state.pth")
+        ):
             checkpoint_rng_state = torch.load(os.path.join(resume_from_checkpoint, "rng_state.pth"))
 
         # Update the references
@@ -1227,7 +1229,7 @@ def train(
                 # We just need to begin an iteration to create the randomization of the sampler.
                 for _ in train_dataloader:
                     break
-        
+
         for epoch in range(epochs_trained, num_train_epochs):
             if isinstance(train_dataloader, DataLoader) and isinstance(train_dataloader.sampler, DistributedSampler):
                 train_dataloader.sampler.set_epoch(epoch)
@@ -1260,7 +1262,24 @@ def train(
                         # We're finished skipping so set the RNG states to be exactly as they were at the
                         # checkpoint time.
                         torch.random.set_rng_state(checkpoint_rng_state["cpu"])
-                        torch.cuda.random.set_rng_state(checkpoint_rng_state["cuda"])
+                        if torch.cuda.is_available():
+                            if args.local_rank != -1:
+                                if f"cuda_{args.local_rank}" not in checkpoint_rng_state:
+                                    logger.warn(
+                                        "You are resuming a training that was launched in a distributed fashion in a "
+                                        "non-distributed way. Reproducibility cannot be guaranteed."
+                                    )
+                                else:
+                                    torch.cuda.random.set_rng_state(checkpoint_rng_state[f"cuda_{args.local_rank}"])
+                            else:
+                                if f"cuda" not in checkpoint_rng_state:
+                                    logger.warn(
+                                        "You are resuming a training that was launched in a non-distributed fashion "
+                                        "with GPUs on either in a distributed fashion or not on GPUs. Reproducibility "
+                                        "cannot be guaranteed."
+                                    )
+                                else:
+                                    torch.cuda.random.set_rng_state_all(checkpoint_rng_state["cuda"])
                     continue
                 elif steps_trained_progress_bar is not None:
                     steps_trained_progress_bar.close()
@@ -1495,12 +1514,19 @@ def _save_checkpoint(self, model, trial, metrics=None):
         # Maybe delete some older checkpoints.
         if self.is_world_process_zero():
             self._rotate_checkpoints(use_mtime=True, output_dir=run_dir)
-        
+
         # Save RNG state in non-distributed training
-        if self.args.world_size <= 1:
-            rng_states = {"cpu": torch.random.get_rng_state(), "cuda": torch.cuda.random.get_rng_state()}
+        if self.is_local_process_zero():
+            rng_states = {"cpu": torch.random.get_rng_state()}
+            if torch.cuda.is_available():
+                if self.args.local_rank == -1:
+                    # In non distributed, we save the global CUDA RNG state (will take care of DataParallel)
+                    rng_states["cuda"] = torch.cuda.random.get_rng_state_all()
+                else:
+                    # In distributed, we save the CUDA RNG states individually.
+                    for i in range(torch.cuda.device_count()):
+                        rng_states[f"cuda_{i}"] = torch.cuda.random.get_rng_state(i)
             torch.save(rng_states, os.path.join(output_dir, "rng_state.pth"))
-        
 
     def _load_optimizer_and_scheduler(self, checkpoint):
         """If optimizer and scheduler states exist, load them."""

From 1d5a16ccae5ffe2c2ffe0a99a605ef8b96336fc1 Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <sylvain.gugger@gmail.com>
Date: Tue, 4 May 2021 11:50:39 -0400
Subject: [PATCH 05/18] Quality

---
 src/transformers/trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index f2ae1978e9b6..85d1238e79c0 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -1272,7 +1272,7 @@ def train(
                                 else:
                                     torch.cuda.random.set_rng_state(checkpoint_rng_state[f"cuda_{args.local_rank}"])
                             else:
-                                if f"cuda" not in checkpoint_rng_state:
+                                if "cuda" not in checkpoint_rng_state:
                                     logger.warn(
                                         "You are resuming a training that was launched in a non-distributed fashion "
                                         "with GPUs on either in a distributed fashion or not on GPUs. Reproducibility "

From 10f0b9a7a736d84435ed0c76253bc7c9783ac8c7 Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <sylvain.gugger@gmail.com>
Date: Tue, 4 May 2021 12:07:29 -0400
Subject: [PATCH 06/18] Test

---
 examples/pytorch/test_examples.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/examples/pytorch/test_examples.py b/examples/pytorch/test_examples.py
index 1547fc84d714..5d4f0c24c1a5 100644
--- a/examples/pytorch/test_examples.py
+++ b/examples/pytorch/test_examples.py
@@ -204,7 +204,6 @@ def test_run_ner(self):
             run_ner.main()
             result = get_results(tmp_dir)
             self.assertGreaterEqual(result["eval_accuracy"], 0.75)
-            self.assertGreaterEqual(result["eval_precision"], 0.75)
             self.assertLess(result["eval_loss"], 0.5)
 
     def test_run_squad(self):

From 108989a1e45ae5d7c2f08d40ee2e3cd4b050790b Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <sylvain.gugger@gmail.com>
Date: Tue, 4 May 2021 12:14:41 -0400
Subject: [PATCH 07/18] Address review comments

---
 src/transformers/trainer.py | 77 +++++++++++++++++--------------------
 1 file changed, 35 insertions(+), 42 deletions(-)

diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index 85d1238e79c0..a1a491bc3f8e 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -127,6 +127,7 @@
 from .utils.modeling_auto_mapping import MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES
 
 
+_is_torch_generator_available = False
 _is_native_amp_available = False
 
 DEFAULT_CALLBACKS = [DefaultFlowCallback]
@@ -141,6 +142,7 @@
     from apex import amp
 
 if version.parse(torch.__version__) >= version.parse("1.6"):
+    _is_torch_generator_available = True
     _is_native_amp_available = True
     from torch.cuda.amp import autocast
 
@@ -535,6 +537,11 @@ def _get_train_sampler(self) -> Optional[torch.utils.data.sampler.Sampler]:
         if not isinstance(self.train_dataset, collections.abc.Sized):
             return None
 
+        generator = None
+        if self.args.world_size <= 1 and _is_torch_generator_available:
+            generator = torch.Generator()
+            generator.manual_seed(int(torch.empty((), dtype=torch.int64).random_().item()))
+
         # Build the sampler.
         if self.args.group_by_length:
             if is_datasets_available() and isinstance(self.train_dataset, datasets.Dataset):
@@ -547,13 +554,6 @@ def _get_train_sampler(self) -> Optional[torch.utils.data.sampler.Sampler]:
                 lengths = None
             model_input_name = self.tokenizer.model_input_names[0] if self.tokenizer is not None else None
             if self.args.world_size <= 1:
-                if version.parse(torch.__version__) < version.parse("1.6.0"):
-                    generator = None
-                else:
-                    # Torch generator were introduced in PyTorch 1.6.0.
-                    generator = torch.Generator()
-                    generator.manual_seed(int(torch.empty((), dtype=torch.int64).random_().item()))
-
                 return LengthGroupedSampler(
                     self.train_dataset,
                     self.args.train_batch_size,
@@ -574,12 +574,6 @@ def _get_train_sampler(self) -> Optional[torch.utils.data.sampler.Sampler]:
 
         else:
             if self.args.world_size <= 1:
-                if version.parse(torch.__version__) < version.parse("1.6.0"):
-                    return RandomSampler(self.train_dataset)
-
-                # Torch generator were introduced in PyTorch 1.6.0.
-                generator = torch.Generator()
-                generator.manual_seed(int(torch.empty((), dtype=torch.int64).random_().item()))
                 return RandomSampler(self.train_dataset, generator=generator)
             elif (
                 self.args.parallel_mode in [ParallelMode.TPU, ParallelMode.SAGEMAKER_MODEL_PARALLEL]
@@ -1193,13 +1187,6 @@ def train(
                     steps_trained_progress_bar = tqdm(total=steps_trained_in_current_epoch)
                     steps_trained_progress_bar.set_description("Skipping the first batches")
 
-        # RNG states
-        checkpoint_rng_state = None
-        if resume_from_checkpoint is not None and os.path.isfile(
-            os.path.join(resume_from_checkpoint, "rng_state.pth")
-        ):
-            checkpoint_rng_state = torch.load(os.path.join(resume_from_checkpoint, "rng_state.pth"))
-
         # Update the references
         self.callback_handler.model = self.model
         self.callback_handler.optimizer = self.optimizer
@@ -1258,28 +1245,8 @@ def train(
                     steps_trained_in_current_epoch -= 1
                     if steps_trained_progress_bar is not None:
                         steps_trained_progress_bar.update(1)
-                    if steps_trained_in_current_epoch == 0 and checkpoint_rng_state is not None:
-                        # We're finished skipping so set the RNG states to be exactly as they were at the
-                        # checkpoint time.
-                        torch.random.set_rng_state(checkpoint_rng_state["cpu"])
-                        if torch.cuda.is_available():
-                            if args.local_rank != -1:
-                                if f"cuda_{args.local_rank}" not in checkpoint_rng_state:
-                                    logger.warn(
-                                        "You are resuming a training that was launched in a distributed fashion in a "
-                                        "non-distributed way. Reproducibility cannot be guaranteed."
-                                    )
-                                else:
-                                    torch.cuda.random.set_rng_state(checkpoint_rng_state[f"cuda_{args.local_rank}"])
-                            else:
-                                if "cuda" not in checkpoint_rng_state:
-                                    logger.warn(
-                                        "You are resuming a training that was launched in a non-distributed fashion "
-                                        "with GPUs on either in a distributed fashion or not on GPUs. Reproducibility "
-                                        "cannot be guaranteed."
-                                    )
-                                else:
-                                    torch.cuda.random.set_rng_state_all(checkpoint_rng_state["cuda"])
+                    if steps_trained_in_current_epoch == 0:
+                        self._load_rng_state(resume_from_checkpoint)
                     continue
                 elif steps_trained_progress_bar is not None:
                     steps_trained_progress_bar.close()
@@ -1437,6 +1404,32 @@ def _maybe_log_save_evaluate(self, tr_loss, model, trial, epoch):
             self._save_checkpoint(model, trial, metrics=metrics)
             self.control = self.callback_handler.on_save(self.args, self.state, self.control)
 
+    def _load_rng_state(self, checkpoint):
+        # Load RNG states from `checkpoint`
+        if resume_from_checkpoint is None or not os.path.isfile(os.path.join(checkpoint, "rng_state.pth")):
+            return
+
+        checkpoint_rng_state = torch.load(os.path.join(checkpoint, "rng_state.pth"))
+        torch.random.set_rng_state(checkpoint_rng_state["cpu"])
+        if torch.cuda.is_available():
+            if args.local_rank != -1:
+                if f"cuda_{args.local_rank}" not in checkpoint_rng_state:
+                    logger.warn(
+                        "You are resuming a training that was launched in a distributed fashion in a "
+                        "non-distributed way. Reproducibility cannot be guaranteed."
+                    )
+                else:
+                    torch.cuda.random.set_rng_state(checkpoint_rng_state[f"cuda_{args.local_rank}"])
+            else:
+                if "cuda" not in checkpoint_rng_state:
+                    logger.warn(
+                        "You are resuming a training that was launched in a non-distributed fashion "
+                        "with GPUs on either in a distributed fashion or not on GPUs. Reproducibility "
+                        "cannot be guaranteed."
+                    )
+                else:
+                    torch.cuda.random.set_rng_state_all(checkpoint_rng_state["cuda"])
+
     def _save_checkpoint(self, model, trial, metrics=None):
         # In all cases, including ddp/dp/deepspeed, self.model is always a reference to the model we
         # want to save except FullyShardedDDP.

From 5cad1f4b177ca99991b7b2bc9adcd0533af35d60 Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <sylvain.gugger@gmail.com>
Date: Tue, 4 May 2021 12:15:11 -0400
Subject: [PATCH 08/18] Quality

---
 src/transformers/trainer.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index a1a491bc3f8e..c1b6893972f8 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -1406,20 +1406,20 @@ def _maybe_log_save_evaluate(self, tr_loss, model, trial, epoch):
 
     def _load_rng_state(self, checkpoint):
         # Load RNG states from `checkpoint`
-        if resume_from_checkpoint is None or not os.path.isfile(os.path.join(checkpoint, "rng_state.pth")):
+        if checkpoint is None or not os.path.isfile(os.path.join(checkpoint, "rng_state.pth")):
             return
 
         checkpoint_rng_state = torch.load(os.path.join(checkpoint, "rng_state.pth"))
         torch.random.set_rng_state(checkpoint_rng_state["cpu"])
         if torch.cuda.is_available():
-            if args.local_rank != -1:
-                if f"cuda_{args.local_rank}" not in checkpoint_rng_state:
+            if self.args.local_rank != -1:
+                if f"cuda_{self.args.local_rank}" not in checkpoint_rng_state:
                     logger.warn(
                         "You are resuming a training that was launched in a distributed fashion in a "
                         "non-distributed way. Reproducibility cannot be guaranteed."
                     )
                 else:
-                    torch.cuda.random.set_rng_state(checkpoint_rng_state[f"cuda_{args.local_rank}"])
+                    torch.cuda.random.set_rng_state(checkpoint_rng_state[f"cuda_{self.args.local_rank}"])
             else:
                 if "cuda" not in checkpoint_rng_state:
                     logger.warn(

From 75009c8cdb7f87d8d5a6d5a1b90b48ac7404209c Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <sylvain.gugger@gmail.com>
Date: Tue, 4 May 2021 12:25:44 -0400
Subject: [PATCH 09/18] Remove debug util

---
 src/transformers/trainer.py | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index c1b6893972f8..468387854371 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -184,16 +184,6 @@
 logger = logging.get_logger(__name__)
 
 
-def recursive_print(state_dict, prefix=""):
-    for key, value in state_dict.items():
-        if isinstance(value, dict):
-            recursive_print(value, prefix=key)
-        elif isinstance(value, torch.Tensor):
-            print(f"{prefix}/{key}: {value.shape}, {value.view(-1,).tolist()[:10]}")
-        else:
-            print(f"{prefix}/{key}: {value}")
-
-
 class Trainer:
     """
     Trainer is a simple but feature-complete training and eval loop for PyTorch, optimized for 🤗 Transformers.

From 432397934e616aa4cb1c75e93f18ec2cbd934068 Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <sylvain.gugger@gmail.com>
Date: Tue, 4 May 2021 12:48:22 -0400
Subject: [PATCH 10/18] Add python and numpy RNGs

---
 src/transformers/trainer.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index 468387854371..239f61a37a30 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -20,6 +20,7 @@
 import inspect
 import math
 import os
+import random
 import re
 import shutil
 import sys
@@ -1400,6 +1401,8 @@ def _load_rng_state(self, checkpoint):
             return
 
         checkpoint_rng_state = torch.load(os.path.join(checkpoint, "rng_state.pth"))
+        random.setstate(checkpoint_rng_state["python"])
+        np.random.set_state(checkpoint_rng_state["numpy"])
         torch.random.set_rng_state(checkpoint_rng_state["cpu"])
         if torch.cuda.is_available():
             if self.args.local_rank != -1:
@@ -1500,7 +1503,11 @@ def _save_checkpoint(self, model, trial, metrics=None):
 
         # Save RNG state in non-distributed training
         if self.is_local_process_zero():
-            rng_states = {"cpu": torch.random.get_rng_state()}
+            rng_states = {
+                "python": random.getstate(),
+                "numpy": np.random.get_state(),
+                "cpu": torch.random.get_rng_state(),
+            }
             if torch.cuda.is_available():
                 if self.args.local_rank == -1:
                     # In non distributed, we save the global CUDA RNG state (will take care of DataParallel)

From d1da404b9fec67e2a654e9e41f0b2aaa9ea136d2 Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <sylvain.gugger@gmail.com>
Date: Tue, 4 May 2021 13:05:18 -0400
Subject: [PATCH 11/18] Split states in different files in distributed

---
 src/transformers/trainer.py | 74 +++++++++++++++++++++----------------
 1 file changed, 42 insertions(+), 32 deletions(-)

diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index 239f61a37a30..4abd4e5ade58 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -1397,31 +1397,37 @@ def _maybe_log_save_evaluate(self, tr_loss, model, trial, epoch):
 
     def _load_rng_state(self, checkpoint):
         # Load RNG states from `checkpoint`
-        if checkpoint is None or not os.path.isfile(os.path.join(checkpoint, "rng_state.pth")):
+        if checkpoint is None:
             return
 
-        checkpoint_rng_state = torch.load(os.path.join(checkpoint, "rng_state.pth"))
+        if self.args.local_rank != -1:
+            rng_file = os.path.join(checkpoint, f"rng_state_{self.args.local_rank}.pth")
+            if not os.path.isfile(os.path.join(checkpoint, rng_file)):
+                logger.info(
+                    f"Didn't find an RNG file for process {self.args.local_rank}, if you are resuming a training that "
+                    "wasn't launched in a distributed fashion, reproducibility is not guaranteed."
+                )
+                return
+        else:
+            rng_file = os.path.join(checkpoint, f"rng_state.pth")
+            if not os.path.isfile(os.path.join(checkpoint, rng_file)):
+                logger.info(
+                    f"Didn't find an RNG file, if you are resuming a training that was launched in a distributed "
+                    "fashion, reproducibility is not guaranteed."
+                )
+                return
+
+        checkpoint_rng_state = torch.load(rng_file)
         random.setstate(checkpoint_rng_state["python"])
         np.random.set_state(checkpoint_rng_state["numpy"])
         torch.random.set_rng_state(checkpoint_rng_state["cpu"])
         if torch.cuda.is_available():
             if self.args.local_rank != -1:
-                if f"cuda_{self.args.local_rank}" not in checkpoint_rng_state:
-                    logger.warn(
-                        "You are resuming a training that was launched in a distributed fashion in a "
-                        "non-distributed way. Reproducibility cannot be guaranteed."
-                    )
-                else:
-                    torch.cuda.random.set_rng_state(checkpoint_rng_state[f"cuda_{self.args.local_rank}"])
+                torch.cuda.random.set_rng_state(checkpoint_rng_state["cuda"])
             else:
-                if "cuda" not in checkpoint_rng_state:
-                    logger.warn(
-                        "You are resuming a training that was launched in a non-distributed fashion "
-                        "with GPUs on either in a distributed fashion or not on GPUs. Reproducibility "
-                        "cannot be guaranteed."
-                    )
-                else:
-                    torch.cuda.random.set_rng_state_all(checkpoint_rng_state["cuda"])
+                torch.cuda.random.set_rng_state_all(checkpoint_rng_state["cuda"])
+        if is_torch_tpu_available():
+            xm.set_rng_state(checkpoint_rng_state["xla"])
 
     def _save_checkpoint(self, model, trial, metrics=None):
         # In all cases, including ddp/dp/deepspeed, self.model is always a reference to the model we
@@ -1502,21 +1508,25 @@ def _save_checkpoint(self, model, trial, metrics=None):
             self._rotate_checkpoints(use_mtime=True, output_dir=run_dir)
 
         # Save RNG state in non-distributed training
-        if self.is_local_process_zero():
-            rng_states = {
-                "python": random.getstate(),
-                "numpy": np.random.get_state(),
-                "cpu": torch.random.get_rng_state(),
-            }
-            if torch.cuda.is_available():
-                if self.args.local_rank == -1:
-                    # In non distributed, we save the global CUDA RNG state (will take care of DataParallel)
-                    rng_states["cuda"] = torch.cuda.random.get_rng_state_all()
-                else:
-                    # In distributed, we save the CUDA RNG states individually.
-                    for i in range(torch.cuda.device_count()):
-                        rng_states[f"cuda_{i}"] = torch.cuda.random.get_rng_state(i)
+        rng_states = {
+            "python": random.getstate(),
+            "numpy": np.random.get_state(),
+            "cpu": torch.random.get_rng_state(),
+        }
+        if torch.cuda.is_available():
+            if self.args.local_rank == -1:
+                # In non distributed, we save the global CUDA RNG state (will take care of DataParallel)
+                rng_states["cuda"] = torch.cuda.random.get_rng_state_all()
+            else:
+                rng_states[f"cuda"] = torch.cuda.random.get_rng_state()
+
+        if is_torch_tpu_available():
+            rng_states[f"xla"] = xm.get_rng_state().item()
+
+        if self.args.local_rank == -1:
             torch.save(rng_states, os.path.join(output_dir, "rng_state.pth"))
+        else:
+            torch.save(rng_states, os.path.join(output_dir, f"rng_state_{self.args.local_rank}.pth"))
 
     def _load_optimizer_and_scheduler(self, checkpoint):
         """If optimizer and scheduler states exist, load them."""
@@ -2409,7 +2419,7 @@ def push_to_hub(
         with tempfile.TemporaryDirectory() as tmp_dir:
             for f in os.listdir(save_directory):
                 fname = os.path.join(save_directory, f)
-                if os.path.isfile(fname) and fname != "rng_state.pth":
+                if os.path.isfile(fname):
                     shutil.copy(fname, os.path.join(tmp_dir, f))
 
             return unwrap_model(self.model)._push_to_hub(

From 8374498669756d4775bd117c6f6e3c4f5cd10435 Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <sylvain.gugger@gmail.com>
Date: Tue, 4 May 2021 13:24:53 -0400
Subject: [PATCH 12/18] Quality

---
 src/transformers/trainer.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index 4abd4e5ade58..760b8721d937 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -1409,10 +1409,10 @@ def _load_rng_state(self, checkpoint):
                 )
                 return
         else:
-            rng_file = os.path.join(checkpoint, f"rng_state.pth")
+            rng_file = os.path.join(checkpoint, "rng_state.pth")
             if not os.path.isfile(os.path.join(checkpoint, rng_file)):
                 logger.info(
-                    f"Didn't find an RNG file, if you are resuming a training that was launched in a distributed "
+                    "Didn't find an RNG file, if you are resuming a training that was launched in a distributed "
                     "fashion, reproducibility is not guaranteed."
                 )
                 return
@@ -1518,10 +1518,10 @@ def _save_checkpoint(self, model, trial, metrics=None):
                 # In non distributed, we save the global CUDA RNG state (will take care of DataParallel)
                 rng_states["cuda"] = torch.cuda.random.get_rng_state_all()
             else:
-                rng_states[f"cuda"] = torch.cuda.random.get_rng_state()
+                rng_states["cuda"] = torch.cuda.random.get_rng_state()
 
         if is_torch_tpu_available():
-            rng_states[f"xla"] = xm.get_rng_state().item()
+            rng_states["xla"] = xm.get_rng_state()
 
         if self.args.local_rank == -1:
             torch.save(rng_states, os.path.join(output_dir, "rng_state.pth"))

From 1995ff2522b76c507fe3e9af4520871de49f986c Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <sylvain.gugger@gmail.com>
Date: Tue, 4 May 2021 14:04:19 -0400
Subject: [PATCH 13/18] local_rank for TPUs

---
 src/transformers/trainer.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index 760b8721d937..57a63568e07f 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -1400,11 +1400,12 @@ def _load_rng_state(self, checkpoint):
         if checkpoint is None:
             return
 
-        if self.args.local_rank != -1:
-            rng_file = os.path.join(checkpoint, f"rng_state_{self.args.local_rank}.pth")
+        local_rank = xm.get_local_ordinal() if is_torch_tpu_available() else self.args.local_rank
+        if local_rank != -1:
+            rng_file = os.path.join(checkpoint, f"rng_state_{local_rank}.pth")
             if not os.path.isfile(os.path.join(checkpoint, rng_file)):
                 logger.info(
-                    f"Didn't find an RNG file for process {self.args.local_rank}, if you are resuming a training that "
+                    f"Didn't find an RNG file for process {local_rank}, if you are resuming a training that "
                     "wasn't launched in a distributed fashion, reproducibility is not guaranteed."
                 )
                 return
@@ -1523,10 +1524,11 @@ def _save_checkpoint(self, model, trial, metrics=None):
         if is_torch_tpu_available():
             rng_states["xla"] = xm.get_rng_state()
 
-        if self.args.local_rank == -1:
+        local_rank = xm.get_local_ordinal() if is_torch_tpu_available() else self.args.local_rank
+        if local_rank == -1:
             torch.save(rng_states, os.path.join(output_dir, "rng_state.pth"))
         else:
-            torch.save(rng_states, os.path.join(output_dir, f"rng_state_{self.args.local_rank}.pth"))
+            torch.save(rng_states, os.path.join(output_dir, f"rng_state_{local_rank}.pth"))
 
     def _load_optimizer_and_scheduler(self, checkpoint):
         """If optimizer and scheduler states exist, load them."""

From 2b0c7d9f6f8908b792861e8835532cf86b089708 Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <sylvain.gugger@gmail.com>
Date: Tue, 4 May 2021 14:37:07 -0400
Subject: [PATCH 14/18] Only use generator when accepted

---
 src/transformers/trainer.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index 57a63568e07f..a97b566104c1 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -565,7 +565,9 @@ def _get_train_sampler(self) -> Optional[torch.utils.data.sampler.Sampler]:
 
         else:
             if self.args.world_size <= 1:
-                return RandomSampler(self.train_dataset, generator=generator)
+                if _is_torch_generator_available:
+                    return RandomSampler(self.train_dataset, generator=generator)
+                return RandomSampler(self.train_dataset)
             elif (
                 self.args.parallel_mode in [ParallelMode.TPU, ParallelMode.SAGEMAKER_MODEL_PARALLEL]
                 and not self.args.dataloader_drop_last

From 4ec3c99a95641cd4b01407fb8500ce190c764fb2 Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <sylvain.gugger@gmail.com>
Date: Tue, 4 May 2021 15:03:54 -0400
Subject: [PATCH 15/18] Add test

---
 tests/test_trainer.py | 43 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/tests/test_trainer.py b/tests/test_trainer.py
index 68a15ae67350..7b0efa2de47d 100644
--- a/tests/test_trainer.py
+++ b/tests/test_trainer.py
@@ -15,7 +15,9 @@
 
 import dataclasses
 import gc
+import math
 import os
+import random
 import re
 import tempfile
 import unittest
@@ -195,6 +197,26 @@ def forward(self, input_x, labels=None, **kwargs):
             loss = torch.nn.functional.mse_loss(y, labels)
             return (loss, y, y) if self.double_output else (loss, y)
 
+    class RegressionRandomPreTrainedModel(PreTrainedModel):
+        config_class = RegressionModelConfig
+        base_model_prefix = "regression"
+
+        def __init__(self, config):
+            super().__init__(config)
+            self.a = torch.nn.Parameter(torch.tensor(config.a).float())
+            self.b = torch.nn.Parameter(torch.tensor(config.b).float())
+
+        def forward(self, input_x, labels=None, **kwargs):
+            y = input_x * self.a + self.b
+            if self.training:
+                # Add random noise from torch, numpy and random
+                y += 0.05 * torch.randn(1).squeeze() + 0.05 * torch.tensor(np.random.rand() + random.random())
+
+            if labels is None:
+                return (y,)
+            loss = torch.nn.functional.mse_loss(y, labels)
+            return (loss, y)
+
     class TstLayer(torch.nn.Module):
         def __init__(self, hidden_size):
             super().__init__()
@@ -699,6 +721,27 @@ def test_can_resume_training(self):
             trainer.train(resume_from_checkpoint=True)
         self.assertTrue("No valid checkpoint found in output directory" in str(context.exception))
 
+    def test_resume_training_with_randomness(self):
+        train_dataset = RegressionDataset(length=128)
+        eval_dataset = RegressionDataset()
+
+        config = RegressionModelConfig(a=0, b=2)
+        model = RegressionRandomPreTrainedModel(config)
+
+        tmp_dir = self.get_auto_remove_tmp_dir()
+        args = RegressionTrainingArguments(tmp_dir, save_steps=5, learning_rate=0.1)
+        trainer = Trainer(model, args, train_dataset=train_dataset, eval_dataset=eval_dataset)
+
+        trainer.train()
+        (a, b) = trainer.model.a.item(), trainer.model.b.item()
+
+        model = RegressionRandomPreTrainedModel(config)
+        trainer = Trainer(model, args, train_dataset=train_dataset, eval_dataset=eval_dataset)
+        trainer.train(resume_from_checkpoint=os.path.join(tmp_dir, "checkpoint-15"))
+        (a1, b1) = trainer.model.a.item(), trainer.model.b.item()
+        self.assertTrue(math.isclose(a, a1, rel_tol=1e-4))
+        self.assertTrue(math.isclose(b, b1, rel_tol=1e-4))
+
     def test_resume_training_with_gradient_accumulation(self):
         if torch.cuda.device_count() > 2:
             # This test will fail for more than 2 GPUs since the batch size will get bigger and with the number of

From 4b8f370f74418cee8e30bb18ee26ac3359c31e64 Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <sylvain.gugger@gmail.com>
Date: Tue, 4 May 2021 15:30:04 -0400
Subject: [PATCH 16/18] Set seed to avoid flakiness

---
 tests/test_trainer.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/test_trainer.py b/tests/test_trainer.py
index 7b0efa2de47d..e268306d53bb 100644
--- a/tests/test_trainer.py
+++ b/tests/test_trainer.py
@@ -45,6 +45,7 @@
     require_torch_multi_gpu,
     slow,
 )
+from transformers.trainer_utils import set_seed
 from transformers.utils.hp_naming import TrialShortNamer
 
 
@@ -208,9 +209,7 @@ def __init__(self, config):
 
         def forward(self, input_x, labels=None, **kwargs):
             y = input_x * self.a + self.b
-            if self.training:
-                # Add random noise from torch, numpy and random
-                y += 0.05 * torch.randn(1).squeeze() + 0.05 * torch.tensor(np.random.rand() + random.random())
+            y += 0.05 * torch.randn(1).squeeze() + 0.05 * torch.tensor(np.random.rand() + random.random())
 
             if labels is None:
                 return (y,)
@@ -722,6 +721,7 @@ def test_can_resume_training(self):
         self.assertTrue("No valid checkpoint found in output directory" in str(context.exception))
 
     def test_resume_training_with_randomness(self):
+        set_seed(63)
         train_dataset = RegressionDataset(length=128)
         eval_dataset = RegressionDataset()
 

From 08eb7135f766acef9ff94025e935e7b8ab1bc060 Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <sylvain.gugger@gmail.com>
Date: Tue, 4 May 2021 15:49:01 -0400
Subject: [PATCH 17/18] Make test less flaky

---
 tests/test_trainer.py | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/tests/test_trainer.py b/tests/test_trainer.py
index e268306d53bb..09ab2e7a3b36 100644
--- a/tests/test_trainer.py
+++ b/tests/test_trainer.py
@@ -209,7 +209,11 @@ def __init__(self, config):
 
         def forward(self, input_x, labels=None, **kwargs):
             y = input_x * self.a + self.b
-            y += 0.05 * torch.randn(1).squeeze() + 0.05 * torch.tensor(np.random.rand() + random.random())
+            torch_rand = torch.randn(1).squeeze()
+            np_rand = np.random.rand()
+            rand_rand = random.random()
+
+            y += 0.05 * torch_rand + 0.05 * torch.tensor(np_rand + rand_rand)
 
             if labels is None:
                 return (y,)
@@ -721,7 +725,12 @@ def test_can_resume_training(self):
         self.assertTrue("No valid checkpoint found in output directory" in str(context.exception))
 
     def test_resume_training_with_randomness(self):
-        set_seed(63)
+        if torch.cuda.device_count() >= 2:
+            # This test will fail flakily for more than 2 GPUs since the result will be slightly more different.
+            return
+
+        if torch.cuda.is_available():
+            torch.backends.cudnn.deterministic = True
         train_dataset = RegressionDataset(length=128)
         eval_dataset = RegressionDataset()
 
@@ -739,8 +748,9 @@ def test_resume_training_with_randomness(self):
         trainer = Trainer(model, args, train_dataset=train_dataset, eval_dataset=eval_dataset)
         trainer.train(resume_from_checkpoint=os.path.join(tmp_dir, "checkpoint-15"))
         (a1, b1) = trainer.model.a.item(), trainer.model.b.item()
-        self.assertTrue(math.isclose(a, a1, rel_tol=1e-4))
-        self.assertTrue(math.isclose(b, b1, rel_tol=1e-4))
+
+        self.assertTrue(math.isclose(a, a1, rel_tol=1e-8))
+        self.assertTrue(math.isclose(b, b1, rel_tol=1e-8))
 
     def test_resume_training_with_gradient_accumulation(self):
         if torch.cuda.device_count() > 2:

From f7849d130ad22ad40067a249d71e9843c0ce73a1 Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <sylvain.gugger@gmail.com>
Date: Tue, 4 May 2021 15:54:23 -0400
Subject: [PATCH 18/18] Quality

---
 tests/test_trainer.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/test_trainer.py b/tests/test_trainer.py
index 09ab2e7a3b36..c040333a83bc 100644
--- a/tests/test_trainer.py
+++ b/tests/test_trainer.py
@@ -45,7 +45,6 @@
     require_torch_multi_gpu,
     slow,
 )
-from transformers.trainer_utils import set_seed
 from transformers.utils.hp_naming import TrialShortNamer