From 199217c1ced6814d040b37bd6c592c64f67c4307 Mon Sep 17 00:00:00 2001
From: Charly21r <crrubio02@gmail.com>
Date: Mon, 23 Mar 2026 16:39:14 +0100
Subject: [PATCH 1/6] Fix NotebookProgressCallback to allow evaluate() before
 and after train

---
 src/transformers/utils/notebook.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/transformers/utils/notebook.py b/src/transformers/utils/notebook.py
index ecbe8271fe13..4309c3966e2a 100644
--- a/src/transformers/utils/notebook.py
+++ b/src/transformers/utils/notebook.py
@@ -351,7 +351,10 @@ def on_log(self, args, state, control, logs=None, **kwargs):
             tt.write_line(values)
 
     def on_evaluate(self, args, state, control, metrics=None, **kwargs):
-        tt = _require(self.training_tracker, "on_train_begin must be called before on_evaluate")
+        if self.training_tracker is None:
+            return control
+
+        tt = self.training_tracker
 
         values = {"Training Loss": "No log", "Validation Loss": "No log"}
         for log in reversed(state.log_history):

From 2e20fcea3b91f4bef141c4945295cc95a3417158 Mon Sep 17 00:00:00 2001
From: Charly21r <crrubio02@gmail.com>
Date: Mon, 23 Mar 2026 16:40:44 +0100
Subject: [PATCH 2/6] Add unit test for NotebookProgressCallback evaluating
 before and after training

---
 tests/trainer/test_trainer_callback.py | 71 ++++++++++++++++++++++++++
 1 file changed, 71 insertions(+)

diff --git a/tests/trainer/test_trainer_callback.py b/tests/trainer/test_trainer_callback.py
index 0d132a9051f5..c640a4f43814 100644
--- a/tests/trainer/test_trainer_callback.py
+++ b/tests/trainer/test_trainer_callback.py
@@ -1269,3 +1269,74 @@ def state(self):
 
         self.assertEqual(instance.name, "test")
         self.assertEqual(instance.counter, 5)
+
+
+@require_torch
+class NotebookProgressCallbackTest(unittest.TestCase):
+    """Tests for NotebookProgressCallback behavior in notebook environments."""
+
+    def setUp(self):
+        self.output_dir = tempfile.mkdtemp()
+
+    def tearDown(self):
+        shutil.rmtree(self.output_dir)
+
+    def _create_trainer(self):
+        train_dataset = RegressionDataset(length=16)
+        eval_dataset = RegressionDataset(length=16)
+        config = RegressionModelConfig(a=0, b=0)
+        model = RegressionPreTrainedModel(config)
+
+        args = TrainingArguments(
+            self.output_dir,
+            per_device_train_batch_size=2,
+            per_device_eval_batch_size=2,
+            num_train_epochs=1,
+            logging_strategy="no",
+            report_to=[],
+            eval_strategy="epoch",
+            disable_tqdm=True,
+        )
+
+        from transformers.utils.notebook import NotebookProgressCallback
+
+        trainer = Trainer(
+            model=model,
+            args=args,
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+            callbacks=[NotebookProgressCallback()],  # added explicitly so the callback is always present
+        )
+        return trainer
+
+    def test_evaluate_before_training(self):
+        """Calling evaluate() before training does not crash and returns metrics."""
+        trainer = self._create_trainer()
+        metrics = trainer.evaluate()
+        self.assertIn("eval_loss", metrics)
+        # Confirm the notebook callback is registered in the trainer's callback handler.
+        from transformers.utils.notebook import NotebookProgressCallback
+
+        cb = next(
+            (c for c in trainer.callback_handler.callbacks if isinstance(c, NotebookProgressCallback)),
+            None,
+        )
+        self.assertIsNotNone(cb)
+
+    def test_evaluate_after_training(self):
+        """Calling evaluate() after training does not crash and returns metrics."""
+        trainer = self._create_trainer()
+        trainer.train()
+        metrics = trainer.evaluate()
+        self.assertIn("eval_loss", metrics)
+
+    def test_multiple_evaluate_calls(self):
+        """Calling evaluate() once before and twice after training returns metrics every time."""
+        trainer = self._create_trainer()
+        metrics1 = trainer.evaluate()
+        trainer.train()
+        metrics2 = trainer.evaluate()
+        metrics3 = trainer.evaluate()
+        self.assertIn("eval_loss", metrics1)
+        self.assertIn("eval_loss", metrics2)
+        self.assertIn("eval_loss", metrics3)

From cb1df8d89857378a7c381d7f028df4c0116fbafc Mon Sep 17 00:00:00 2001
From: Charly21r <crrubio02@gmail.com>
Date: Mon, 23 Mar 2026 18:06:01 +0100
Subject: [PATCH 3/6] Skip NotebookProgressCallback tests when IPython is not
 installed

---
 tests/trainer/test_trainer_callback.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tests/trainer/test_trainer_callback.py b/tests/trainer/test_trainer_callback.py
index c640a4f43814..a22faac2588f 100644
--- a/tests/trainer/test_trainer_callback.py
+++ b/tests/trainer/test_trainer_callback.py
@@ -23,6 +23,7 @@
 - Built-in callbacks (DefaultFlowCallback, EarlyStoppingCallback, etc.)
 """
 
+import importlib.util
 import os
 import shutil
 import tempfile
@@ -53,6 +54,9 @@
     from .trainer_test_utils import RegressionDataset, RegressionModelConfig, RegressionPreTrainedModel
 
 
+IPYTHON_AVAILABLE = importlib.util.find_spec("IPython") is not None
+
+
 # =============================================================================
 # Test Callback Implementations
 # =============================================================================
@@ -1272,6 +1276,7 @@ def state(self):
 
 
 @require_torch
+@unittest.skipUnless(IPYTHON_AVAILABLE, "IPython is required for NotebookProgressCallback")
 class NotebookProgressCallbackTest(unittest.TestCase):
     """Tests for NotebookProgressCallback behavior in notebook environments."""
 

From a2d67e2437e1a99999cfeff0a7ea4f0a02345c46 Mon Sep 17 00:00:00 2001
From: Charly21r <crrubio02@gmail.com>
Date: Tue, 7 Apr 2026 18:00:53 +0200
Subject: [PATCH 4/6] Display eval metrics when training tracker is None in
 NotebookProgressCallback

---
 src/transformers/utils/notebook.py | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/src/transformers/utils/notebook.py b/src/transformers/utils/notebook.py
index 4309c3966e2a..c07083f75925 100644
--- a/src/transformers/utils/notebook.py
+++ b/src/transformers/utils/notebook.py
@@ -351,10 +351,7 @@ def on_log(self, args, state, control, logs=None, **kwargs):
             tt.write_line(values)
 
     def on_evaluate(self, args, state, control, metrics=None, **kwargs):
-        if self.training_tracker is None:
-            return control
-
-        tt = self.training_tracker
+        self.first_column = "Epoch" if args.eval_strategy == IntervalStrategy.EPOCH else "Step"
 
         values = {"Training Loss": "No log", "Validation Loss": "No log"}
         for log in reversed(state.log_history):
@@ -384,11 +381,18 @@ def on_evaluate(self, args, state, control, metrics=None, **kwargs):
                 # Single dataset
                 name = "Validation Loss"
             values[name] = v
-        tt.write_line(values)
-        tt.remove_child()
+
+        if self.training_tracker is not None:
+            tt = self.training_tracker
+            tt.write_line(values)
+            tt.remove_child()
+            # Evaluation takes a long time so we should force the next update.
+            self._force_next_update = True
+        else:
+            # No training tracker, but still show the metrics
+            disp.display(disp.HTML(text_to_html_table([list(values.keys()), list(values.values())])))
+
         self.prediction_bar = None
-        # Evaluation takes a long time so we should force the next update.
-        self._force_next_update = True
 
     def on_train_end(self, args, state, control, **kwargs):
         tt = _require(self.training_tracker, "on_train_begin must be called before on_train_end")

From 7a01ca96670dc9d8dd752d30844f8e8b8926b328 Mon Sep 17 00:00:00 2001
From: Charly21r <crrubio02@gmail.com>
Date: Thu, 9 Apr 2026 20:09:43 +0200
Subject: [PATCH 5/6] Add is_ipython_available and require_ipython test
 decorator

---
 src/transformers/testing_utils.py      | 6 ++++++
 src/transformers/utils/__init__.py     | 1 +
 src/transformers/utils/import_utils.py | 5 +++++
 tests/trainer/test_trainer_callback.py | 8 ++------
 4 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py
index 6e35a836db16..863242a695c6 100644
--- a/src/transformers/testing_utils.py
+++ b/src/transformers/testing_utils.py
@@ -106,6 +106,7 @@
     is_hadamard_available,
     is_hqq_available,
     is_huggingface_hub_greater_or_equal,
+    is_ipython_available,
     is_jinja_available,
     is_jmespath_available,
     is_jumanpp_available,
@@ -1179,6 +1180,11 @@ def require_faiss(test_case):
     return unittest.skipUnless(is_faiss_available(), "test requires `faiss`")(test_case)
 
 
+def require_ipython(test_case):
+    """Decorator marking a test function or class that requires IPython; skipped when IPython isn't installed."""
+    return unittest.skipUnless(is_ipython_available(), "test requires `IPython`")(test_case)
+
+
 def require_optuna(test_case):
     """
     Decorator marking a test that requires optuna.
diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py
index 3f5c7cac386b..d12e0b277c1b 100644
--- a/src/transformers/utils/__init__.py
+++ b/src/transformers/utils/__init__.py
@@ -150,6 +150,7 @@
     is_hqq_available,
     is_huggingface_hub_greater_or_equal,
     is_in_notebook,
+    is_ipython_available,
     is_jinja_available,
     is_jmespath_available,
     is_jumanpp_available,
diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py
index 1e1ac2545f05..de11d23cbecf 100644
--- a/src/transformers/utils/import_utils.py
+++ b/src/transformers/utils/import_utils.py
@@ -1540,6 +1540,11 @@ def msg_callable():
         torch._check_with(error_type, cond, msg_callable)
 
 
+@lru_cache
+def is_ipython_available() -> bool:
+    return importlib.util.find_spec("IPython") is not None  # looks up the module spec without importing IPython
+
+
 @lru_cache
 def is_in_notebook() -> bool:
     try:
diff --git a/tests/trainer/test_trainer_callback.py b/tests/trainer/test_trainer_callback.py
index a22faac2588f..db0ccd56b1a1 100644
--- a/tests/trainer/test_trainer_callback.py
+++ b/tests/trainer/test_trainer_callback.py
@@ -23,7 +23,6 @@
 - Built-in callbacks (DefaultFlowCallback, EarlyStoppingCallback, etc.)
 """
 
-import importlib.util
 import os
 import shutil
 import tempfile
@@ -44,7 +43,7 @@
     is_torch_available,
 )
 from transformers.integrations.integration_utils import KubeflowCallback, SwanLabCallback
-from transformers.testing_utils import require_torch
+from transformers.testing_utils import require_ipython, require_torch
 from transformers.trainer_callback import CallbackHandler, ExportableState, TrainerControl
 
 
@@ -54,9 +53,6 @@
     from .trainer_test_utils import RegressionDataset, RegressionModelConfig, RegressionPreTrainedModel
 
 
-IPYTHON_AVAILABLE = importlib.util.find_spec("IPython") is not None
-
-
 # =============================================================================
 # Test Callback Implementations
 # =============================================================================
@@ -1276,7 +1272,7 @@ def state(self):
 
 
 @require_torch
-@unittest.skipUnless(IPYTHON_AVAILABLE, "IPython is required for NotebookProgressCallback")
+@require_ipython
 class NotebookProgressCallbackTest(unittest.TestCase):
     """Tests for NotebookProgressCallback behavior in notebook environments."""
 

From 2d987166756408268d910f42a98ff68886a161d5 Mon Sep 17 00:00:00 2001
From: Charly21r <crrubio02@gmail.com>
Date: Fri, 10 Apr 2026 17:30:09 +0200
Subject: [PATCH 6/6] Filter model_preparation_time metric and add code
 comments in on_evaluate

---
 src/transformers/utils/notebook.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/transformers/utils/notebook.py b/src/transformers/utils/notebook.py
index c07083f75925..1c7fb7a77bea 100644
--- a/src/transformers/utils/notebook.py
+++ b/src/transformers/utils/notebook.py
@@ -351,6 +351,8 @@ def on_log(self, args, state, control, logs=None, **kwargs):
             tt.write_line(values)
 
     def on_evaluate(self, args, state, control, metrics=None, **kwargs):
+        # Recompute first_column here since on_evaluate can be called before on_train_begin,
+        # where it is normally initialized.
         self.first_column = "Epoch" if args.eval_strategy == IntervalStrategy.EPOCH else "Step"
 
         values = {"Training Loss": "No log", "Validation Loss": "No log"}
@@ -374,6 +376,8 @@ def on_evaluate(self, args, state, control, metrics=None, **kwargs):
         _ = metrics.pop(f"{metric_key_prefix}_runtime", None)
         _ = metrics.pop(f"{metric_key_prefix}_samples_per_second", None)
         _ = metrics.pop(f"{metric_key_prefix}_steps_per_second", None)
+        _ = metrics.pop(f"{metric_key_prefix}_model_preparation_time", None)
+
         for k, v in metrics.items():
             splits = k.split("_")
             name = " ".join([part.capitalize() for part in splits[1:]])