From 65866583b727fdbfd5a00d6783e8b073556a4ece Mon Sep 17 00:00:00 2001
From: jakeross <jirhiker@gmail.com>
Date: Sun, 10 May 2026 11:17:22 -0600
Subject: [PATCH 1/2] feat(cloud): retry transient 5xx during device-code poll
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Real-world deploy hit a 502 on the third poll after admin approval
(Forgejo SSH-key registration upstream blip). Server-side the
device_code row was rolled back to approved + unconsumed so a retry
would succeed, but the workstation surfaced ``CloudAPIError`` →
``Enrollment failed`` and bailed.

Adds a small retry budget on transient errors inside the poll loop
in ``WorkstationSetup.from_device_code``: a ``CloudNetworkError`` or
``CloudAPIError`` no longer terminates the flow — it sleeps the
configured ``interval_seconds`` and re-polls. After
``_POLL_TRANSIENT_RETRY_LIMIT`` (= 6) consecutive transient failures
the original exception propagates so a persistent outage doesn't
spin forever.

The terminal RFC 8628 states (denied / expired / fingerprint
rejected) still propagate immediately. Pending state still resets
the transient counter.

Tests:
- ``test_transient_502_during_poll_is_retried`` — pending → 502 →
  502 → 200, three sleeps, success.
- ``test_persistent_5xx_eventually_propagates`` —
  ``RETRY_LIMIT + 2`` consecutive 502s → CloudAPIError raised.

138 → 140 cloud tests, all green.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 pychron/cloud/workstation_setup.py   | 35 +++++++++++++++++++
 test/cloud/test_device_code_setup.py | 51 ++++++++++++++++++++++++++++
 2 files changed, 86 insertions(+)

diff --git a/pychron/cloud/workstation_setup.py b/pychron/cloud/workstation_setup.py
index a06e875d2..aa05a2541 100644
--- a/pychron/cloud/workstation_setup.py
+++ b/pychron/cloud/workstation_setup.py
@@ -53,6 +53,7 @@
     CloudDeviceCodeExpired,
     CloudDeviceCodePending,
     CloudFingerprintRejected,
+    CloudNetworkError,
     poll_device_code,
     register_ssh_key,
     revoke_workstation_token,
@@ -82,6 +83,15 @@
 
 logger = logging.getLogger(__name__)
 
+# Retry budget for transient poll failures. The server-side mint
+# (Forgejo bot create + ssh-key add) is a multi-step network
+# operation that occasionally trips on upstream timeouts. The
+# device-code row stays approved + unconsumed when the mint rolls
+# back, so the workstation can retry without re-bothering the admin.
+# After this many CONSECUTIVE transient failures we give up and
+# surface the error so the operator knows something is broken.
+_POLL_TRANSIENT_RETRY_LIMIT = 6
+
 
 class WorkstationSetupError(Exception):
     """Raised when onboarding cannot complete."""
@@ -184,6 +194,7 @@ def from_device_code(
 
         interval = max(1, int(start.interval_seconds or 5))
         cancel = should_cancel or (lambda: False)
+        transient_failures = 0
 
         while True:
             if cancel():
@@ -191,6 +202,7 @@ def from_device_code(
             try:
                 success = poll_device_code(api_base_url, start.device_code)
             except CloudDeviceCodePending:
+                transient_failures = 0
                 sleep(interval)
                 continue
             except CloudDeviceCodeDenied:
@@ -199,6 +211,29 @@ def from_device_code(
                 raise
             except CloudFingerprintRejected:
                 raise
+            except (CloudNetworkError, CloudAPIError) as exc:
+                # 5xx / network blip during the mint rolls back the
+                # device_code on the server (still approved + unconsumed),
+                # so re-polling is the right move. Retry with the same
+                # cadence; bail after ``_POLL_TRANSIENT_RETRY_LIMIT``
+                # consecutive failures so a persistent outage doesn't
+                # spin forever.
+                transient_failures += 1
+                if transient_failures > _POLL_TRANSIENT_RETRY_LIMIT:
+                    logger.warning(
+                        "device-code poll: %d consecutive transient failures, " "giving up: %s",
+                        transient_failures,
+                        exc,
+                    )
+                    raise
+                logger.info(
+                    "device-code poll transient failure %d/%d: %s",
+                    transient_failures,
+                    _POLL_TRANSIENT_RETRY_LIMIT,
+                    exc,
+                )
+                sleep(interval)
+                continue
             break
 
         if not success.api_token:
diff --git a/test/cloud/test_device_code_setup.py b/test/cloud/test_device_code_setup.py
index 3a21789c0..33777191f 100644
--- a/test/cloud/test_device_code_setup.py
+++ b/test/cloud/test_device_code_setup.py
@@ -235,6 +235,57 @@ def test_keyring_failure_raises_typed_error_token_not_in_str(self):
         # catching the broader type.
         self.assertIsInstance(cm.exception, workstation_setup.WorkstationSetupError)
 
+    def test_transient_502_during_poll_is_retried(self):
+        """A 502 (Forgejo upstream blip during mint) leaves the
+        device-code row approved + unconsumed on the server, so the
+        workstation should retry instead of giving up. After a few
+        retries the next poll succeeds and the enrollment completes
+        normally."""
+        with (
+            patch.object(api_client.requests, "post") as post,
+            patch.object(workstation_setup, "keyring_set_token", return_value=True),
+        ):
+            post.side_effect = [
+                _resp(201, _START_BODY),
+                _resp(425, {}),
+                _resp(502, {"detail": "forgejo upstream error: ..."}),
+                _resp(502, {"detail": "forgejo upstream error: ..."}),
+                _resp(200, _poll_body()),
+            ]
+            sleeps = []
+            setup = workstation_setup.WorkstationSetup.from_device_code(
+                self.URL,
+                on_user_code=lambda *a: None,
+                sleep=lambda s: sleeps.append(s),
+                host="testhost",
+            )
+        # Three sleeps: pending, 502, 502 — final 200 commits.
+        self.assertEqual(sleeps, [1, 1, 1])
+        self.assertEqual(setup.api_token, "pcy_NMGRL_xyz")
+
+    def test_persistent_5xx_eventually_propagates(self):
+        """If the upstream stays broken past the retry budget, the
+        ``CloudAPIError`` must propagate so the UI can surface
+        ``Enrollment failed`` rather than spinning forever."""
+        # Build a side_effect long enough to trip the budget: 1 start
+        # + (RETRY_LIMIT + 2) consecutive 502s.
+        budget = workstation_setup._POLL_TRANSIENT_RETRY_LIMIT
+        responses = [_resp(201, _START_BODY)] + [
+            _resp(502, {"detail": "forgejo upstream error"}) for _ in range(budget + 2)
+        ]
+        with (
+            patch.object(api_client.requests, "post") as post,
+            patch.object(workstation_setup, "keyring_set_token", return_value=True),
+        ):
+            post.side_effect = responses
+            with self.assertRaises(api_client.CloudAPIError):
+                workstation_setup.WorkstationSetup.from_device_code(
+                    self.URL,
+                    on_user_code=lambda *a: None,
+                    sleep=lambda s: None,
+                    host="testhost",
+                )
+
     def test_empty_api_base_url_aborts_before_any_io(self):
         with patch.object(api_client.requests, "post") as post:
             with self.assertRaises(workstation_setup.WorkstationSetupError):

From 076c32b3e6a531e60f7415596bbc4c0012b1788a Mon Sep 17 00:00:00 2001
From: jakeross <jirhiker@gmail.com>
Date: Sun, 10 May 2026 11:21:36 -0600
Subject: [PATCH 2/2] diag(cloud): log full CloudAPIError message in enrollment
 worker
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The retry path inside ``from_device_code`` already logs the full
exception, but the outer worker only emitted ``type(exc).__name__``
per the security-review precedent for ``WorkstationSetupError`` —
which can carry a keyring-recovery token in its message.

``CloudAPIError`` does NOT carry token material (no ``api_token`` is
ever stuffed into a CloudAPIError; it only holds the upstream HTTP
status + truncated response body). Splitting the except branches
lets the worker log the full ``CloudAPIError`` message while keeping
the ``WorkstationSetupError`` path type-only for safety.

Useful for diagnosing transient backend failures like the
2026-05-10 17:03 UTC Forgejo upstream blip where workstation logs
only said ``CloudAPIError`` with no detail.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 pychron/cloud/tasks/preferences.py | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/pychron/cloud/tasks/preferences.py b/pychron/cloud/tasks/preferences.py
index e90bc19f9..5c67e2d09 100644
--- a/pychron/cloud/tasks/preferences.py
+++ b/pychron/cloud/tasks/preferences.py
@@ -291,7 +291,21 @@ def _enrollment_worker(self):
             )
             GUI.invoke_later(self._apply_keyring_recovery, exc.lab_name, exc.api_token)
             return
-        except (CloudAPIError, WorkstationSetupError) as exc:
+        except CloudAPIError as exc:
+            # CloudAPIError carries upstream HTTP status + response body
+            # (e.g. ``device-code poll returned HTTP 502: forgejo upstream
+            # error: ...``). Safe to log in full — no plaintext bearer
+            # tokens cross this surface (those are stripped from
+            # ``DeviceCodePollSuccess.raw`` and live on
+            # ``KeyringWriteFailedError.api_token``, which is caught
+            # earlier).
+            logger.warning("device-code enrollment failed: %s", exc)
+            GUI.invoke_later(self._apply_enrollment_terminal, "Enrollment failed", "red")
+            return
+        except WorkstationSetupError as exc:
+            # ``WorkstationSetupError`` subclasses (esp.
+            # ``KeyringWriteFailedError`` — already caught above) may
+            # carry a token in ``str(exc)``; log only the type name.
             logger.warning("device-code enrollment failed: %s", type(exc).__name__)
             GUI.invoke_later(self._apply_enrollment_terminal, "Enrollment failed", "red")
             return