From 65866583b727fdbfd5a00d6783e8b073556a4ece Mon Sep 17 00:00:00 2001 From: jakeross Date: Sun, 10 May 2026 11:17:22 -0600 Subject: [PATCH 1/2] feat(cloud): retry transient 5xx during device-code poll MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Real-world deploy hit a 502 on the third poll after admin approval (Forgejo SSH-key registration upstream blip). Server-side the device_code row was rolled back to approved + unconsumed so a retry would succeed, but the workstation surfaced ``CloudAPIError`` → ``Enrollment failed`` and bailed. Adds a small retry budget on transient errors inside the poll loop in ``WorkstationSetup.from_device_code``: a ``CloudNetworkError`` or ``CloudAPIError`` no longer terminates the flow — it sleeps the configured ``interval_seconds`` and re-polls. After ``_POLL_TRANSIENT_RETRY_LIMIT`` (= 6) consecutive transient failures the original exception propagates so a persistent outage doesn't spin forever. The terminal RFC 8628 states (denied / expired / fingerprint rejected) still propagate immediately. Pending state still resets the transient counter. Tests: - ``test_transient_502_during_poll_is_retried`` — pending → 502 → 502 → 200, three sleeps, success. - ``test_persistent_5xx_eventually_propagates`` — ``RETRY_LIMIT + 2`` consecutive 502s → CloudAPIError raised. 138 → 140 cloud tests, all green. Co-Authored-By: Claude Opus 4.7 (1M context) --- pychron/cloud/workstation_setup.py | 35 +++++++++++++++++++ test/cloud/test_device_code_setup.py | 51 ++++++++++++++++++++++++++++ 2 files changed, 86 insertions(+) diff --git a/pychron/cloud/workstation_setup.py b/pychron/cloud/workstation_setup.py index a06e875d2..aa05a2541 100644 --- a/pychron/cloud/workstation_setup.py +++ b/pychron/cloud/workstation_setup.py @@ -53,6 +53,7 @@ CloudDeviceCodeExpired, CloudDeviceCodePending, CloudFingerprintRejected, + CloudNetworkError, poll_device_code, register_ssh_key, revoke_workstation_token, @@ -82,6 +83,15 @@ logger = logging.getLogger(__name__) +# Retry budget for transient poll failures. The server-side mint +# (Forgejo bot create + ssh-key add) is a multi-step network +# operation that occasionally trips on upstream timeouts. The +# device-code row stays approved + unconsumed when the mint rolls +# back, so the workstation can retry without re-bothering the admin. +# After this many CONSECUTIVE transient failures we give up and +# surface the error so the operator knows something is broken. +_POLL_TRANSIENT_RETRY_LIMIT = 6 + class WorkstationSetupError(Exception): """Raised when onboarding cannot complete.""" @@ -184,6 +194,7 @@ def from_device_code( interval = max(1, int(start.interval_seconds or 5)) cancel = should_cancel or (lambda: False) + transient_failures = 0 while True: if cancel(): @@ -191,6 +202,7 @@ def from_device_code( try: success = poll_device_code(api_base_url, start.device_code) except CloudDeviceCodePending: + transient_failures = 0 sleep(interval) continue except CloudDeviceCodeDenied: @@ -199,6 +211,29 @@ def from_device_code( raise except CloudFingerprintRejected: raise + except (CloudNetworkError, CloudAPIError) as exc: + # 5xx / network blip during the mint rolls back the + # device_code on the server (still approved + unconsumed), + # so re-polling is the right move. Retry with the same + # cadence; bail after ``_POLL_TRANSIENT_RETRY_LIMIT`` + # consecutive failures so a persistent outage doesn't + # spin forever. + transient_failures += 1 + if transient_failures > _POLL_TRANSIENT_RETRY_LIMIT: + logger.warning( + "device-code poll: %d consecutive transient failures, " "giving up: %s", + transient_failures, + exc, + ) + raise + logger.info( + "device-code poll transient failure %d/%d: %s", + transient_failures, + _POLL_TRANSIENT_RETRY_LIMIT, + exc, + ) + sleep(interval) + continue break if not success.api_token: diff --git a/test/cloud/test_device_code_setup.py b/test/cloud/test_device_code_setup.py index 3a21789c0..33777191f 100644 --- a/test/cloud/test_device_code_setup.py +++ b/test/cloud/test_device_code_setup.py @@ -235,6 +235,57 @@ def test_keyring_failure_raises_typed_error_token_not_in_str(self): # catching the broader type. self.assertIsInstance(cm.exception, workstation_setup.WorkstationSetupError) + def test_transient_502_during_poll_is_retried(self): + """A 502 (Forgejo upstream blip during mint) leaves the + device-code row approved + unconsumed on the server, so the + workstation should retry instead of giving up. After a few + retries the next poll succeeds and the enrollment completes + normally.""" + with ( + patch.object(api_client.requests, "post") as post, + patch.object(workstation_setup, "keyring_set_token", return_value=True), + ): + post.side_effect = [ + _resp(201, _START_BODY), + _resp(425, {}), + _resp(502, {"detail": "forgejo upstream error: ..."}), + _resp(502, {"detail": "forgejo upstream error: ..."}), + _resp(200, _poll_body()), + ] + sleeps = [] + setup = workstation_setup.WorkstationSetup.from_device_code( + self.URL, + on_user_code=lambda *a: None, + sleep=lambda s: sleeps.append(s), + host="testhost", + ) + # Three sleeps: pending, 502, 502 — final 200 commits. + self.assertEqual(sleeps, [1, 1, 1]) + self.assertEqual(setup.api_token, "pcy_NMGRL_xyz") + + def test_persistent_5xx_eventually_propagates(self): + """If the upstream stays broken past the retry budget, the + ``CloudAPIError`` must propagate so the UI can surface + ``Enrollment failed`` rather than spinning forever.""" + # Build a side_effect long enough to trip the budget: 1 start + # + (RETRY_LIMIT + 2) consecutive 502s. + budget = workstation_setup._POLL_TRANSIENT_RETRY_LIMIT + responses = [_resp(201, _START_BODY)] + [ + _resp(502, {"detail": "forgejo upstream error"}) for _ in range(budget + 2) + ] + with ( + patch.object(api_client.requests, "post") as post, + patch.object(workstation_setup, "keyring_set_token", return_value=True), + ): + post.side_effect = responses + with self.assertRaises(api_client.CloudAPIError): + workstation_setup.WorkstationSetup.from_device_code( + self.URL, + on_user_code=lambda *a: None, + sleep=lambda s: None, + host="testhost", + ) + def test_empty_api_base_url_aborts_before_any_io(self): with patch.object(api_client.requests, "post") as post: with self.assertRaises(workstation_setup.WorkstationSetupError): From 076c32b3e6a531e60f7415596bbc4c0012b1788a Mon Sep 17 00:00:00 2001 From: jakeross Date: Sun, 10 May 2026 11:21:36 -0600 Subject: [PATCH 2/2] diag(cloud): log full CloudAPIError message in enrollment worker MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The retry path inside ``from_device_code`` already logs the full exception, but the outer worker only emitted ``type(exc).__name__`` per the security-review precedent for ``WorkstationSetupError`` — which can carry a keyring-recovery token in its message. ``CloudAPIError`` does NOT carry token material (no ``api_token`` is ever stuffed into a CloudAPIError; it only holds the upstream HTTP status + truncated response body). Splitting the except branches lets the worker log the full ``CloudAPIError`` message while keeping the ``WorkstationSetupError`` path type-only for safety. Useful for diagnosing transient backend failures like the 2026-05-10 17:03 UTC Forgejo upstream blip where workstation logs only said ``CloudAPIError`` with no detail. Co-Authored-By: Claude Opus 4.7 (1M context) --- pychron/cloud/tasks/preferences.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/pychron/cloud/tasks/preferences.py b/pychron/cloud/tasks/preferences.py index e90bc19f9..5c67e2d09 100644 --- a/pychron/cloud/tasks/preferences.py +++ b/pychron/cloud/tasks/preferences.py @@ -291,7 +291,21 @@ def _enrollment_worker(self): ) GUI.invoke_later(self._apply_keyring_recovery, exc.lab_name, exc.api_token) return - except (CloudAPIError, WorkstationSetupError) as exc: + except CloudAPIError as exc: + # CloudAPIError carries upstream HTTP status + response body + # (e.g. ``device-code poll returned HTTP 502: forgejo upstream + # error: ...``). Safe to log in full — no plaintext bearer + # tokens cross this surface (those are stripped from + # ``DeviceCodePollSuccess.raw`` and live on + # ``KeyringWriteFailedError.api_token``, which is caught + # earlier). + logger.warning("device-code enrollment failed: %s", exc) + GUI.invoke_later(self._apply_enrollment_terminal, "Enrollment failed", "red") + return + except WorkstationSetupError as exc: + # ``WorkstationSetupError`` subclasses (esp. + # ``KeyringWriteFailedError`` — already caught above) may + # carry a token in ``str(exc)``; log only the type name. logger.warning("device-code enrollment failed: %s", type(exc).__name__) GUI.invoke_later(self._apply_enrollment_terminal, "Enrollment failed", "red") return