diff --git a/pychron/cloud/tasks/preferences.py b/pychron/cloud/tasks/preferences.py index e90bc19f9..5c67e2d09 100644 --- a/pychron/cloud/tasks/preferences.py +++ b/pychron/cloud/tasks/preferences.py @@ -291,7 +291,21 @@ def _enrollment_worker(self): ) GUI.invoke_later(self._apply_keyring_recovery, exc.lab_name, exc.api_token) return - except (CloudAPIError, WorkstationSetupError) as exc: + except CloudAPIError as exc: + # CloudAPIError carries upstream HTTP status + response body + # (e.g. ``device-code poll returned HTTP 502: forgejo upstream + # error: ...``). Safe to log in full — no plaintext bearer + # tokens cross this surface (those are stripped from + # ``DeviceCodePollSuccess.raw`` and live on + # ``KeyringWriteFailedError.api_token``, which is caught + # earlier). + logger.warning("device-code enrollment failed: %s", exc) + GUI.invoke_later(self._apply_enrollment_terminal, "Enrollment failed", "red") + return + except WorkstationSetupError as exc: + # ``WorkstationSetupError`` subclasses (esp. + # ``KeyringWriteFailedError`` — already caught above) may + # carry a token in ``str(exc)``; log only the type name. logger.warning("device-code enrollment failed: %s", type(exc).__name__) GUI.invoke_later(self._apply_enrollment_terminal, "Enrollment failed", "red") return diff --git a/pychron/cloud/workstation_setup.py b/pychron/cloud/workstation_setup.py index a06e875d2..aa05a2541 100644 --- a/pychron/cloud/workstation_setup.py +++ b/pychron/cloud/workstation_setup.py @@ -53,6 +53,7 @@ CloudDeviceCodeExpired, CloudDeviceCodePending, CloudFingerprintRejected, + CloudNetworkError, poll_device_code, register_ssh_key, revoke_workstation_token, @@ -82,6 +83,15 @@ logger = logging.getLogger(__name__) +# Retry budget for transient poll failures. The server-side mint +# (Forgejo bot create + ssh-key add) is a multi-step network +# operation that occasionally trips on upstream timeouts. The +# device-code row stays approved + unconsumed when the mint rolls +# back, so the workstation can retry without re-bothering the admin. +# After this many CONSECUTIVE transient failures we give up and +# surface the error so the operator knows something is broken. +_POLL_TRANSIENT_RETRY_LIMIT = 6 + class WorkstationSetupError(Exception): """Raised when onboarding cannot complete.""" @@ -184,6 +194,7 @@ def from_device_code( interval = max(1, int(start.interval_seconds or 5)) cancel = should_cancel or (lambda: False) + transient_failures = 0 while True: if cancel(): @@ -191,6 +202,7 @@ def from_device_code( try: success = poll_device_code(api_base_url, start.device_code) except CloudDeviceCodePending: + transient_failures = 0 sleep(interval) continue except CloudDeviceCodeDenied: @@ -199,6 +211,29 @@ def from_device_code( raise except CloudFingerprintRejected: raise + except (CloudNetworkError, CloudAPIError) as exc: + # 5xx / network blip during the mint rolls back the + # device_code on the server (still approved + unconsumed), + # so re-polling is the right move. Retry with the same + # cadence; bail after ``_POLL_TRANSIENT_RETRY_LIMIT`` + # consecutive failures so a persistent outage doesn't + # spin forever. + transient_failures += 1 + if transient_failures > _POLL_TRANSIENT_RETRY_LIMIT: + logger.warning( + "device-code poll: %d consecutive transient failures, " "giving up: %s", + transient_failures, + exc, + ) + raise + logger.info( + "device-code poll transient failure %d/%d: %s", + transient_failures, + _POLL_TRANSIENT_RETRY_LIMIT, + exc, + ) + sleep(interval) + continue break if not success.api_token: diff --git a/test/cloud/test_device_code_setup.py b/test/cloud/test_device_code_setup.py index 3a21789c0..33777191f 100644 --- a/test/cloud/test_device_code_setup.py +++ b/test/cloud/test_device_code_setup.py @@ -235,6 +235,57 @@ def test_keyring_failure_raises_typed_error_token_not_in_str(self): # catching the broader type. self.assertIsInstance(cm.exception, workstation_setup.WorkstationSetupError) + def test_transient_502_during_poll_is_retried(self): + """A 502 (Forgejo upstream blip during mint) leaves the + device-code row approved + unconsumed on the server, so the + workstation should retry instead of giving up. After a few + retries the next poll succeeds and the enrollment completes + normally.""" + with ( + patch.object(api_client.requests, "post") as post, + patch.object(workstation_setup, "keyring_set_token", return_value=True), + ): + post.side_effect = [ + _resp(201, _START_BODY), + _resp(425, {}), + _resp(502, {"detail": "forgejo upstream error: ..."}), + _resp(502, {"detail": "forgejo upstream error: ..."}), + _resp(200, _poll_body()), + ] + sleeps = [] + setup = workstation_setup.WorkstationSetup.from_device_code( + self.URL, + on_user_code=lambda *a: None, + sleep=lambda s: sleeps.append(s), + host="testhost", + ) + # Three sleeps: pending, 502, 502 — final 200 commits. + self.assertEqual(sleeps, [1, 1, 1]) + self.assertEqual(setup.api_token, "pcy_NMGRL_xyz") + + def test_persistent_5xx_eventually_propagates(self): + """If the upstream stays broken past the retry budget, the + ``CloudAPIError`` must propagate so the UI can surface + ``Enrollment failed`` rather than spinning forever.""" + # Build a side_effect long enough to trip the budget: 1 start + # + (RETRY_LIMIT + 2) consecutive 502s. + budget = workstation_setup._POLL_TRANSIENT_RETRY_LIMIT + responses = [_resp(201, _START_BODY)] + [ + _resp(502, {"detail": "forgejo upstream error"}) for _ in range(budget + 2) + ] + with ( + patch.object(api_client.requests, "post") as post, + patch.object(workstation_setup, "keyring_set_token", return_value=True), + ): + post.side_effect = responses + with self.assertRaises(api_client.CloudAPIError): + workstation_setup.WorkstationSetup.from_device_code( + self.URL, + on_user_code=lambda *a: None, + sleep=lambda s: None, + host="testhost", + ) + def test_empty_api_base_url_aborts_before_any_io(self): with patch.object(api_client.requests, "post") as post: with self.assertRaises(workstation_setup.WorkstationSetupError):