Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 15 additions & 1 deletion pychron/cloud/tasks/preferences.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,7 +291,21 @@ def _enrollment_worker(self):
)
GUI.invoke_later(self._apply_keyring_recovery, exc.lab_name, exc.api_token)
return
except (CloudAPIError, WorkstationSetupError) as exc:
except CloudAPIError as exc:
# CloudAPIError carries upstream HTTP status + response body
# (e.g. ``device-code poll returned HTTP 502: forgejo upstream
# error: ...``). Safe to log in full — no plaintext bearer
# tokens cross this surface (those are stripped from
# ``DeviceCodePollSuccess.raw`` and live on
# ``KeyringWriteFailedError.api_token``, which is caught
# earlier).
logger.warning("device-code enrollment failed: %s", exc)
GUI.invoke_later(self._apply_enrollment_terminal, "Enrollment failed", "red")
return
except WorkstationSetupError as exc:
# ``WorkstationSetupError`` subclasses (esp.
# ``KeyringWriteFailedError`` — already caught above) may
# carry a token in ``str(exc)``; log only the type name.
logger.warning("device-code enrollment failed: %s", type(exc).__name__)
GUI.invoke_later(self._apply_enrollment_terminal, "Enrollment failed", "red")
return
Expand Down
35 changes: 35 additions & 0 deletions pychron/cloud/workstation_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@
CloudDeviceCodeExpired,
CloudDeviceCodePending,
CloudFingerprintRejected,
CloudNetworkError,
poll_device_code,
register_ssh_key,
revoke_workstation_token,
Expand Down Expand Up @@ -82,6 +83,15 @@

logger = logging.getLogger(__name__)

# Retry budget for transient poll failures. The server-side mint
# (Forgejo bot create + ssh-key add) is a multi-step network
# operation that occasionally trips on upstream timeouts. The
# device-code row stays approved + unconsumed when the mint rolls
# back, so the workstation can retry without re-bothering the admin.
# After this many CONSECUTIVE transient failures we give up and
# surface the error so the operator knows something is broken.
_POLL_TRANSIENT_RETRY_LIMIT = 6


class WorkstationSetupError(Exception):
"""Raised when onboarding cannot complete."""
Expand Down Expand Up @@ -184,13 +194,15 @@ def from_device_code(

interval = max(1, int(start.interval_seconds or 5))
cancel = should_cancel or (lambda: False)
transient_failures = 0

while True:
if cancel():
raise DeviceEnrollmentCancelled("enrollment cancelled by user")
try:
success = poll_device_code(api_base_url, start.device_code)
except CloudDeviceCodePending:
transient_failures = 0
sleep(interval)
continue
except CloudDeviceCodeDenied:
Expand All @@ -199,6 +211,29 @@ def from_device_code(
raise
except CloudFingerprintRejected:
raise
except (CloudNetworkError, CloudAPIError) as exc:
# 5xx / network blip during the mint rolls back the
# device_code on the server (still approved + unconsumed),
# so re-polling is the right move. Retry with the same
# cadence; bail after ``_POLL_TRANSIENT_RETRY_LIMIT``
# consecutive failures so a persistent outage doesn't
# spin forever.
transient_failures += 1
if transient_failures > _POLL_TRANSIENT_RETRY_LIMIT:
logger.warning(
"device-code poll: %d consecutive transient failures, " "giving up: %s",
transient_failures,
exc,
)
raise
logger.info(
"device-code poll transient failure %d/%d: %s",
transient_failures,
_POLL_TRANSIENT_RETRY_LIMIT,
exc,
)
sleep(interval)
continue
break

if not success.api_token:
Expand Down
51 changes: 51 additions & 0 deletions test/cloud/test_device_code_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,57 @@ def test_keyring_failure_raises_typed_error_token_not_in_str(self):
# catching the broader type.
self.assertIsInstance(cm.exception, workstation_setup.WorkstationSetupError)

def test_transient_502_during_poll_is_retried(self):
"""A 502 (Forgejo upstream blip during mint) leaves the
device-code row approved + unconsumed on the server, so the
workstation should retry instead of giving up. After a few
retries the next poll succeeds and the enrollment completes
normally."""
with (
patch.object(api_client.requests, "post") as post,
patch.object(workstation_setup, "keyring_set_token", return_value=True),
):
post.side_effect = [
_resp(201, _START_BODY),
_resp(425, {}),
_resp(502, {"detail": "forgejo upstream error: ..."}),
_resp(502, {"detail": "forgejo upstream error: ..."}),
_resp(200, _poll_body()),
]
sleeps = []
setup = workstation_setup.WorkstationSetup.from_device_code(
self.URL,
on_user_code=lambda *a: None,
sleep=lambda s: sleeps.append(s),
host="testhost",
)
# Three sleeps: pending, 502, 502 — final 200 commits.
self.assertEqual(sleeps, [1, 1, 1])
self.assertEqual(setup.api_token, "pcy_NMGRL_xyz")

def test_persistent_5xx_eventually_propagates(self):
"""If the upstream stays broken past the retry budget, the
``CloudAPIError`` must propagate so the UI can surface
``Enrollment failed`` rather than spinning forever."""
# Build a side_effect long enough to trip the budget: 1 start
# + (RETRY_LIMIT + 2) consecutive 502s.
budget = workstation_setup._POLL_TRANSIENT_RETRY_LIMIT
responses = [_resp(201, _START_BODY)] + [
_resp(502, {"detail": "forgejo upstream error"}) for _ in range(budget + 2)
]
with (
patch.object(api_client.requests, "post") as post,
patch.object(workstation_setup, "keyring_set_token", return_value=True),
):
post.side_effect = responses
with self.assertRaises(api_client.CloudAPIError):
workstation_setup.WorkstationSetup.from_device_code(
self.URL,
on_user_code=lambda *a: None,
sleep=lambda s: None,
host="testhost",
)

def test_empty_api_base_url_aborts_before_any_io(self):
with patch.object(api_client.requests, "post") as post:
with self.assertRaises(workstation_setup.WorkstationSetupError):
Expand Down