diff --git a/.changeset/quiet-foxes-skip-handshake.md b/.changeset/quiet-foxes-skip-handshake.md new file mode 100644 index 00000000..4c2fe7a4 --- /dev/null +++ b/.changeset/quiet-foxes-skip-handshake.md @@ -0,0 +1,5 @@ +--- +"partyserver": patch +--- + +Don't reciprocate the WebSocket close handshake when the runtime delivers a reserved close code (`1005`, `1006`, `1015`). These codes are synthesized by the runtime when the peer didn't actually send a Close frame — there is no handshake to complete. The earlier `0.5.4` change (`#393`) normalized these to `1000` and tried to send a reciprocating Close frame anyway; in cross-isolate transports (notably WebSocket pairs that flow back through Durable Object RPC, e.g. Cloudflare Agents sub-agents) the reciprocation would succeed synchronously but schedule an outbound write on a transport whose peer was already gone. The runtime then rejected that write asynchronously with `Network connection lost`, escaping `closeQuietly`'s synchronous `try`/`catch` and surfacing as an unhandled promise rejection in tests and production logs. The fix is to skip the reciprocation for reserved codes — there's nothing to acknowledge to a peer that didn't speak. The narrow user-visible behavior change: a client that calls `ws.close()` with no code on compat dates `< 2026-04-07` (no auto-reply) will now observe a non-clean close instead of a clean `1000` close, because the framework no longer fabricates a reciprocation. Clients that pass an explicit close code are unaffected. diff --git a/package-lock.json b/package-lock.json index 29738fd2..16a6660f 100644 --- a/package-lock.json +++ b/package-lock.json @@ -11380,7 +11380,7 @@ "devDependencies": { "@cloudflare/workers-types": "^4.20260424.1", "hono": "^4.12.15", - "partyserver": "^0.5.3" + "partyserver": "^0.5.4" }, "peerDependencies": { "@cloudflare/workers-types": "^4.20260424.1", @@ -11519,7 +11519,7 @@ "license": "ISC", "devDependencies": { "@cloudflare/workers-types": "^4.20260424.1", - "partyserver": "^0.5.3", + "partyserver": "^0.5.4", "partysocket": "^1.1.18" }, "peerDependencies": { @@ -11538,7 +11538,7 @@ "devDependencies": { "@cloudflare/workers-types": "^4.20260424.1", "partyfn": "^0.1.0", - "partyserver": "^0.5.3", + "partyserver": "^0.5.4", "partysocket": "^1.1.18" }, "peerDependencies": { @@ -11581,7 +11581,7 @@ "license": "ISC", "dependencies": { "cron-parser": "^5.5.0", - "partyserver": "^0.5.3" + "partyserver": "^0.5.4" } }, "packages/y-partyserver": { @@ -11597,7 +11597,7 @@ "@cloudflare/workers-types": "^4.20260424.1", "@types/lodash.debounce": "^4.0.9", "@types/node": "25.6.0", - "partyserver": "^0.5.3", + "partyserver": "^0.5.4", "ws": "^8.20.0", "yjs": "^13.6.30" }, diff --git a/packages/hono-party/package.json b/packages/hono-party/package.json index f0d87b4c..f28f268c 100644 --- a/packages/hono-party/package.json +++ b/packages/hono-party/package.json @@ -37,6 +37,6 @@ "devDependencies": { "@cloudflare/workers-types": "^4.20260424.1", "hono": "^4.12.15", - "partyserver": "^0.5.3" + "partyserver": "^0.5.4" } } diff --git a/packages/partyserver/CHANGELOG.md b/packages/partyserver/CHANGELOG.md index 41498fa2..3aedfeb9 100644 --- a/packages/partyserver/CHANGELOG.md +++ b/packages/partyserver/CHANGELOG.md @@ -19,7 +19,6 @@ The fix is at the call site, not in PartyServer: pass `id: someBoundDONamespace.idFromName(facetName)` to `ctx.facets.get(...)`. The facet then gets its own native `ctx.id.name === facetName` and PartyServer's `name` getter does the right thing automatically. No `setName()` is required, no `__ps_name` storage record is written, and cold-wake recovery happens for free because the factory re-runs and `idFromName` is deterministic. This release adds: - - **A "Using PartyServer with Durable Object Facets" section in the README** that walks through the recommended pattern with a code example, calls out the implicit-id footgun explicitly, and documents that plain-string `id` values are not a substitute for `idFromName(facetName)` (workerd treats string ids as `idFromString`-like, so the resulting facet has no `ctx.id.name`). - **`setName()` docstring updated** to clarify that facets are NOT a `setName()` use case — point to the explicit-`id` pattern instead. The original `setName()` `ctx.id.name` mismatch throw is preserved as a typo guard for the `idFromName` happy path. - **End-to-end facet test coverage** against the real workerd `ctx.facets.get(...)` API. A `FacetParent` / `FacetChild` fixture exercises both the implicit-id path (pinning the runtime contract that `this.name` returns the parent's name in that flow — i.e., behavior-as-documentation so framework authors are unsurprised) and the explicit-id path (recommended; verifies that all reasonable id-construction strategies work and that cold wake recovers without any storage record). Plain-string `id` is also tested; the test asserts it does NOT carry a name, pinning the contract so callers don't get tempted by the type signature. @@ -48,7 +47,6 @@ ``` Backward compatible: - - For DOs addressed via `idFromName()` / `getByName()` (the happy path), `setName()` continues to NOT write storage — `ctx.id.name` is the source of truth and `setName()` is just a no-op-plus-onStart. - The pre-existing direct-storage-write pattern keeps working — the storage write becomes idempotent with what `setName()` would do. @@ -63,7 +61,6 @@ 0.5.0 moved the legacy storage hydrate into `alarm()` only, breaking Cloudflare Agents facets and any other framework that writes `__ps_name` directly before calling `__unsafe_ensureInitialized()`. Facet DOs are spawned via `ctx.facets.get(...)` rather than `idFromName()` and therefore have `ctx.id.name === undefined`; they relied on PartyServer reading the storage record back to populate `this.name` before `onStart()`. Changes: - - Move the legacy `__ps_name` hydrate from `alarm()` into `#ensureInitialized()`, still gated on `!ctx.id.name && !#_name` so it costs nothing on the happy path (normal `idFromName()`/`getByName()` DOs skip the storage read entirely). - `Server.fetch()` now delegates to `#ensureInitialized()` for the hydrate instead of doing its own. The `x-partykit-room` header fallback remains as a last resort when neither `ctx.id.name` nor a legacy storage record is available. - `Server.alarm()` is simplified — it no longer needs its own hydrate call since `#ensureInitialized()` handles it. @@ -78,7 +75,6 @@ Durable Objects now expose `ctx.id.name` on every entry point (constructor, fetch, alarm, hibernating websocket handlers) when the DO is addressed via `idFromName()`/`getByName()`. PartyServer now uses this as the primary source of `this.name`, which simplifies routing, eliminates storage writes, and makes `this.name` available inside the constructor. Changes in `partyserver`: - - `this.name` resolves from `this.ctx.id.name`. The apologetic `workerd#2240` error message is gone. - `this.name` is now available **inside the constructor** and from class field initializers, not just after `setName()`/`fetch()` has run. - `routePartykitRequest` no longer issues a `setName()`/`_initAndFetch()` RPC before `fetch()`. The WebSocket path goes from 2 RPCs to 1; the HTTP path remains 1 RPC. Props, when supplied, are delivered to the DO via the `x-partykit-props` request header, set after `onBeforeConnect`/`onBeforeRequest` hooks run. @@ -90,7 +86,6 @@ - When reading `this.name` throws, it is because `ctx.id.name` is undefined and no legacy fallback has populated the name: the DO was addressed via `idFromString()` or `newUniqueId()` (both unsupported), the runtime is too old to expose `ctx.id.name`, or a pre-2026-03-15 alarm fired before the legacy storage fallback ran. Changes in all affected packages (`partyserver`, `partysub`, `partysync`, `y-partyserver`, `hono-party`): - - `@cloudflare/workers-types` peer dependency bumped from `^4.20240729.0` to `^4.20260424.1`. The old range predates `ctx.id.name` in the type surface. Not supported: addressing PartyServer DOs via `idFromString()` or `newUniqueId()`. These paths return `ctx.id.name === undefined` inside the DO and will surface as a clear error from `this.name`. PartyServer has always assumed name-based addressing via `getServerByName` / `routePartykitRequest`; this release makes that assumption explicit. @@ -411,14 +406,12 @@ ### Patch Changes - [`528adea`](https://github.com/threepointone/partyserver/commit/528adeaced6dce6e888d2f54cc75c3569bf2c277) Thanks [@threepointone](https://github.com/threepointone)! - some fixes and tweaks - - getServerByName was throwing on all requests - `Env` is now an optional arg when defining `Server` - `y-partyserver/provider` can now take an optional `prefix` arg to use a custom url to connect - `routePartyKitRequest`/`getServerByName` now accepts `jurisdiction` bonus: - - added a bunch of fixtures - added stubs for docs diff --git a/packages/partyserver/src/index.ts b/packages/partyserver/src/index.ts index 6d022b0d..ab17cd43 100644 --- a/packages/partyserver/src/index.ts +++ b/packages/partyserver/src/index.ts @@ -23,31 +23,50 @@ export type WSMessage = ArrayBuffer | ArrayBufferView | string; const NAME_STORAGE_KEY = "__ps_name"; /** - * Reserved WebSocket close codes that cannot be sent in a Close frame. - * - 1005 (NoStatusReceived) — stand-in for "no code in close frame". - * - 1006 (AbnormalClosure) — synthesized for connections that drop - * without a Close frame. - * - 1015 (TLSHandshake) — TLS failure indicator. + * Reserved WebSocket close codes the runtime synthesizes when there + * was no real Close frame from the peer: + * - 1005 (NoStatusReceived) — peer's frame had no status code. + * - 1006 (AbnormalClosure) — peer dropped the underlying transport + * without sending a Close frame at all. + * - 1015 (TLSHandshake) — TLS failure during connection setup. * - * If we observe one of these in `webSocketClose`, normalize it to 1000 - * before reciprocating, otherwise `ws.close(code, reason)` will throw. + * These cannot legally appear in an outgoing Close frame, and — more + * importantly for our reciprocation path — there is no peer left to + * receive a reciprocating Close frame. Trying to send one anyway can + * succeed synchronously but fail asynchronously inside the runtime + * with "WebSocket peer disconnected" / "Network connection lost", + * which escapes a synchronous try/catch and surfaces as an unhandled + * promise rejection. */ -function normalizeCloseCode(code: number): number { - if (code === 1005 || code === 1006 || code === 1015) return 1000; - return code; +function isReservedCloseCode(code: number): boolean { + return code === 1005 || code === 1006 || code === 1015; } /** * Reciprocate a peer-initiated Close frame to complete the handshake. * - * Best-effort: swallows errors from invalid codes, oversize reasons, or - * sockets that have already been closed by user code. Used by both the - * hibernating and non-hibernating close handlers to ensure the close - * handshake always completes. + * Best-effort: swallows synchronous errors from invalid codes, + * oversize reasons, or sockets that have already been closed by user + * code. Skips the reciprocation entirely when the peer didn't + * actually send a Close frame (reserved codes 1005/1006/1015) — in + * those cases the underlying transport is already gone and writing + * to it would fail asynchronously, which we can't catch here. + * + * Used by both the hibernating and non-hibernating close handlers to + * ensure the close handshake always completes when there is one to + * complete. */ function closeQuietly(ws: WebSocket, code: number, reason: string): void { + // No real Close frame from the peer → nothing to reciprocate. + // Calling `ws.close(...)` here would synchronously succeed but + // schedule an outbound write on a dead transport, which the runtime + // would later reject with "Network connection lost". That rejection + // can't be observed from here (it's not thrown synchronously and + // ws.close() doesn't return a Promise to attach a `.catch` to), so + // it would surface as an unhandled rejection. + if (isReservedCloseCode(code)) return; try { - ws.close(normalizeCloseCode(code), reason); + ws.close(code, reason); } catch { // Reasons we end up here: // - the socket was already closed (user called `connection.close()` diff --git a/packages/partyserver/src/tests/index.test.ts b/packages/partyserver/src/tests/index.test.ts index dec99c9e..94e7ed98 100644 --- a/packages/partyserver/src/tests/index.test.ts +++ b/packages/partyserver/src/tests/index.test.ts @@ -1652,6 +1652,65 @@ describe("Close handshake (#389)", () => { expect([1000, 4000]).toContain(event.code); }); + it("skips reciprocation when the peer didn't send a real Close frame (reserved codes)", async () => { + // Regression for the followup to #393: when the runtime calls + // `webSocketClose` with a reserved synthetic code (1005 / 1006 + // / 1015), the peer didn't actually send a Close frame — these + // codes can only be SYNTHESIZED locally. Calling `ws.close(...)` + // to "reciprocate" anyway succeeds synchronously but schedules + // an outbound write whose target may already be gone (notably + // when the WebSocket pair is tunneled across Durable Object + // RPC boundaries). The runtime later rejects that write with + // `Network connection lost`, which escapes `closeQuietly`'s + // synchronous try/catch and surfaces as an unhandled rejection. + // + // The fix is to recognize the reserved-code shape and skip the + // reciprocation entirely. There is nothing to acknowledge. + // + // What we verify here: + // 1. `onClose` IS still invoked with the reserved code, so + // user-side cleanup runs as expected. + // 2. The test file completes without any unhandled rejection. + // Without the fix, an integration scenario that puts the + // server-side socket on a different isolate from the + // client (e.g. a Cloudflare Agents facet) would emit + // `Network connection lost` here as a vitest "Errors 1" + // entry alongside the otherwise-passing test. + // + // We cannot easily reproduce the cross-isolate transport + // failure inside this in-process test runner, but we CAN + // reliably synthesize a code-1005 arrival on the server by + // having the client send a Close frame with no code (`ws.close()` + // with no args). That exercises the same `closeQuietly` + // branch without depending on a particular runtime topology. + const ws = await openAndHandshake( + "http://example.com/parties/close-handshake-hibernating/reserved-room" + ); + ws.close(); // no code → server receives webSocketClose with code=1005 + + // We don't await `waitForClose(ws)` — with reciprocation skipped + // the client may not see a clean close on older compat dates + // (the runtime's auto-reply flag isn't active until 2026-04-07). + // What matters is that the server's `onClose` ran with the + // reserved code and no unhandled rejection escaped. + await new Promise((resolve) => setTimeout(resolve, 200)); + + const ctx = createExecutionContext(); + const probe = await worker.fetch( + new Request( + "http://example.com/parties/close-handshake-hibernating/reserved-room" + ), + env, + ctx + ); + const body = (await probe.json()) as { + lastClose: { code: number; wasClean: boolean } | null; + }; + + expect(body.lastClose).not.toBeNull(); + expect(body.lastClose!.code).toBe(1005); + }); + it("normalizes reserved close codes (1005, 1006, 1015) when reciprocating", async () => { // The runtime never delivers a 1005/1006 close *frame* to // webSocketClose (these are synthetic, used only when there is diff --git a/packages/partysub/package.json b/packages/partysub/package.json index d13ba3c7..3c6492b6 100644 --- a/packages/partysub/package.json +++ b/packages/partysub/package.json @@ -46,7 +46,7 @@ }, "devDependencies": { "@cloudflare/workers-types": "^4.20260424.1", - "partyserver": "^0.5.3", + "partyserver": "^0.5.4", "partysocket": "^1.1.18" } } diff --git a/packages/partysync/package.json b/packages/partysync/package.json index b9182d20..67f56625 100644 --- a/packages/partysync/package.json +++ b/packages/partysync/package.json @@ -51,7 +51,7 @@ "devDependencies": { "@cloudflare/workers-types": "^4.20260424.1", "partyfn": "^0.1.0", - "partyserver": "^0.5.3", + "partyserver": "^0.5.4", "partysocket": "^1.1.18" }, "peerDependencies": { diff --git a/packages/partywhen/package.json b/packages/partywhen/package.json index 7d7bb17d..123c454f 100644 --- a/packages/partywhen/package.json +++ b/packages/partywhen/package.json @@ -29,6 +29,6 @@ "description": "A library for scheduling and running tasks in Cloudflare Workers", "dependencies": { "cron-parser": "^5.5.0", - "partyserver": "^0.5.3" + "partyserver": "^0.5.4" } } diff --git a/packages/y-partyserver/package.json b/packages/y-partyserver/package.json index 5bc81e63..31a98522 100644 --- a/packages/y-partyserver/package.json +++ b/packages/y-partyserver/package.json @@ -65,7 +65,7 @@ "@cloudflare/workers-types": "^4.20260424.1", "@types/lodash.debounce": "^4.0.9", "@types/node": "25.6.0", - "partyserver": "^0.5.3", + "partyserver": "^0.5.4", "ws": "^8.20.0", "yjs": "^13.6.30" },