From 948f1300d1c366d226c5286b32a779d0809d0068 Mon Sep 17 00:00:00 2001 From: Cascade Bot Date: Mon, 27 Apr 2026 11:39:49 +0000 Subject: [PATCH 01/22] feat(alerting): add configurable PM results list for alerting agent + enable UX --- src/agents/definitions/alerting.yaml | 6 +- src/agents/shared/promptContext.ts | 10 ++ src/backends/secretOrchestrator.ts | 15 +++ src/sentry/integration.ts | 8 ++ .../unit/agents/shared/promptContext.test.ts | 81 ++++++++++++ tests/unit/sentry/integration.test.ts | 40 ++++++ .../projects/integration-alerting-tab.tsx | 122 +++++++++++++++++- .../components/projects/integration-form.tsx | 6 +- 8 files changed, 283 insertions(+), 5 deletions(-) diff --git a/src/agents/definitions/alerting.yaml b/src/agents/definitions/alerting.yaml index 7cc638aa..9277e7f9 100644 --- a/src/agents/definitions/alerting.yaml +++ b/src/agents/definitions/alerting.yaml @@ -58,9 +58,12 @@ prompts: 2. Read the relevant source files to understand context and confirm the root cause 3. Summarize: what failed, why, and which code path is responsible <% if (it.backlogListId) { %> - 4. Create a bug fix work item in the backlog (list/status ID: <%= it.backlogListId %>) with: + 4. Create a bug fix investigation work item in the backlog (list/status ID: <%= it.backlogListId %>) with: - Title: short, actionable description (e.g. "Fix: NullPointerException in PaymentService.charge") - Description: root cause, error details, affected file/function, link to the alert issue + <% } else if (it.workItemId) { %> + 4. Post a comment on the current work item (<%= it.workItemId %>) summarising the investigation: + root cause, affected file/function, and recommended fix approach <% } %> 5. Call Finish when done @@ -69,3 +72,4 @@ hint: | Focus on frames from application code (not third-party library frames). Check the event timeline/breadcrumbs for the action or request that preceded the failure. Keep the bug work item description concise and actionable. + If no backlog list is configured, post investigation findings as a comment on the triggering work item. diff --git a/src/agents/shared/promptContext.ts b/src/agents/shared/promptContext.ts index c27d7a79..1b4dcc10 100644 --- a/src/agents/shared/promptContext.ts +++ b/src/agents/shared/promptContext.ts @@ -51,6 +51,10 @@ function getPromptTerminology(pmType: string | undefined) { * Shared by the llmist agent lifecycle (agents/base.ts) and the adapter * (backends/adapter.ts) so both backends use consistent prompt context * building logic including PM-type normalization and work item noun i18n. + * + * @param alertingResultsContainerId - Optional PM container ID from Sentry integration config. + * Used as a fallback `backlogListId` when no PM backlog is configured on the project. + * Populated by `secretOrchestrator` for alerting agent runs. */ export function buildPromptContext( workItemId: string | undefined, @@ -64,17 +68,23 @@ export function buildPromptContext( originalWorkItemUrl: string; detectedAgentType: string; }, + alertingResultsContainerId?: string, ): PromptContext { const pmProvider = getPMProviderOrNull(); const listIds = getListIds(project); const terminology = getPromptTerminology(pmProvider?.type); + // Fall back to the Sentry-configured results container when no PM backlog is set. + const backlogListId = + listIds.backlogListId ?? (alertingResultsContainerId ? alertingResultsContainerId : undefined); + return { workItemId, workItemUrl: workItemId && pmProvider ? 
pmProvider.getWorkItemUrl(workItemId) : undefined, projectId: project.id, baseBranch: project.baseBranch, ...listIds, + backlogListId, pmType: pmProvider?.type, ...terminology, maxInFlightItems: project.maxInFlightItems ?? 1, diff --git a/src/backends/secretOrchestrator.ts b/src/backends/secretOrchestrator.ts index 405a010d..c2f39442 100644 --- a/src/backends/secretOrchestrator.ts +++ b/src/backends/secretOrchestrator.ts @@ -9,6 +9,7 @@ import type { createAgentLogger } from '../agents/utils/logging.js'; import { mergeEngineSettings } from '../config/engineSettings.js'; import { loadPartials } from '../db/repositories/partialsRepository.js'; import { withGitHubToken } from '../github/client.js'; +import { getSentryIntegrationConfig } from '../sentry/integration.js'; import type { AgentInput, CascadeConfig, ProjectConfig } from '../types/index.js'; import { getDashboardUrl } from '../utils/runLink.js'; import { createNativeToolRuntimeArtifacts } from './nativeToolRuntime.js'; @@ -59,11 +60,25 @@ export async function buildExecutionPlan( } : undefined; + // For alerting agents, look up Sentry `resultsContainerId` as a fallback + // backlogListId when no PM backlog status/list is configured on the project. + let alertingResultsContainerId: string | undefined; + if (agentType === 'alerting') { + try { + const sentryConfig = await getSentryIntegrationConfig(project.id); + alertingResultsContainerId = sentryConfig?.resultsContainerId; + } catch { + // Non-fatal — proceed without the fallback container + } + } + const promptContext: PromptContext = buildPromptContext( workItemId, project, input.triggerType, prContext, + undefined, + alertingResultsContainerId, ); // Load DB partials for template include resolution diff --git a/src/sentry/integration.ts b/src/sentry/integration.ts index 54acdb18..ddbc1459 100644 --- a/src/sentry/integration.ts +++ b/src/sentry/integration.ts @@ -14,6 +14,11 @@ import { getIntegrationByProjectAndCategory } from '../db/repositories/integrati export interface SentryIntegrationConfig { /** Sentry organization slug (e.g. "my-company") */ organizationSlug: string; + /** + * PM container ID where the alerting agent creates investigation work items. + * Maps to `backlogListId` in the prompt context when no PM backlog is configured. + */ + resultsContainerId?: string; } // ============================================================================ @@ -35,5 +40,8 @@ export async function getSentryIntegrationConfig( return { organizationSlug: config.organizationSlug, + ...(typeof config.resultsContainerId === 'string' + ? 
{ resultsContainerId: config.resultsContainerId } + : {}), }; } diff --git a/tests/unit/agents/shared/promptContext.test.ts b/tests/unit/agents/shared/promptContext.test.ts index a7346125..5016b4fc 100644 --- a/tests/unit/agents/shared/promptContext.test.ts +++ b/tests/unit/agents/shared/promptContext.test.ts @@ -467,6 +467,87 @@ describe('buildPromptContext', () => { }); }); + describe('with alertingResultsContainerId fallback', () => { + beforeEach(() => { + const mockProvider = createMockPMProvider(); + mockProvider.type = 'trello'; + mockProvider.getWorkItemUrl = vi.fn((id: string) => `https://trello.com/c/${id}`); + mockGetPMProvider.mockReturnValue(mockProvider); + }); + + it('uses alertingResultsContainerId as backlogListId when PM backlog is not set', () => { + const projectWithoutBacklog = makeProject({ + trello: { + boardId: 'board1', + lists: { + splitting: 'list1', + planning: 'list2', + todo: 'list3', + // no backlog + inProgress: 'list-in-progress', + inReview: 'list-in-review', + merged: 'list-merged', + }, + labels: { readyToProcess: 'label1', processed: 'label2' }, + }, + }); + const ctx = buildPromptContext( + 'card123', + projectWithoutBacklog as never, + undefined, + undefined, + undefined, + 'sentry-container-id', + ); + expect(ctx.backlogListId).toBe('sentry-container-id'); + }); + + it('PM backlogListId takes precedence over alertingResultsContainerId', () => { + const ctx = buildPromptContext( + 'card123', + makeProject() as never, + undefined, + undefined, + undefined, + 'sentry-container-id', + ); + // makeProject has trello.lists.backlog = 'list-backlog' + expect(ctx.backlogListId).toBe('list-backlog'); + }); + + it('alertingResultsContainerId is ignored when it is undefined', () => { + const ctx = buildPromptContext( + 'card123', + makeProject() as never, + undefined, + undefined, + undefined, + undefined, + ); + // still gets backlog from PM config + expect(ctx.backlogListId).toBe('list-backlog'); + }); + + it('backlogListId remains undefined when neither PM backlog nor alertingResultsContainerId is set', () => { + const projectWithoutBacklog = makeProject({ + trello: { + boardId: 'board1', + lists: { + splitting: 'list1', + planning: 'list2', + todo: 'list3', + inProgress: 'list-in-progress', + inReview: 'list-in-review', + merged: 'list-merged', + }, + labels: { readyToProcess: 'label1', processed: 'label2' }, + }, + }); + const ctx = buildPromptContext('card123', projectWithoutBacklog as never); + expect(ctx.backlogListId).toBeUndefined(); + }); + }); + describe('without PM provider (no PM context — e.g. 
debug agent from dashboard)', () => { beforeEach(() => { mockGetPMProvider.mockReturnValue(null); diff --git a/tests/unit/sentry/integration.test.ts b/tests/unit/sentry/integration.test.ts index 08264b05..aba67428 100644 --- a/tests/unit/sentry/integration.test.ts +++ b/tests/unit/sentry/integration.test.ts @@ -96,6 +96,46 @@ describe('sentry/integration', () => { expect(result).toEqual({ organizationSlug: 'acme-corp' }); }); + it('returns resultsContainerId when present in config', async () => { + mockGetIntegrationByProjectAndCategory.mockResolvedValueOnce({ + id: 'int-3', + provider: 'sentry', + config: { organizationSlug: 'my-org', resultsContainerId: 'list-backlog-123' }, + }); + + const result = await getSentryIntegrationConfig('proj-3'); + + expect(result).toEqual({ + organizationSlug: 'my-org', + resultsContainerId: 'list-backlog-123', + }); + }); + + it('omits resultsContainerId when absent from config', async () => { + mockGetIntegrationByProjectAndCategory.mockResolvedValueOnce({ + id: 'int-4', + provider: 'sentry', + config: { organizationSlug: 'my-org' }, + }); + + const result = await getSentryIntegrationConfig('proj-4'); + + expect(result).toEqual({ organizationSlug: 'my-org' }); + expect(result?.resultsContainerId).toBeUndefined(); + }); + + it('omits resultsContainerId when it is not a string', async () => { + mockGetIntegrationByProjectAndCategory.mockResolvedValueOnce({ + id: 'int-5', + provider: 'sentry', + config: { organizationSlug: 'my-org', resultsContainerId: 42 }, + }); + + const result = await getSentryIntegrationConfig('proj-5'); + + expect(result?.resultsContainerId).toBeUndefined(); + }); + it('queries using projectId and alerting category', async () => { mockGetIntegrationByProjectAndCategory.mockResolvedValueOnce(null); diff --git a/web/src/components/projects/integration-alerting-tab.tsx b/web/src/components/projects/integration-alerting-tab.tsx index 3363f497..257dad6c 100644 --- a/web/src/components/projects/integration-alerting-tab.tsx +++ b/web/src/components/projects/integration-alerting-tab.tsx @@ -2,7 +2,7 @@ * Alerting (Sentry) integration tab component. */ import { useMutation, useQuery, useQueryClient } from '@tanstack/react-query'; -import { Trash2 } from 'lucide-react'; +import { Info, Trash2 } from 'lucide-react'; import { useState } from 'react'; import { CopyButton } from '@/components/ui/copy-button.js'; import { Input } from '@/components/ui/input.js'; @@ -11,6 +11,77 @@ import { API_URL } from '@/lib/api.js'; import { trpc, trpcClient } from '@/lib/trpc.js'; import { ProjectSecretField } from './project-secret-field.js'; +// ============================================================================ +// PM Container Picker +// ============================================================================ + +interface ContainerPickerProps { + projectId: string; + pmProvider: string; + value: string; + onChange: (id: string) => void; +} + +function PMContainerPicker({ projectId, pmProvider, value, onChange }: ContainerPickerProps) { + const containersMutation = useMutation({ + mutationFn: async () => { + return (await trpcClient.pm.discovery.discover.mutate({ + providerId: pmProvider, + capability: 'containers', + args: {}, + projectId, + })) as Array<{ id: string; name: string }>; + }, + }); + + return ( +
+
+ + +
+ {containersMutation.isError && ( +

{containersMutation.error.message}

+ )} +

+ Or enter the ID manually:{' '} + onChange(e.target.value)} + placeholder="container-id" + className="ml-1 inline-block h-6 rounded border border-input bg-background px-2 text-xs" + /> +

+
+ ); +} + // ============================================================================ // Alerting Tab (Sentry) // ============================================================================ @@ -18,15 +89,20 @@ import { ProjectSecretField } from './project-secret-field.js'; interface AlertingTabProps { projectId: string; alertingIntegration?: Record; + /** PM provider slug (e.g. "trello", "jira", "linear") when a PM integration is configured. */ + pmProvider?: string; } -export function AlertingTab({ projectId, alertingIntegration }: AlertingTabProps) { +export function AlertingTab({ projectId, alertingIntegration, pmProvider }: AlertingTabProps) { const queryClient = useQueryClient(); const existingConfig = (alertingIntegration?.config as Record) ?? {}; const [organizationSlug, setOrganizationSlug] = useState( (existingConfig.organizationSlug as string) ?? '', ); + const [resultsContainerId, setResultsContainerId] = useState( + (existingConfig.resultsContainerId as string) ?? '', + ); const [verifyResult, setVerifyResult] = useState<{ id: string; @@ -80,7 +156,10 @@ export function AlertingTab({ projectId, alertingIntegration }: AlertingTabProps projectId, category: 'alerting', provider: 'sentry', - config: { organizationSlug }, + config: { + organizationSlug, + ...(resultsContainerId ? { resultsContainerId } : {}), + }, }); }, onSuccess: () => { @@ -106,6 +185,17 @@ export function AlertingTab({ projectId, alertingIntegration }: AlertingTabProps return (
+ {/* Agent enablement info box */} +
+ +
+ Enable the Alerting Agent — After saving this + integration, go to the Agents tab and enable the{' '} + alerting agent type so Sentry alerts trigger + investigation runs automatically. +
+
+ {/* Organization Slug */}
@@ -123,6 +213,32 @@ export function AlertingTab({ projectId, alertingIntegration }: AlertingTabProps
+ {/* Investigation Results List */} +
+ +

+ The PM list or status where the alerting agent creates investigation work items. Used as + the target container when the agent creates bug fix cards. +

+ {pmProvider ? ( + + ) : ( + setResultsContainerId(e.target.value)} + placeholder="List ID or status name (configure PM integration to use a picker)" + /> + )} +
+ +
+ {/* Credentials */}
diff --git a/web/src/components/projects/integration-form.tsx b/web/src/components/projects/integration-form.tsx index 328bed69..f5901ee5 100644 --- a/web/src/components/projects/integration-form.tsx +++ b/web/src/components/projects/integration-form.tsx @@ -99,7 +99,11 @@ export function IntegrationForm({ projectId }: { projectId: string }) { {activeTab === 'scm' && } {activeTab === 'alerting' && ( - + )}
); From 738cb817eda37fde1cc1eaa8a9485e226a89c8da Mon Sep 17 00:00:00 2001 From: Cascade Bot Date: Mon, 27 Apr 2026 15:46:59 +0000 Subject: [PATCH 02/22] fix(alerting): map PM provider to correct discovery capability in container picker MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the non-existent `containers` capability (which caused NOT_IMPLEMENTED errors for every provider) with provider-specific mappings: Trello→`boards`, JIRA→`projects`, Linear→`teams`. Disable the Fetch button with an explanatory tooltip for unknown providers. Also simplify the `alertingResultsContainerId ?? undefined` ternary to a plain `??` expression in promptContext.ts. Co-Authored-By: Claude Sonnet 4.6 --- src/agents/shared/promptContext.ts | 3 +-- .../projects/integration-alerting-tab.tsx | 26 +++++++++++++++++-- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/src/agents/shared/promptContext.ts b/src/agents/shared/promptContext.ts index 1b4dcc10..72412f97 100644 --- a/src/agents/shared/promptContext.ts +++ b/src/agents/shared/promptContext.ts @@ -75,8 +75,7 @@ export function buildPromptContext( const terminology = getPromptTerminology(pmProvider?.type); // Fall back to the Sentry-configured results container when no PM backlog is set. - const backlogListId = - listIds.backlogListId ?? (alertingResultsContainerId ? alertingResultsContainerId : undefined); + const backlogListId = listIds.backlogListId ?? alertingResultsContainerId; return { workItemId, diff --git a/web/src/components/projects/integration-alerting-tab.tsx b/web/src/components/projects/integration-alerting-tab.tsx index 257dad6c..187d2880 100644 --- a/web/src/components/projects/integration-alerting-tab.tsx +++ b/web/src/components/projects/integration-alerting-tab.tsx @@ -22,12 +22,33 @@ interface ContainerPickerProps { onChange: (id: string) => void; } +/** + * Maps a PM provider slug to its discovery capability that returns container-like items. + * Trello → "boards", JIRA → "projects", Linear → "teams". + * Falls back to undefined (disabling the fetch button) for unknown providers. + */ +function containerCapabilityForProvider( + provider: string, +): 'boards' | 'projects' | 'teams' | undefined { + const map: Record = { + trello: 'boards', + jira: 'projects', + linear: 'teams', + }; + return map[provider]; +} + function PMContainerPicker({ projectId, pmProvider, value, onChange }: ContainerPickerProps) { + const capability = containerCapabilityForProvider(pmProvider); + const containersMutation = useMutation({ mutationFn: async () => { + if (!capability) { + throw new Error(`No container discovery capability mapped for provider "${pmProvider}"`); + } return (await trpcClient.pm.discovery.discover.mutate({ providerId: pmProvider, - capability: 'containers', + capability, args: {}, projectId, })) as Array<{ id: string; name: string }>; @@ -59,7 +80,8 @@ function PMContainerPicker({ projectId, pmProvider, value, onChange }: Container
{containersMutation.isError && ( @@ -96,7 +143,7 @@ function PMContainerPicker({ projectId, pmProvider, value, onChange }: Container type="text" value={value} onChange={(e) => onChange(e.target.value)} - placeholder="container-id" + placeholder="list-id or status-name" className="ml-1 inline-block h-6 rounded border border-input bg-background px-2 text-xs" />

@@ -104,6 +151,49 @@ function PMContainerPicker({ projectId, pmProvider, value, onChange }: Container ); } +/** + * Renders the Investigation Results container input. + * Shows a `PMContainerPicker` when the provider supports dropdown discovery, + * or falls back to a plain text `Input` for Trello and unconfigured projects. + * Extracted to keep `AlertingTab`'s cognitive complexity below the project limit. + */ +function ContainerInput({ + projectId, + pmProvider, + pmConfig, + value, + onChange, +}: { + projectId: string; + pmProvider: string | undefined; + pmConfig: Record | undefined; + value: string; + onChange: (v: string) => void; +}) { + if (pmProvider && providerPickerConfig(pmProvider)) { + return ( + + ); + } + const placeholder = pmProvider + ? `Enter list ID manually (no picker available for ${pmProvider})` + : 'List ID or status name (configure PM integration to use a picker)'; + return ( + onChange(e.target.value)} + placeholder={placeholder} + /> + ); +} + // ============================================================================ // Alerting Tab (Sentry) // ============================================================================ @@ -113,9 +203,16 @@ interface AlertingTabProps { alertingIntegration?: Record; /** PM provider slug (e.g. "trello", "jira", "linear") when a PM integration is configured. */ pmProvider?: string; + /** The project's existing PM integration config, used to drive the container picker. */ + pmConfig?: Record; } -export function AlertingTab({ projectId, alertingIntegration, pmProvider }: AlertingTabProps) { +export function AlertingTab({ + projectId, + alertingIntegration, + pmProvider, + pmConfig, +}: AlertingTabProps) { const queryClient = useQueryClient(); const existingConfig = (alertingIntegration?.config as Record) ?? {}; @@ -242,21 +339,13 @@ export function AlertingTab({ projectId, alertingIntegration, pmProvider }: Aler The PM list or status where the alerting agent creates investigation work items. Used as the target container when the agent creates bug fix cards.

- {pmProvider ? ( - - ) : ( - setResultsContainerId(e.target.value)} - placeholder="List ID or status name (configure PM integration to use a picker)" - /> - )} +

diff --git a/web/src/components/projects/integration-form.tsx b/web/src/components/projects/integration-form.tsx index f5901ee5..12bab445 100644 --- a/web/src/components/projects/integration-form.tsx +++ b/web/src/components/projects/integration-form.tsx @@ -103,6 +103,7 @@ export function IntegrationForm({ projectId }: { projectId: string }) { projectId={projectId} alertingIntegration={alertingIntegration} pmProvider={pmIntegration ? pmProvider : undefined} + pmConfig={pmIntegration ? (pmIntegration.config as Record) : undefined} /> )} From 6abf45e86319570a23ceedc6af06b72ce091937e Mon Sep 17 00:00:00 2001 From: Zbigniew Sobiecki Date: Wed, 29 Apr 2026 10:21:41 +0000 Subject: [PATCH 05/22] docs(spec): 017 router-silent-failure-hardening + plans Three production hardening fixes derived from the 2026-04-29 24h log/webhook audit. Discovered alongside PR #1220 (workItemId-on-respond-to-* fix). Failure modes: A) Linear PM-ack silently skipped on PM-focused agents (24/day) B) Pipeline-capacity gate fails open on every PM status-changed (32/day) C) Progress-comment double-delete race produces 404 log spam (72/day) Decomposed per spec strategic decision #8 into three independent plans (none blocks another): 1-pm-ack-coverage (consolidate dispatch via manifest registry) 2-capacity-gate-pm-scope (shared adapter helper, fail-closed semantics) 3-progress-comment-double-delete (consumed-flag, gated fallback, 404-DEBUG) This commit ships scaffolding only. Plan 3's implementation lands in this branch; plans 1 and 2 follow as their own PRs. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../1-pm-ack-coverage.md | 210 +++++++++++++++++ .../2-capacity-gate-pm-scope.md | 223 ++++++++++++++++++ .../3-progress-comment-double-delete.md | 210 +++++++++++++++++ .../_coverage.md | 54 +++++ .../017-router-silent-failure-hardening.md | 182 ++++++++++++++ 5 files changed, 879 insertions(+) create mode 100644 docs/plans/017-router-silent-failure-hardening/1-pm-ack-coverage.md create mode 100644 docs/plans/017-router-silent-failure-hardening/2-capacity-gate-pm-scope.md create mode 100644 docs/plans/017-router-silent-failure-hardening/3-progress-comment-double-delete.md create mode 100644 docs/plans/017-router-silent-failure-hardening/_coverage.md create mode 100644 docs/specs/017-router-silent-failure-hardening.md diff --git a/docs/plans/017-router-silent-failure-hardening/1-pm-ack-coverage.md b/docs/plans/017-router-silent-failure-hardening/1-pm-ack-coverage.md new file mode 100644 index 00000000..ba4be3b2 --- /dev/null +++ b/docs/plans/017-router-silent-failure-hardening/1-pm-ack-coverage.md @@ -0,0 +1,210 @@ +--- +id: 017 +slug: router-silent-failure-hardening +plan: 1 +plan_slug: pm-ack-coverage +level: plan +parent_spec: docs/specs/017-router-silent-failure-hardening.md +depends_on: [] +status: pending +--- + +# 017/1: PM-ack dispatch coverage (Linear branch missing) + +> Part 1 of 3 in the 017-router-silent-failure-hardening plan. See [parent spec](../../specs/017-router-silent-failure-hardening.md). + +## Summary + +This plan fixes failure mode A from spec 017: PM-focused agents (e.g. `backlog-manager`) triggered from a GitHub webhook against a Linear-based project never get their PM-side acknowledgment comment posted, because the router-side dispatch helper `postPMAck` in `src/router/adapters/github.ts:48-66` branches on `pmType` literal strings and lacks the `linear` case. 
A near-identical helper at `src/triggers/shared/pm-ack.ts:35-49` (`postPMAckComment`) does have the linear branch — the bug is parallel-path drift between two helpers that should have been one. 24 silent skips per day in production (`WARN: Unknown PM type for PM-focused agent ack, skipping`), all from Linear-based projects (`ucho` being the most active). + +The fix consolidates both helpers into one path that consumes `PMProviderManifest.platformClientFactory` from `src/integrations/pm/registry.ts` directly, eliminating per-PM-type literal branching from the dispatch surface entirely. Both call sites (`postPMAck` in `github.ts`, `postPMAckComment` in `pm-ack.ts`) become thin wrappers that delegate to the new helper. Adding a future PM provider to the manifest registry will Just Work for ack dispatch — no edit needed in either call site. + +To prevent regression of the bug class, this plan extends the existing PM manifest conformance harness at `tests/unit/integrations/pm-conformance.test.ts` with one new `it()` per registered provider asserting the new helper successfully dispatches against that provider's `platformClientFactory`. Adding a future PM provider whose factory is misconfigured fails CI loudly. + +The "truly unknown PM type" branch (a project pinned to a PM type that's not in the registry — configuration error) converts from `WARN: Unknown PM type` (silent skip) to ERROR-level + Sentry capture under stable tag `pm_ack_unknown_pm_type`, mirroring the spec-015 `wedged_lock_canary` precedent. + +This plan is independent of plans 2 and 3 — they touch different files and address different failure modes. Sequencing is the implementer's call. + +**Components delivered:** +- New consolidated PM-ack dispatch helper at `src/router/pm-ack-dispatch.ts` (or sibling — final path is `/implement`'s call within the spirit of the contract). Consumes the manifest registry; no `pmType` literal branching. +- Migration of `src/router/adapters/github.ts:postPMAck` to call the new helper. +- Migration of `src/triggers/shared/pm-ack.ts:postPMAckComment` to call the new helper. +- Conversion of the "Unknown PM type" branch from WARN+skip to ERROR+Sentry capture under the stable tag `pm_ack_unknown_pm_type`. +- Conformance-harness extension at `tests/unit/integrations/pm-conformance.test.ts` with one assertion per registered provider. +- Doc update in `src/integrations/README.md` describing the "PM-type dispatch coverage invariant". + +**Deferred to later plans in this spec:** +- Failure mode B (capacity-gate PM scope) — Plan 2. +- Failure mode C (progress-comment double-delete) — Plan 3. + +--- + +## Spec ACs satisfied by this plan + +- Spec AC #1 (Linear-based PM-focused agent posts visible ack comment) — **full** +- Spec AC #2 (CI fails when adding a new PM provider not wired through dispatch) — **full** +- Spec AC #3 (`Unknown PM type` WARN → Sentry-captured error on routine path) — **full** +- Spec AC #10 (24h log volume of all three WARNs drops to <1/24h) — **partial chain** (this plan eliminates the `Unknown PM type for PM-focused agent ack, skipping` line; plans 2 and 3 each eliminate their own WARN. Final volume verification is post-deploy.) + +--- + +## Depends On + +None. Independent of plans 2 and 3 in this spec. + +--- + +## Detailed Task List (TDD) + +### 1. 
Consolidated PM-ack dispatch helper + +**Tests first** (`tests/unit/router/pm-ack-dispatch.test.ts` — new file): + +- `dispatchPMAck — Trello: invokes platformClientFactory(projectId).postComment(workItemId, message) and returns { commentId, message }` — unit — mock the Trello manifest's factory to return a fake client whose `postComment` returns `'comment-trello-123'`; assert the helper returns `{ commentId: 'comment-trello-123', message }`. Expected red: `Error: Cannot find module '../../../src/router/pm-ack-dispatch.js'`. + +- `dispatchPMAck — JIRA: same shape, returns numeric or string id from platformClientFactory` — unit — mock JIRA factory to return `'jira-789'`; assert helper returns `{ commentId: 'jira-789', message }`. Expected red: `Error: Cannot find module ...`. + +- `dispatchPMAck — Linear: same shape (the failure mode A regression pin)` — unit — mock Linear factory to return `'linear-id-uuid'`; assert helper returns `{ commentId: 'linear-id-uuid', message }`. Expected red: `Error: Cannot find module ...`. After implementation: this is the regression pin proving Linear is reachable from the consolidated helper, which the legacy `postPMAck` did not deliver. + +- `dispatchPMAck — returns undefined when platformClientFactory.postComment returns null` — unit — mock factory whose postComment returns `null` (the existing failure-shape contract per `PlatformCommentClient.postComment`); assert helper returns `undefined`. Expected red: `AssertionError: expected { commentId: null, ... } to be undefined`. + +- `dispatchPMAck — unknown pmType (not in registry): logs at ERROR, captures Sentry with tag pm_ack_unknown_pm_type, returns undefined` — unit — pass `pmType: 'asana'` (not registered); assert (a) `logger.error` called with the message `Unknown PM type for PM-focused agent ack`; (b) `captureException` called with options containing `tags: { source: 'pm_ack_unknown_pm_type' }`; (c) helper returns `undefined`. Expected red: `expected logger.error to have been called` (today only `logger.warn` is called). + +- `dispatchPMAck — does not branch on pmType literal strings` — static — read the helper's source file and assert the strings `=== 'trello'`, `=== 'jira'`, `=== 'linear'` do not appear within the body of the helper. Why: the structural invariant of this plan is "no per-PM-type code in dispatch". A future maintainer who adds an `if (pmType === 'asana')` branch should fail this test. Expected red: passes once the helper consumes the registry; would fail if implementation cheats and re-introduces literal branching. + +**Implementation** (`src/router/pm-ack-dispatch.ts` — new file): + +- Function signature: `dispatchPMAck(opts: { projectId: string; workItemId: string; pmType: string | undefined; message: string; agentType?: string }): Promise<{ commentId: string | number; message: string } | undefined>`. +- Resolves the manifest via `getPMProvider(pmType)` from `src/integrations/pm/registry.ts` (or `listPMProviders().find((m) => m.id === pmType)` — final lookup shape is the implementer's call, but it must NOT branch on string literals). +- If no manifest is found: `logger.error('Unknown PM type for PM-focused agent ack', { agentType, pmType })`, `captureException(new Error('Unknown PM type for PM-focused agent ack'), { tags: { source: 'pm_ack_unknown_pm_type' }, extra: { agentType, pmType } })`, return `undefined`. 
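Putting this error branch together with the happy path in the next bullet, a minimal sketch of the helper follows. It is a sketch only; the registry lookup (`getPMProvider(pmType)` returning the manifest or `undefined`) and the import paths are assumptions the implementer may change.

```ts
import { getPMProvider } from '../integrations/pm/registry.js'; // assumed registry lookup; final shape is the implementer's call
import { captureException } from './sentry.js'; // assumed path; reuse the wedged_lock_canary capture helper
import { logger } from '../utils/logging.js'; // assumed path

export async function dispatchPMAck(opts: {
  projectId: string;
  workItemId: string;
  pmType: string | undefined;
  message: string;
  agentType?: string;
}): Promise<{ commentId: string | number; message: string } | undefined> {
  const { projectId, workItemId, pmType, message, agentType } = opts;

  // Index the manifest registry directly; no per-PM-type literal branching here.
  const manifest = pmType ? getPMProvider(pmType) : undefined;
  if (!manifest) {
    // Configuration error, not a routine skip: ERROR + Sentry under the stable tag.
    logger.error('Unknown PM type for PM-focused agent ack', { agentType, pmType });
    captureException(new Error('Unknown PM type for PM-focused agent ack'), {
      tags: { source: 'pm_ack_unknown_pm_type' },
      extra: { agentType, pmType },
    });
    return undefined;
  }

  const client = manifest.platformClientFactory(projectId);
  const commentId = await client.postComment(workItemId, message);
  if (commentId == null) return undefined;
  return { commentId, message };
}
```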
+- Otherwise: `const client = manifest.platformClientFactory(projectId); const commentId = await client.postComment(workItemId, message); if (commentId == null) return undefined; return { commentId, message };`. +- Sentry capture: import the existing `captureException` helper used by spec-015's `wedged_lock_canary` site at `src/router/active-workers.ts` to keep tag conventions aligned. + +### 2. Migrate `postPMAck` in `src/router/adapters/github.ts` + +**Tests first** (`tests/unit/router/adapters/github-postPMAck.test.ts` — new file, OR extend the existing github-adapter test if one covers postPMAck): + +- `postPMAck — delegates to dispatchPMAck for Linear projects (regression pin)` — unit — call `postPMAck('proj-1', 'MNG-100', 'linear', 'backlog-manager', 'Working...')`; assert `dispatchPMAck` was called once with matching args; assert the return value is the dispatch helper's return value passed through. Expected red: `expected dispatchPMAck to have been called` (today the function does not call it — it has its own local trello/jira branching). + +- `postPMAck — delegates for Trello and JIRA too (no regression)` — unit — both branches still produce expected `AckResult`. Expected red: same as above for the trello/jira parts after refactor. + +- `postPMAck — does not contain pmType literal branching after migration` — static — read `src/router/adapters/github.ts`, locate the `postPMAck` function body, assert the strings `=== 'trello'`, `=== 'jira'`, `=== 'linear'` do not appear within it. Expected red: passes today (it has trello/jira branching) — fails LOUDLY if future drift re-introduces it. + +**Implementation** (`src/router/adapters/github.ts:48-66`): +- Replace the body of `postPMAck` with a call to `dispatchPMAck({ projectId, workItemId, pmType, message, agentType })` and return its result. +- Remove the local `if (pmType === 'trello') ... else if (pmType === 'jira') ... else { logger.warn ... }` chain. +- Remove the imports of `postTrelloAck`, `postJiraAck` if they become unused (verify nothing else in the file imports them). + +### 3. Migrate `postPMAckComment` in `src/triggers/shared/pm-ack.ts` + +**Tests first** (`tests/unit/triggers/shared/pm-ack.test.ts` — new file or extend existing): + +- `postPMAckComment — delegates to dispatchPMAck` — unit — assert it calls `dispatchPMAck` with the same arg shape and returns the unwrapped `commentId` (or `null` on undefined dispatch result, since the existing contract returns `string | null`). Expected red: same "expected to have been called" pattern. + +- `postPMAckComment — preserves the existing return contract (string | null)` — unit — when dispatch returns `{ commentId: 'X', message }`, postPMAckComment returns `'X'`; when dispatch returns `undefined`, postPMAckComment returns `null`. Why: existing callers in `src/triggers/github/webhook-handler.ts:maybePostPmAckComment` rely on this shape. Expected red: passes after the wrapper is correctly threaded. + +- `postPMAckComment — does not contain pmType literal branching after migration` — static — same shape as task 2's static test, applied to `pm-ack.ts`. Expected red: passes today (the file has trello/jira/linear branching) — fails on future regression. + +**Implementation** (`src/triggers/shared/pm-ack.ts:35-49`): +- Replace the body of `postPMAckComment` with `const result = await dispatchPMAck({ projectId, workItemId, pmType, message, agentType }); return result?.commentId == null ? 
null : String(result.commentId);` — the `String(...)` wrap normalizes the union return type to the existing `string | null` contract. +- Remove the literal-branching chain and the imports of `postTrelloAck`, `postJiraAck`, `postLinearAck` from `../../router/acknowledgments.js` if they become unused. + +### 4. Conformance-harness extension: every registered manifest is reachable from `dispatchPMAck` + +**Tests first** (`tests/unit/integrations/pm-conformance.test.ts` — extend the `describe.each((id, manifest))` block): + +- `dispatchPMAck — successfully dispatches against ${id}'s platformClientFactory` — unit — for each registered provider, mock the factory's `postComment` to return a deterministic id, then call `dispatchPMAck({ projectId: 'proj-test', workItemId: 'item-test', pmType: id, message: 'test' })`; assert the result is `{ commentId: , message: 'test' }`. This test exists ONCE in the harness and runs for every provider via `describe.each`. Expected red: today, calling against `id: 'linear'` returns `undefined` because dispatch goes through the legacy postPMAck which lacks the linear branch. + + Note: the harness today already iterates providers; this `it()` extends that loop. Mock the factory at the manifest level, not the underlying API client — the test asserts the dispatch path, not the API integration. + +- `dispatchPMAck — fails CI when a new provider is added to the registry without a working platformClientFactory` — unit — synthesize a stub manifest with `platformClientFactory: () => ({})` (missing methods); register it temporarily; assert `dispatchPMAck` either throws or fails the conformance check (the test is the assertion that conformance fires). Expected red: today the harness's `platformClientFactory returns a client with postComment + deleteComment methods` test already pins the shape — extend that to actually invoke postComment via dispatch. + +**Implementation** (`tests/unit/integrations/pm-conformance.test.ts`): +- Inside the existing `describe.each(providers.map((p) => [p.id, p] as const))('%s', (id, manifest) => { ... })` block, after the existing `platformClientFactory returns a client with postComment + deleteComment methods` test, add the new `dispatchPMAck — successfully dispatches ...` test. +- Stub the underlying API call surface (`platformClientFactory(projectId)` returns a mocked PlatformCommentClient with controllable `postComment`). +- The test does NOT exercise the real underlying HTTP layer — that's the integration tests' job. The test exercises the dispatch contract. + +### 5. Documentation update + +**Implementation** (`src/integrations/README.md`): +- Add a "PM-type dispatch coverage invariant" callout under the existing "Conformance harness — what CI enforces" section. One paragraph + one bullet: + + > **PM-ack dispatch coverage.** Every PM provider registered in the manifest registry must be reachable from the router-side PM-ack dispatch helper at `src/router/pm-ack-dispatch.ts`. The conformance harness asserts this by invoking `dispatchPMAck` against each registered provider's `platformClientFactory.postComment`. Failing the assertion means a new provider was added without being wired through the dispatch path — fix by ensuring the provider's manifest declares a working `platformClientFactory`. The dispatch helper itself does NOT branch on `pmType` literal strings; it indexes the registry directly. 
+ +- Update the per-provider migration-status section if applicable (no provider table edits — the invariant applies to all current and future providers). + +--- + +## Test Plan + +### Unit tests +- [ ] `tests/unit/router/pm-ack-dispatch.test.ts` (new): 6 tests — Trello/JIRA/Linear happy paths, null-from-postComment, unknown-pmType Sentry capture, no-literal-branching static check. +- [ ] `tests/unit/router/adapters/github-postPMAck.test.ts` (new): 3 tests — delegation, regression for Trello/JIRA, no-literal-branching. +- [ ] `tests/unit/triggers/shared/pm-ack.test.ts` (new or extend): 3 tests — delegation, return-contract preservation, no-literal-branching. + +### Integration / harness tests +- [ ] `tests/unit/integrations/pm-conformance.test.ts` (extend): one new `it()` inside the existing `describe.each`, runs for every registered provider (today: 3 — Trello, JIRA, Linear; harness also runs against the test fixture provider). + +### End-to-end / acceptance +- Out of plan scope. Spec AC #1 ("Linear-based PM-focused agent posts visible ack comment") is verified by the regression pin in `pm-ack-dispatch.test.ts` (the Linear happy-path test) plus the conformance harness assertion. Live verification is operator-side post-deploy — see Manual Verification. + +--- + +## Manual Verification (for `[manual]`-tagged ACs only) + +n/a — all ACs auto-tested. + +(Spec AC #10 has a post-deploy log-volume component, but the CI-verifiable proxy in this plan — "the `Unknown PM type for PM-focused agent ack, skipping` WARN message no longer fires on the routine path" — is testable via the static no-literal-branching check + the dispatch helper's positive-path tests. The 24h-volume measurement is operator-side after merge, tracked in the spec's overall AC #10 closure.) + +--- + +## Acceptance Criteria (per-plan, testable) + +1. The consolidated `dispatchPMAck` helper exists, indexes the manifest registry, does not branch on `pmType` literal strings, and successfully dispatches against every provider whose manifest declares a working `platformClientFactory`. +2. `postPMAck` in `src/router/adapters/github.ts` delegates to `dispatchPMAck` and contains no `pmType` literal branching after migration. +3. `postPMAckComment` in `src/triggers/shared/pm-ack.ts` delegates to `dispatchPMAck`, preserves its existing `string | null` return contract, and contains no `pmType` literal branching after migration. +4. PM-focused agents on Linear-based projects produce a visible PM-side ack comment via the dispatch helper (regression pin: the Linear happy-path test in `pm-ack-dispatch.test.ts`). +5. Genuinely-unknown PM types (configuration error) emit ERROR-level logs and Sentry capture under the stable tag `pm_ack_unknown_pm_type`. The legacy WARN+skip path is removed. +6. The PM manifest conformance harness asserts dispatch coverage for every registered provider; adding a future provider without a working `platformClientFactory` produces a CI failure. +7. `src/integrations/README.md` documents the dispatch coverage invariant. +8. All new/modified code has corresponding tests. +9. `npm run typecheck` passes. +10. `npm test` (full unit suite) passes. +11. `npm run lint` passes. + +--- + +## Documentation Impact (this plan only) + +| File | Change | +|---|---| +| `src/integrations/README.md` | Add "PM-ack dispatch coverage" subsection under the "Conformance harness — what CI enforces" section. Document: every registered manifest must be reachable from `dispatchPMAck`; dispatch must not branch on `pmType` literal strings. 
Reference the conformance harness as the regression net. | +| `CHANGELOG.md` | Entry: "PM-ack dispatch consolidation: Linear-based PM-focused agents now post their PM-side ack comment via a registry-consuming helper. Eliminates the silent skip on Linear projects." | + +--- + +## Out of Scope (this plan) + +- Failure mode B (capacity-gate PM scope) — Plan 2. +- Failure mode C (progress-comment double-delete) — Plan 3. +- Generalising the dispatch helper to non-PM ack flows (GitHub PR ack, alerting category, etc.). Only PM-focused agent ack dispatch from the router-after-GitHub-webhook path is migrated here. +- Migrating the legacy underlying `postTrelloAck` / `postJiraAck` / `postLinearAck` helpers in `src/router/acknowledgments.ts` (they remain as the platformClientFactory implementations consumed indirectly via the manifest registry; consolidating those is a larger refactor for a future spec). +- Any changes to the agent-side `PMIntegration.deleteAckComment` or to the gadgets that delete ack comments mid-run (those are in Plan 3's scope). +- The dashboard UI's representation of PM-side ack comments. The UX is unchanged; the fix is "the comment is posted at all on Linear". + +--- + +## Progress + + +- [ ] AC #1 +- [ ] AC #2 +- [ ] AC #3 +- [ ] AC #4 +- [ ] AC #5 +- [ ] AC #6 +- [ ] AC #7 +- [ ] AC #8 +- [ ] AC #9 +- [ ] AC #10 +- [ ] AC #11 diff --git a/docs/plans/017-router-silent-failure-hardening/2-capacity-gate-pm-scope.md b/docs/plans/017-router-silent-failure-hardening/2-capacity-gate-pm-scope.md new file mode 100644 index 00000000..d029119d --- /dev/null +++ b/docs/plans/017-router-silent-failure-hardening/2-capacity-gate-pm-scope.md @@ -0,0 +1,223 @@ +--- +id: 017 +slug: router-silent-failure-hardening +plan: 2 +plan_slug: capacity-gate-pm-scope +level: plan +parent_spec: docs/specs/017-router-silent-failure-hardening.md +depends_on: [] +status: pending +--- + +# 017/2: Pipeline-capacity-gate PM-provider scope + +> Part 2 of 3 in the 017-router-silent-failure-hardening plan. See [parent spec](../../specs/017-router-silent-failure-hardening.md). + +## Summary + +This plan fixes failure mode B from spec 017: the pipeline-capacity gate at `src/triggers/shared/pipeline-capacity-gate.ts:38` is silently no-op for every PM `status-changed` trigger because the three PM router adapters wrap `triggerRegistry.dispatch(ctx)` in their credential `AsyncLocalStorage` scope but NOT in the PM-provider scope. The gate calls `getPMProvider()`, throws on scope miss, conservatively logs `WARN: pipeline-capacity-gate: PM provider unavailable, allowing run`, and returns `false` (allow). 32 silent skips per day in production. Net effect: `maxInFlightItems` is essentially unenforced for the entire PM-source path — exactly the regression the gate was added to prevent (per the file header comment, the prior incident where three concurrent implementation runs fired against a `maxInFlightItems: 1` project after a human moved three cards into TODO simultaneously). + +The fix introduces a shared helper `withPMScopeForDispatch(project, dispatch)` that resolves the project's PM provider via the manifest registry and wraps the dispatch invocation in `withPMProvider`, mirroring the GitHub router adapter's existing correct shape at `src/router/adapters/github.ts:280`. The three PM router adapters (`linear.ts:215-242`, `trello.ts:104-130`, `jira.ts:104-132`) call this helper instead of wrapping themselves. 
Future PM router adapters consume the same helper for free; a static guard test asserts the invariant per adapter. + +Once the routine path establishes scope, the gate's "PM provider unavailable" branch becomes a real anomaly. This plan converts that branch from `WARN + return false` (allow) to `ERROR + Sentry capture under tag pipeline_capacity_gate_no_pm_provider + return true` (block). The "allow when not slot-consuming" branch (the existing early return for non-`implementation` agents) is preserved unchanged. The gate's positive path (PM provider in scope, pipeline-over-capacity check returns a real answer) is also preserved. + +This plan is independent of plans 1 and 3 — they touch different files and address different failure modes. Sequencing is the implementer's call. + +**Components delivered:** +- New shared helper `withPMScopeForDispatch(project, dispatch)` in `src/router/adapters/_shared.ts` (or sibling). Resolves PM provider via the manifest registry; wraps in `withPMProvider`; calls `dispatch`. +- Migration of `src/router/adapters/linear.ts:dispatchWithCredentials` to consume the helper. +- Migration of `src/router/adapters/trello.ts:dispatchWithCredentials` to consume the helper. +- Migration of `src/router/adapters/jira.ts:dispatchWithCredentials` to consume the helper. +- Conversion of `src/triggers/shared/pipeline-capacity-gate.ts:38` from WARN+allow to ERROR+Sentry+block under the stable tag `pipeline_capacity_gate_no_pm_provider`. +- Static guard at `tests/unit/integrations/pm-router-adapter-pm-scope.test.ts` asserting every PM router adapter consumes the shared helper (or wraps in `withPMProvider` directly within its dispatch path). +- Doc update in `CLAUDE.md` describing the capacity-gate invariant. + +**Deferred to later plans in this spec:** +- Failure mode A (PM-ack dispatch coverage) — Plan 1. +- Failure mode C (progress-comment double-delete) — Plan 3. + +--- + +## Spec ACs satisfied by this plan + +- Spec AC #4 (capacity gate enforces M of N at the limit under the canonical incident scenario) — **full** +- Spec AC #5 (CI fails when adding a new PM router adapter without PM-provider scope wrapping) — **full** +- Spec AC #6 (`pipeline-capacity-gate: PM provider unavailable` WARN → Sentry-captured error on routine path) — **full** +- Spec AC #10 (24h log volume of all three WARNs drops to <1/24h) — **partial chain** (this plan eliminates the `pipeline-capacity-gate: PM provider unavailable, allowing run` line; plans 1 and 3 each eliminate their own WARN. Final volume verification is post-deploy.) + +--- + +## Depends On + +None. Independent of plans 1 and 3 in this spec. + +--- + +## Detailed Task List (TDD) + +### 1. Shared `withPMScopeForDispatch` helper + +**Tests first** (`tests/unit/router/adapters/with-pm-scope-for-dispatch.test.ts` — new file): + +- `withPMScopeForDispatch — resolves the project's PM provider via manifest registry and runs dispatch inside withPMProvider scope` — unit — call helper with a project whose `pm.type === 'linear'`; pass a `dispatch` callback that internally calls `getPMProvider()`; assert `dispatch` is called once and the inner `getPMProvider()` returns the Linear provider (no throw). Expected red: `Error: Cannot find module '...with-pm-scope-for-dispatch.js'`. + +- `withPMScopeForDispatch — returns whatever dispatch returns (preserves TriggerResult passthrough)` — unit — pass a `dispatch` that returns `{ agentType: 'review', agentInput: {} }`; assert the helper's return value deep-equals that object. 
Why: the existing PM adapters return the dispatch result directly to their caller; the wrapping must not alter shape. Expected red: same module-not-found. + +- `withPMScopeForDispatch — when project's pm.type is not in the registry: throws with a clear error before dispatch fires` — unit — pass a project with `pm.type === 'asana'` (not registered); assert helper throws `Error('No PM manifest registered for type: asana')` (or similar, matching the manifest registry's existing missing-key error shape) and `dispatch` is NOT called. Why: failing here is preferable to dispatching with a missing scope, since the gate would then fail-closed and block legitimate runs. The adapter's caller (router) already handles dispatch errors. Expected red: same module-not-found. + +- `withPMScopeForDispatch — does not establish credential scope (that's the adapter's job)` — unit — assert helper does NOT call `withLinearCredentials` / `withTrelloCredentials` / `withJiraCredentials`. The credential scope is each adapter's responsibility; this helper layers PM-provider scope ON TOP. Expected red: passes if the implementation correctly avoids the credential-scope wrappers; fails if implementation accidentally double-wraps. + +**Implementation** (`src/router/adapters/_shared.ts` — new file or addition to existing _shared module): + +- Function signature: `function withPMScopeForDispatch(project: ProjectConfig, dispatch: () => Promise): Promise`. +- Body: resolve `pmProvider` via `pmRegistry.getOrThrow(project.pm.type)` (or equivalent — the manifest registry already exposes a getter). Wrap in `withPMProvider(pmProvider, dispatch)` from `src/pm/context.ts` and return its result. +- Do NOT add credential scoping; that's each adapter's concern. + +### 2. Migrate `src/router/adapters/linear.ts:dispatchWithCredentials` + +**Tests first** (`tests/unit/router/adapters/linear-dispatch-pm-scope.test.ts` — new file or extend existing): + +- `LinearRouterAdapter.dispatchWithCredentials — establishes both Linear credentials AND PM-provider scope before dispatch` — unit — set up a Linear-based project with credentials in DB; spy on the trigger handler's invocation to check that BOTH `getLinearCredentials()` AND `getPMProvider()` resolve successfully when called from inside the dispatched handler. Expected red: today, `getPMProvider()` throws `Error: No PMProvider in scope` because the dispatch is wrapped only in `withLinearCredentials`. + +- `LinearRouterAdapter.dispatchWithCredentials — returns whatever the trigger registry returns (no shape change)` — unit — mock the trigger registry to return a known `TriggerResult`; assert the adapter's return value deep-equals that. Regression pin against accidentally swallowing the result during the wrap migration. Expected red: test passes today (the existing flow returns the result), should continue to pass after migration — fails LOUDLY if the wrapping is implemented as a fire-and-forget. + +**Implementation** (`src/router/adapters/linear.ts:215-242`): +- Inside `dispatchWithCredentials`, after the existing `withLinearCredentials({ apiKey: linearCreds.apiKey }, ...)` call, replace the inner `() => triggerRegistry.dispatch(ctx)` with `() => withPMScopeForDispatch(fullProject, () => triggerRegistry.dispatch(ctx))`. +- Import the shared helper from `./_shared.js`. + +### 3. 
Migrate `src/router/adapters/trello.ts:dispatchWithCredentials` + +**Tests first** (`tests/unit/router/adapters/trello-dispatch-pm-scope.test.ts` — new file or extend existing): + +- `TrelloRouterAdapter.dispatchWithCredentials — establishes both Trello credentials AND PM-provider scope before dispatch` — unit — same shape as Linear's test. Expected red: same `No PMProvider in scope` throw. + +- `TrelloRouterAdapter.dispatchWithCredentials — returns the dispatch result unchanged` — unit — same shape. Expected red: regression pin. + +**Implementation** (`src/router/adapters/trello.ts:104-130`): +- Same pattern as Linear: wrap `() => triggerRegistry.dispatch(ctx)` inside `withPMScopeForDispatch(fullProject, ...)` inside the existing `withTrelloCredentials(...)`. + +### 4. Migrate `src/router/adapters/jira.ts:dispatchWithCredentials` + +**Tests first** (`tests/unit/router/adapters/jira-dispatch-pm-scope.test.ts` — new file or extend existing): + +- `JiraRouterAdapter.dispatchWithCredentials — establishes both JIRA credentials AND PM-provider scope before dispatch` — unit — same shape. Expected red: same `No PMProvider in scope` throw. + +- `JiraRouterAdapter.dispatchWithCredentials — returns the dispatch result unchanged` — unit — regression pin. + +**Implementation** (`src/router/adapters/jira.ts:104-132`): +- Same pattern. + +### 5. Capacity-gate fail-closed semantics + +**Tests first** (`tests/unit/triggers/shared/pipeline-capacity-gate.test.ts` — new or extend): + +- `shouldBlockForPipelineCapacity — when getPMProvider() throws (scope miss): returns true (block), logs at ERROR, captures Sentry under tag pipeline_capacity_gate_no_pm_provider` — unit — call the gate without setting up `withPMProvider` scope; mock `getPMProvider` to throw the existing `No PMProvider in scope` error; assert (a) returns `true`; (b) `logger.error` called; (c) `captureException` called with `tags: { source: 'pipeline_capacity_gate_no_pm_provider' }`. Expected red: today returns `false` and only logs at WARN; assertion `expected true to be false` (semantics flipped) is the headline failure. + +- `shouldBlockForPipelineCapacity — when PM provider IS in scope: runs the existing isActivePipelineOverCapacity check and returns its boolean result` — unit — set up scope; mock `isActivePipelineOverCapacity` to return `{ overCapacity: true, inFlightCount: 3, limit: 1 }`; assert gate returns `true`; mock to return `{ overCapacity: false, inFlightCount: 0, limit: 1 }`; assert gate returns `false`. Regression pin against accidentally breaking the routine path during the fail-closed conversion. Expected red: passes today on the positive paths; fails if the fail-closed change accidentally short-circuits the over-capacity branch. + +- `shouldBlockForPipelineCapacity — non-slot-consuming agent type (e.g. 'review'): early-returns false without consulting the provider` — unit — call gate with `agentType: 'review'`; assert returns `false` and `getPMProvider` is NOT called. Regression pin: the existing early-return for `!SLOT_CONSUMING_AGENTS.has(args.agentType)` must survive the migration. Expected red: passes today; fails if implementation removes the early return. 
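A sketch of the fail-closed test described in the first bullet above, assuming the gate's dependencies (`getPMProvider`, `logger`, `captureException`) are module-mocked in the test setup; the argument object passed to the gate is illustrative:

```ts
it('blocks, logs at ERROR, and captures Sentry when PM-provider scope is missing', async () => {
  vi.mocked(getPMProvider).mockImplementation(() => {
    throw new Error('No PMProvider in scope');
  });

  const blocked = await shouldBlockForPipelineCapacity({
    projectId: 'proj-1',
    workItemId: 'item-1',
    agentType: 'implementation',
    source: 'pm-status-changed', // illustrative trigger source
  });

  expect(blocked).toBe(true); // fail closed, not allow
  expect(logger.error).toHaveBeenCalled();
  expect(captureException).toHaveBeenCalledWith(
    expect.any(Error),
    expect.objectContaining({ tags: { source: 'pipeline_capacity_gate_no_pm_provider' } }),
  );
});
```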
+ +**Implementation** (`src/triggers/shared/pipeline-capacity-gate.ts:33-45`): +- Replace the `try { provider = getPMProvider(); } catch (err) { logger.warn(...); return false; }` block with: `try { provider = getPMProvider(); } catch (err) { logger.error('pipeline-capacity-gate: PM provider unavailable, blocking run', { source, projectId, workItemId, error: String(err) }); captureException(err, { tags: { source: 'pipeline_capacity_gate_no_pm_provider' }, extra: { projectId, workItemId, agentType, triggerSource: source } }); return true; }`. +- Keep the rest of the function (the `isActivePipelineOverCapacity` check and the existing `pipeline-at-capacity` info log) unchanged. + +### 6. Static guard: every PM router adapter establishes PM-provider scope + +**Tests first** (`tests/unit/integrations/pm-router-adapter-pm-scope.test.ts` — new file): + +- `every PM router adapter's dispatchWithCredentials path establishes PM-provider scope` — static — for each registered manifest, locate the adapter source file (Linear/Trello/JIRA today; future-extensible via a known src path convention or a registry export). Read the file source. Assert that within the `dispatchWithCredentials` method body, EITHER `withPMScopeForDispatch` is called OR `withPMProvider` is called directly. Expected red: today, none of the three PM router adapter files contains either string within `dispatchWithCredentials`; the static check would fail and report the missing wrapping per adapter. + + Note: a static-string check is intentionally chosen over runtime dependency injection because (a) the existing `trigger-event-consistency.test.ts` already establishes the file-level static-grep pattern, (b) runtime tests on each adapter cover the behavioral side (tasks 2-4), and (c) static guards are cheap and produce precise file:line failure messages. + +- `the static guard fails LOUDLY when a future PM router adapter omits the wrapping` — meta-test — synthesize a temporary fake adapter file that lacks either reference; run the guard against it; assert it fails with a message naming the missing reference. Expected red: depends on implementation of the meta-test; can be skipped if it adds too much complexity (the per-adapter unit tests in tasks 2-4 cover the same invariant from a different angle). + +**Implementation** (`tests/unit/integrations/pm-router-adapter-pm-scope.test.ts`): +- File pattern modeled on `trigger-event-consistency.test.ts`: iterate each adapter file under `src/router/adapters/{linear,trello,jira}.ts` (and future adapters via a glob or hardcoded list — the spec is small enough for the hardcoded list to be acceptable, with the comment that adding a new PM adapter requires adding it here too). +- For each, read the file source, locate the `dispatchWithCredentials` method (regex on `dispatchWithCredentials\\s*\\([^)]*\\)\\s*[^{]*\\{` and capture the body via brace-counting OR a simpler "the file mentions `withPMScopeForDispatch` OR `withPMProvider` somewhere within the file" heuristic — `/implement` chooses the simplest reliable pattern). +- Fail with a message like `expected ${file} to invoke withPMScopeForDispatch or withPMProvider; neither was found. The PM router adapter must wrap trigger dispatch in PM-provider AsyncLocalStorage scope.` + +### 7. Documentation update (CLAUDE.md) + +**Implementation** (`CLAUDE.md`): +- Add a small subsection under the existing "PM Integration Architecture" pointer (the line: `For adding a new PM provider, see @src/integrations/README.md`). 
The new content: + + > **Capacity-gate invariant.** Every PM router adapter must wrap `triggerRegistry.dispatch(ctx)` in PM-provider `AsyncLocalStorage` scope (use the shared `withPMScopeForDispatch` helper at `src/router/adapters/_shared.ts`). Without this, the pipeline-capacity gate at `src/triggers/shared/pipeline-capacity-gate.ts` cannot resolve the project's PM provider, fails closed (blocks the run) under the spec-017 fail-closed policy, and Sentry captures under tag `pipeline_capacity_gate_no_pm_provider`. The static guard at `tests/unit/integrations/pm-router-adapter-pm-scope.test.ts` enforces this at CI time. + +--- + +## Test Plan + +### Unit tests +- [ ] `tests/unit/router/adapters/with-pm-scope-for-dispatch.test.ts` (new): 4 tests — happy path, return-value passthrough, unregistered-pmType throw, no-credential-scope-double-wrap. +- [ ] `tests/unit/router/adapters/linear-dispatch-pm-scope.test.ts` (new): 2 tests — provider scope established, dispatch result preserved. +- [ ] `tests/unit/router/adapters/trello-dispatch-pm-scope.test.ts` (new): 2 tests — same shape. +- [ ] `tests/unit/router/adapters/jira-dispatch-pm-scope.test.ts` (new): 2 tests — same shape. +- [ ] `tests/unit/triggers/shared/pipeline-capacity-gate.test.ts` (new or extend): 3 tests — fail-closed-on-scope-miss, positive-path-works, non-slot-consuming-early-return. + +### Integration / static-guard tests +- [ ] `tests/unit/integrations/pm-router-adapter-pm-scope.test.ts` (new): 1-2 tests — static guard per registered adapter. + +### End-to-end / acceptance +- Spec AC #4 (canonical incident scenario reproduces clean) is covered by the combination of (a) the per-adapter scope-establishment tests in tasks 2-4, (b) the gate's positive-path test in task 5, and (c) the existing `isActivePipelineOverCapacity` unit tests for the in-flight count semantics. A full integration test that simulates "human moves N cards into TODO" is heavy and provides only marginal additional confidence over the unit-level coverage; defer unless `/implement` finds a gap. + +--- + +## Manual Verification (for `[manual]`-tagged ACs only) + +n/a — all ACs auto-tested. + +(Spec AC #10 has a post-deploy log-volume component, but the CI-verifiable proxy in this plan — "the `pipeline-capacity-gate: PM provider unavailable, allowing run` WARN message no longer fires on the routine path" — is testable via the migrated-adapter unit tests + the gate's fail-closed-on-scope-miss test. The 24h-volume measurement is operator-side after merge, tracked in the spec's overall AC #10 closure.) + +--- + +## Acceptance Criteria (per-plan, testable) + +1. The shared `withPMScopeForDispatch` helper exists, resolves PM providers via the manifest registry, and wraps a passed dispatch callback in `withPMProvider` scope. +2. The Linear, Trello, and JIRA router adapters each consume the shared helper inside their `dispatchWithCredentials` method (in addition to their existing per-provider credential scope). +3. `getPMProvider()` succeeds inside any trigger handler dispatched from any of the three PM router adapters. +4. The capacity-gate's fail-closed branch (PM-provider scope miss): emits ERROR-level log, captures to Sentry under stable tag `pipeline_capacity_gate_no_pm_provider`, returns `true` (block). The previous `WARN + return false` (allow) behavior is removed. +5. The capacity-gate's positive path (PM provider in scope) still calls `isActivePipelineOverCapacity` and returns its boolean result unchanged. 
The non-slot-consuming early-return (for non-`implementation` agents) is preserved unchanged. +6. The static guard at `tests/unit/integrations/pm-router-adapter-pm-scope.test.ts` asserts every PM router adapter's `dispatchWithCredentials` references either the shared helper or `withPMProvider` directly. CI fails when a new PM router adapter is added without the wrapping. +7. `CLAUDE.md` documents the capacity-gate invariant and points at the static guard as the regression net. +8. All new/modified code has corresponding tests. +9. `npm run typecheck` passes. +10. `npm test` (full unit suite) passes. +11. `npm run lint` passes. + +--- + +## Documentation Impact (this plan only) + +| File | Change | +|---|---| +| `CLAUDE.md` | Add "Capacity-gate invariant" subsection under the existing "PM Integration Architecture" pointer. Document the wrap-in-PM-provider-scope rule, the fail-closed policy, the Sentry tag, and the static guard. | +| `CHANGELOG.md` | Entry: "Pipeline-capacity gate now enforces `maxInFlightItems` for PM `status-changed` triggers (Linear/Trello/JIRA). Previously the gate was silently no-op on the PM-source path due to a missing AsyncLocalStorage wrapping in the PM router adapters." | + +--- + +## Out of Scope (this plan) + +- Failure mode A (PM-ack dispatch coverage) — Plan 1. +- Failure mode C (progress-comment double-delete) — Plan 3. +- Reworking the GitHub router adapter's `dispatchWithCredentials` (already correct; the new shared helper is offered for reuse but the GitHub adapter doesn't have to migrate to it in this plan — that would be a follow-up cleanup). +- Reworking `isActivePipelineOverCapacity` itself or the in-flight counting query. The gate's logic is correct; the bug is that it never runs in scope. +- The `maxInFlightItems` configuration UX or per-project capacity-cap defaults. +- Any non-`implementation` agent type's gating policy. Only `implementation` is slot-consuming today; the early-return for other types is preserved. +- Stray org-level webhooks against unprovisioned repos (operator-side cleanup, see spec Out-of-Scope list). + +--- + +## Progress + + +- [ ] AC #1 +- [ ] AC #2 +- [ ] AC #3 +- [ ] AC #4 +- [ ] AC #5 +- [ ] AC #6 +- [ ] AC #7 +- [ ] AC #8 +- [ ] AC #9 +- [ ] AC #10 +- [ ] AC #11 diff --git a/docs/plans/017-router-silent-failure-hardening/3-progress-comment-double-delete.md b/docs/plans/017-router-silent-failure-hardening/3-progress-comment-double-delete.md new file mode 100644 index 00000000..334b7021 --- /dev/null +++ b/docs/plans/017-router-silent-failure-hardening/3-progress-comment-double-delete.md @@ -0,0 +1,210 @@ +--- +id: 017 +slug: router-silent-failure-hardening +plan: 3 +plan_slug: progress-comment-double-delete +level: plan +parent_spec: docs/specs/017-router-silent-failure-hardening.md +depends_on: [] +status: pending +--- + +# 017/3: Progress-comment double-delete race + +> Part 3 of 3 in the 017-router-silent-failure-hardening plan. See [parent spec](../../specs/017-router-silent-failure-hardening.md). + +## Summary + +This plan fixes failure mode C from spec 017: the post-agent success hook at `src/triggers/github/ack-comments.ts:23-58` (`deleteProgressCommentOnSuccess`) issues a redundant DELETE against the GitHub comments API for a comment that an in-run gadget has already deleted. The hook reads `sessionState.initialCommentId` (which the gadget at `src/gadgets/sessionState.ts:169-183` sets to `null` after its own successful delete) and falls back to `result.agentInput.ackCommentId` when session state is empty. 
Because "session state cleared by the gadget" is indistinguishable from "session state never populated", the fallback fires and re-deletes the already-gone comment. GitHub returns 404, `WARN: Failed to delete progress comment after agent success` is logged. 72 occurrences in 24h on production cascade-router, all functionally harmless (the comment IS deleted, just by the gadget rather than the hook), but the log noise dominates WARN volume and obscures real failures. + +The fix introduces an explicit boolean flag `initialCommentIdConsumed: boolean` on session state. The gadget's `deleteInitialComment` sets the flag to `true` after a successful delete (whether GitHub returns 200 or 404). The post-agent hook reads the flag first; if `true`, it skips entirely. If `false`, the existing fallback chain (session state → agent-input copy) still works for legacy paths that never populated session state. As a defense-in-depth measure, the GitHub client's `deletePRComment` is taught to treat HTTP 404 as success-equivalent (logging a single DEBUG line instead of letting the error bubble as a WARN). Other HTTP failures (5xx, auth, throttling) continue to log at WARN. The 404-on-DELETE downgrade aligns with RFC-7231 idempotency baseline behavior and standard practice in cloud SDKs. + +This plan is independent of plans 1 and 2 — they touch different files and address different failure modes. Sequencing is the implementer's call. + +**Components delivered:** +- New `initialCommentIdConsumed: boolean` field on `SessionStateData` in `src/gadgets/sessionState.ts`. Default `false`. +- Updated `SessionState.deleteInitialComment` in `src/gadgets/sessionState.ts:169-183` to set the consumed flag after a successful delete (200 or 404). +- Updated `deleteProgressCommentOnSuccess` in `src/triggers/github/ack-comments.ts:23-58` to gate the entire deletion path (including the agent-input fallback) on `!consumed`. +- Idempotent `deletePRComment` in `src/github/client.ts:235`: HTTP 404 logs at DEBUG and returns successfully; other HTTP errors continue to throw. +- No documentation impact beyond the CHANGELOG entry (lifecycle detail is plan-level, per spec 017's doc-impact rubric). + +**Deferred to later plans in this spec:** +- Failure mode A (PM-ack dispatch coverage) — Plan 1. +- Failure mode B (capacity-gate PM scope) — Plan 2. + +--- + +## Spec ACs satisfied by this plan + +- Spec AC #7 (no double-delete after gadget-cleared comment) — **full** +- Spec AC #8 (legacy fallback still works for paths that never populated session state) — **full** +- Spec AC #9 (404-on-DELETE downgraded to DEBUG, single Sentry breadcrumb preserved) — **full** +- Spec AC #10 (24h log volume of all three WARNs drops to <1/24h) — **partial chain** (this plan eliminates the `Failed to delete progress comment after agent success` line; plans 1 and 2 each eliminate their own WARN. Final volume verification is post-deploy.) + +--- + +## Depends On + +None. Independent of plans 1 and 2 in this spec. + +--- + +## Detailed Task List (TDD) + +### 1. Session-state `initialCommentIdConsumed` flag + +**Tests first** (`tests/unit/gadgets/sessionState.test.ts` — new or extend existing): + +- `SessionStateData — fresh state has initialCommentIdConsumed: false` — unit — instantiate via `createSessionState()`; read via `getSessionState()`; assert `initialCommentIdConsumed === false`. Expected red: `AssertionError: expected undefined to be false` (the field doesn't exist yet). 
+ +- `setInitialCommentId — does NOT flip the consumed flag` — unit — call `setInitialCommentId(123)`; assert `getSessionState().initialCommentId === 123` AND `getSessionState().initialCommentIdConsumed === false`. Why: setting an id is "we now have an active comment", not "we've already disposed of it". Expected red: same field-doesn't-exist failure. + +- `deleteInitialComment — on success: clears initialCommentId to null AND sets initialCommentIdConsumed to true` — unit — set id to `123`; mock `githubClient.deletePRComment` to resolve successfully; call `deleteInitialComment('owner', 'repo')`; assert state is `{ initialCommentId: null, initialCommentIdConsumed: true }`. Expected red: `AssertionError: expected undefined to be true` (no consumed flag set). + +- `deleteInitialComment — on HTTP 404 (idempotent): clears AND sets consumed (the comment is gone either way)` — unit — set id to `123`; mock `deletePRComment` to throw a 404-shaped error; call `deleteInitialComment`; assert state is `{ initialCommentId: null, initialCommentIdConsumed: true }`. Why: a 404 means someone else (e.g. a user) already deleted it; functionally equivalent to success. Expected red: `AssertionError: expected initialCommentId to be null` — today the implementation restores the id on any catch. + +- `deleteInitialComment — on other HTTP error (e.g. 5xx): does NOT set consumed; restores initialCommentId so the post-agent hook can retry` — unit — set id to `123`; mock `deletePRComment` to throw a 500; call `deleteInitialComment`; assert state is `{ initialCommentId: 123, initialCommentIdConsumed: false }`. Why: a transient failure should leave the post-agent hook authorized to try again. Expected red: today the implementation already restores the id on any catch; this test pins the existing positive behavior plus the new "consumed flag stays false" assertion. The latter half is the real red. + +- `deleteInitialComment — when initialCommentId is already null: no-ops without flipping consumed` — unit — start with `{ initialCommentId: null, initialCommentIdConsumed: false }`; call `deleteInitialComment`; assert state is unchanged. Regression pin against accidentally setting consumed when there was nothing to consume. Expected red: passes today (the existing `if (!commentId) return;` short-circuit covers this); fails if implementation accidentally moves the consumed-flip outside that guard. + +**Implementation** (`src/gadgets/sessionState.ts`): + +- Extend `SessionStateData` interface with `initialCommentIdConsumed: boolean`. +- Initialize to `false` in the default `state` object. +- Update `deleteInitialComment(owner, repo)`: + - Existing logic: if `!commentId` return early. + - Existing logic: clear `state.initialCommentId = null` BEFORE calling delete. + - On successful delete OR 404 response: set `state.initialCommentIdConsumed = true`. + - On other failure: restore `state.initialCommentId = commentId` (existing behavior); do NOT set the consumed flag. +- Implementation hint: distinguish 404 from other errors via the GitHub client's error shape (Octokit's `RequestError.status === 404`), OR by relying on Plan 3's task 4 (`deletePRComment` itself catching 404 and returning success). The latter approach is cleaner — once `deletePRComment` no longer throws on 404, the gadget's existing try/catch naturally treats 404 as success and the consumed flag flips. + +### 2. 
Post-agent hook gates fallback on consumed flag + +**Tests first** (`tests/unit/triggers/github/ack-comments.test.ts` — new or extend existing): + +- `deleteProgressCommentOnSuccess — when initialCommentIdConsumed: skips entirely (no DELETE issued, no fallback to agentInput.ackCommentId)` — unit — set session state to `{ initialCommentId: null, initialCommentIdConsumed: true }`; pass a `result` whose `agentInput.ackCommentId === 4341389855`; spy on `githubClient.deletePRComment`; call hook; assert `deletePRComment` was NOT called. Expected red: today the hook reads `initialCommentId ?? agentInput.ackCommentId ?? null`, sees null in session state, falls through to the agentInput value, and DELETES. The assertion `expected deletePRComment to not be called` fails with the actual call args. + +- `deleteProgressCommentOnSuccess — when not consumed and session state has initialCommentId: deletes that id` — unit — set state to `{ initialCommentId: 123, initialCommentIdConsumed: false }`; call hook; assert `deletePRComment(owner, repo, 123)` is called. Regression pin: the hook's primary path still works. Expected red: passes today; ensures we don't accidentally break the primary path during the gating change. + +- `deleteProgressCommentOnSuccess — when not consumed and session state is empty but agentInput has ackCommentId: deletes via the legacy fallback (Spec AC #8 regression pin)` — unit — set state to `{ initialCommentId: null, initialCommentIdConsumed: false }`; pass result with `agentInput.ackCommentId === 999`; assert `deletePRComment(owner, repo, 999)` is called. Why: paths that never populate session state continue to work. Expected red: passes today; pins the existing legacy-path behavior so the gating change doesn't break it. + +- `deleteProgressCommentOnSuccess — when consumed and session state empty and agentInput has ackCommentId: skips (the consumed flag wins over the legacy fallback)` — unit — set state to `{ initialCommentId: null, initialCommentIdConsumed: true }`; pass result with `agentInput.ackCommentId === 999`; assert `deletePRComment` was NOT called. This is the headline regression pin for the bug class. Expected red: today the hook deletes the agentInput value. Assertion `expected deletePRComment to not be called` fails with the actual call. + +- `deleteProgressCommentOnSuccess — implementation agent: still no-ops (existing exclusion preserved)` — unit — pass `result.agentType === 'implementation'`; assert `deletePRComment` not called. Regression pin against the existing `if (result.agentType === 'implementation') return;` short-circuit. Expected red: passes today. + +- `deleteProgressCommentOnSuccess — PM-focused agent (e.g. backlog-manager): still no-ops (existing exclusion preserved)` — unit — pass `result.agentType === 'backlog-manager'`; mock `isPMFocusedAgent` to return true; assert `deletePRComment` not called. Regression pin. Expected red: passes today. + +**Implementation** (`src/triggers/github/ack-comments.ts:23-58`): +- Inside `deleteProgressCommentOnSuccess`, after the existing `isPMFocusedAgent` and `parseRepoFullName` short-circuits, read `sessionState.initialCommentIdConsumed`. +- If `consumed === true`: return early (no delete, no log). +- Otherwise: existing behavior — fall through to the `initialCommentId ?? agentInput.ackCommentId ?? null` resolution and call `safeOperation(() => githubClient.deletePRComment(...))`. + +### 3. 
(covered as part of task 1 — gadget consume-flag flip) + +(no separate task — combined into task 1's `deleteInitialComment` update.) + +### 4. Idempotent `deletePRComment` in GitHub client (404 → DEBUG) + +**Tests first** (`tests/unit/github/client.test.ts` — new or extend existing — likely the existing client test suite): + +- `deletePRComment — on HTTP 200: returns successfully without logging` — unit — mock Octokit's `issues.deleteComment` to resolve `{ status: 204 }` (GitHub's actual success status for delete-issue-comment); assert function returns; assert no logger calls fired. Regression pin: the success path is unchanged. Expected red: passes today. + +- `deletePRComment — on HTTP 404: returns successfully, logs at DEBUG with commentId, does NOT throw` — unit — mock Octokit to throw a `RequestError`-shaped object with `status === 404`; assert function returns (no throw propagates); assert `logger.debug` called once with the comment id in the structured payload; assert `logger.warn` and `logger.error` NOT called. Expected red: today the function lets the error propagate; assertion `expected function to not throw` fails with the actual error. + +- `deletePRComment — on HTTP 5xx: throws (existing behavior preserved)` — unit — mock Octokit to throw a `RequestError` with `status === 503`; assert function throws. Regression pin: real failures still bubble. Expected red: passes today. + +- `deletePRComment — on HTTP 401: throws (existing behavior preserved)` — unit — same shape with 401; assert throws. Regression pin against accidentally swallowing auth failures. Expected red: passes today. + +- `deletePRComment — on network error (no HTTP status): throws (existing behavior preserved)` — unit — mock Octokit to throw a plain `Error` without a `status` field; assert throws. Regression pin against the 404-catch swallowing non-HTTP errors. Expected red: passes today. + +**Implementation** (`src/github/client.ts:235`): +- Wrap the `getClient().issues.deleteComment(...)` call in a try/catch. +- In the catch: if the error has `status === 404` (Octokit's `RequestError` shape), call `logger.debug('GitHub progress comment already deleted (404 on DELETE)', { commentId, owner, repo })` and return without rethrowing. +- For any other error (including network errors with no status, 5xx, 4xx other than 404, 401): rethrow as-is. The caller's existing `safeOperation` wrapper at `src/triggers/github/ack-comments.ts:54` continues to log other failures at WARN. +- Do NOT add a generic retry loop; the 404 downgrade is the only behavioral change. + +### 5. Integration: gadget-then-hook flow does not 404 + +**Tests first** (`tests/unit/gadgets/sessionState-and-ack-comments.integration.test.ts` — new file, optional — the per-component tests above already pin the headline behavior; this is belt-and-suspenders): + +- `gadget mid-run delete + post-agent hook: exactly one DELETE is issued, the second invocation is suppressed by the consumed flag` — unit-level integration — set up a fresh session state; call `setInitialCommentId(456)`; call gadget's `deleteInitialComment('owner', 'repo')` (mock `deletePRComment` to resolve); call hook's `deleteProgressCommentOnSuccess` with a `result` whose `agentInput.ackCommentId === 456`; assert `deletePRComment` was called exactly ONCE total. Expected red: today, twice. + + Optional. The per-component tests already cover this end-to-end. + +**Implementation** — n/a, this test exercises the seam between the two components without new code. 
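Taken together, the changes in tasks 1, 2 and 4 might look roughly like the sketch below. The session-state accessors, the hook's exact signature, and the owner/repo resolution are assumptions drawn from the file references above; the flag semantics and the 404 handling are the point.

```typescript
// --- src/gadgets/sessionState.ts (sketch) ---
interface SessionStateData {
  initialCommentId: number | null;
  initialCommentIdConsumed: boolean; // new flag, defaults to false
  // ...existing fields unchanged
}

export async function deleteInitialComment(owner: string, repo: string): Promise<void> {
  const commentId = state.initialCommentId;
  if (!commentId) return;            // nothing to consume; flag stays false
  state.initialCommentId = null;     // existing behavior: clear before deleting
  try {
    await githubClient.deletePRComment(owner, repo, commentId); // 404 no longer throws (see client sketch)
    state.initialCommentIdConsumed = true; // comment is gone; the post-agent hook must skip
  } catch (err) {
    state.initialCommentId = commentId; // transient failure: leave the hook authorized to retry
  }
}

// --- src/triggers/github/ack-comments.ts (sketch) ---
export async function deleteProgressCommentOnSuccess(result: AgentResult): Promise<void> {
  if (result.agentType === 'implementation') return; // existing exclusions preserved
  if (isPMFocusedAgent(result.agentType)) return;
  const repo = parseRepoFullName(result.agentInput.repoFullName); // field name assumed
  if (!repo) return;
  const { initialCommentId, initialCommentIdConsumed } = getSessionState();
  if (initialCommentIdConsumed) return; // gadget already disposed of the comment; skip even the fallback
  const commentId = initialCommentId ?? result.agentInput.ackCommentId ?? null; // legacy fallback preserved
  if (!commentId) return;
  await safeOperation(() => githubClient.deletePRComment(repo.owner, repo.name, commentId));
}

// --- src/github/client.ts (sketch) ---
export async function deletePRComment(owner: string, repo: string, commentId: number): Promise<void> {
  try {
    await getClient().issues.deleteComment({ owner, repo, comment_id: commentId });
  } catch (err) {
    if ((err as { status?: number }).status === 404) {
      // Already deleted by any path: success-equivalent under RFC-7231 idempotency.
      logger.debug('GitHub progress comment already deleted (404 on DELETE)', { commentId, owner, repo });
      return;
    }
    throw err; // 5xx, 401, and network errors keep bubbling to the caller's WARN
  }
}
```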
+ +--- + +## Test Plan + +### Unit tests +- [ ] `tests/unit/gadgets/sessionState.test.ts` (new or extend): 6 tests — fresh-state default, set-id-doesn't-flip, delete-success-flips, delete-404-flips, delete-other-error-doesn't-flip, no-id-noop. +- [ ] `tests/unit/triggers/github/ack-comments.test.ts` (new or extend): 6 tests — consumed-skips, primary-path, legacy-fallback, consumed-trumps-fallback, implementation-agent-noop, PM-focused-agent-noop. +- [ ] `tests/unit/github/client.test.ts` (new or extend): 5 tests — 200-silent, 404-DEBUG-no-throw, 5xx-throws, 401-throws, network-error-throws. + +### Integration / static-guard tests +- [ ] `tests/unit/gadgets/sessionState-and-ack-comments.integration.test.ts` (optional): 1 test — gadget-then-hook produces exactly one DELETE. + +### End-to-end / acceptance +- Spec ACs #7-#9 are fully covered by the unit tests above. AC #10's CI-verifiable proxy is the absence of any path in the test suite that produces a `Failed to delete progress comment after agent success` WARN under the gadget-then-hook flow. + +--- + +## Manual Verification (for `[manual]`-tagged ACs only) + +n/a — all ACs auto-tested. + +(Spec AC #10 has a post-deploy log-volume component, but the CI-verifiable proxy in this plan — "no `Failed to delete progress comment after agent success` WARN is emitted on the routine gadget-then-hook path" — is testable via the per-component and the integration tests. The 24h-volume measurement is operator-side after merge, tracked in the spec's overall AC #10 closure.) + +--- + +## Acceptance Criteria (per-plan, testable) + +1. `SessionStateData` has an `initialCommentIdConsumed: boolean` field. Default `false`. Reset semantics match the existing session-state lifecycle. +2. The gadget's `deleteInitialComment` sets `initialCommentIdConsumed = true` on successful delete OR HTTP 404 response. On other failures, the flag stays `false` (allowing the post-agent hook to retry). +3. The post-agent hook `deleteProgressCommentOnSuccess` skips entirely when `initialCommentIdConsumed === true`, regardless of whether `agentInput.ackCommentId` is populated. +4. The post-agent hook's legacy fallback to `agentInput.ackCommentId` still works for code paths that never populate session state (regression pin: paths where session state was never set continue to delete via the fallback). +5. The GitHub client's `deletePRComment` treats HTTP 404 as success: returns without throwing, logs at DEBUG with the comment id. Other HTTP errors (5xx, 401, 4xx-other-than-404) and non-HTTP errors continue to throw. +6. After the fixes, the routine gadget-then-hook flow produces exactly one DELETE (via the gadget). The post-agent hook does not issue a redundant call. +7. All new/modified code has corresponding tests. +8. `npm run typecheck` passes. +9. `npm test` (full unit suite) passes. +10. `npm run lint` passes. + +--- + +## Documentation Impact (this plan only) + +| File | Change | +|---|---| +| `CHANGELOG.md` | Entry: "Progress-comment lifecycle: post-agent cleanup hook now skips when an in-run gadget has already deleted the comment, eliminating ~72 redundant DELETEs (and 404 WARN log entries) per day on cascade-router. The GitHub client's `deletePRComment` also treats HTTP 404 as success, providing defense in depth." | + +(No `CLAUDE.md` or `src/integrations/README.md` change. Lifecycle detail is plan-level per the spec's doc-impact rubric.) + +--- + +## Out of Scope (this plan) + +- Failure mode A (PM-ack dispatch coverage) — Plan 1. 
+- Failure mode B (capacity-gate PM scope) — Plan 2. +- Removing the legacy `agentInput.ackCommentId` fallback entirely. The fallback is preserved (gated on `!consumed`) so older code paths that never populated session state continue to work. +- Reworking the gadget's session-state API surface beyond adding the consumed flag. The existing `deleteInitialComment` keeps its signature. +- Generalising the 404-as-success-on-DELETE policy to other HTTP DELETE call sites (e.g. PM ack delete in `src/router/acknowledgments.ts`). Only `deletePRComment` is migrated here. Other DELETE call sites get the same treatment in a separate spec if needed. +- Retry behavior for non-404 errors. The fix is "404 is success", not "5xx is retried". +- Stray org-level webhooks against unprovisioned repos (operator-side cleanup, see spec Out-of-Scope list). + +--- + +## Progress + + +- [ ] AC #1 +- [ ] AC #2 +- [ ] AC #3 +- [ ] AC #4 +- [ ] AC #5 +- [ ] AC #6 +- [ ] AC #7 +- [ ] AC #8 +- [ ] AC #9 +- [ ] AC #10 diff --git a/docs/plans/017-router-silent-failure-hardening/_coverage.md b/docs/plans/017-router-silent-failure-hardening/_coverage.md new file mode 100644 index 00000000..9d59bebe --- /dev/null +++ b/docs/plans/017-router-silent-failure-hardening/_coverage.md @@ -0,0 +1,54 @@ +# Coverage map for spec 017-router-silent-failure-hardening + +Auto-generated by /plan. Tracks which plans satisfy which spec ACs. + +## Spec ACs + +| # | Spec AC (short) | Satisfied by | Status | +|---|---|---|---| +| 1 | Linear-based PM-focused agent posts visible ack comment | plan 1 (pm-ack-coverage) | full | +| 2 | CI fails when adding a PM provider not wired through dispatch | plan 1 | full | +| 3 | `Unknown PM type` WARN → Sentry-captured error on routine path | plan 1 | full | +| 4 | Capacity gate enforces M of N at the limit (canonical incident) | plan 2 (capacity-gate-pm-scope) | full | +| 5 | CI fails when a new PM router adapter omits PM-provider scope wrap | plan 2 | full | +| 6 | `pipeline-capacity-gate: PM provider unavailable` WARN → Sentry | plan 2 | full | +| 7 | No double-delete on gadget-cleared progress comment | plan 3 (progress-comment-double-delete) | full | +| 8 | Legacy fallback (no session state) still works | plan 3 | full | +| 9 | 404-on-DELETE downgraded to DEBUG, breadcrumb preserved | plan 3 | full | +| 10 | 24h log volume of all three WARNs drops below 1/24h on prod | plan 1 + plan 2 + plan 3 | partial chain (post-deploy) | + +## Coverage summary + +- **10 spec ACs** mapped to **3 plans** +- **9 spec ACs** with full single-plan coverage +- **1 spec AC** (#10) with partial-chain coverage across all three plans — each plan contributes the elimination of its own WARN line on the routine path; final 24h-volume verification is operator-side after deploy. The CI-verifiable proxy in each plan is "this plan's WARN message no longer fires under standard test fixtures". + +## Plan dependency graph + +``` +plan 1 (pm-ack-coverage) independent +plan 2 (capacity-gate-pm-scope) independent +plan 3 (progress-comment-double-delete) independent +``` + +The three failure modes were chosen for decomposition precisely because they touch disjoint code areas, address independent failure modes, and can be reverted independently. Sequencing is the implementer's call. 
Recommended order if shipping serially: plan 3 first (lowest risk, smallest blast radius — pure log-noise reduction), plan 1 second (medium risk, restores missing UX feedback on Linear), plan 2 last (medium risk, the only behavioral change is fail-closed semantics on a previously-failing-open gate). All three can also ship in any other order; none blocks another. + +## Documentation impact distribution + +| Spec doc | Plan | What gets added | +|---|---|---| +| `src/integrations/README.md` | plan 1 | "PM-ack dispatch coverage" subsection under "Conformance harness — what CI enforces". Documents: every registered manifest must be reachable from the consolidated dispatch helper; dispatch must not branch on `pmType` literal strings; conformance harness is the regression net. | +| `CLAUDE.md` | plan 2 | "Capacity-gate invariant" subsection under the existing "PM Integration Architecture" pointer. Documents: every PM router adapter must wrap dispatch in PM-provider `AsyncLocalStorage` scope; fail-closed policy on scope miss; static guard at `tests/unit/integrations/pm-router-adapter-pm-scope.test.ts`. | +| `CHANGELOG.md` | plan 1 | "PM-ack dispatch consolidation: Linear-based PM-focused agents now post their PM-side ack comment via a registry-consuming helper. Eliminates the silent skip on Linear projects." | +| `CHANGELOG.md` | plan 2 | "Pipeline-capacity gate now enforces `maxInFlightItems` for PM `status-changed` triggers (Linear/Trello/JIRA). Previously the gate was silently no-op on the PM-source path." | +| `CHANGELOG.md` | plan 3 | "Progress-comment lifecycle: post-agent cleanup hook now skips when an in-run gadget has already deleted the comment. The GitHub client's `deletePRComment` also treats HTTP 404 as success, providing defense in depth." | + +## Sentry tags introduced + +Following the spec-015 `wedged_lock_canary` precedent, three new stable Sentry tags are introduced (one per plan), all under `tags.source`: + +| Tag | Plan | When it fires | +|---|---|---| +| `pm_ack_unknown_pm_type` | plan 1 | A project's `pm.type` is not in the manifest registry. Indicates configuration error. | +| `pipeline_capacity_gate_no_pm_provider` | plan 2 | The capacity gate cannot resolve a PM provider in scope after Plan 2's adapter wrapping is in place. Indicates a real `AsyncLocalStorage` scope leak in a code path other than the routine PM-router-adapter dispatch. | +| _(none)_ | plan 3 | Plan 3 does not introduce a new Sentry tag — its 404-on-DELETE branch logs at DEBUG, not Sentry-capture, since 404 is success-equivalent under RFC-7231 idempotency. | diff --git a/docs/specs/017-router-silent-failure-hardening.md b/docs/specs/017-router-silent-failure-hardening.md new file mode 100644 index 00000000..a543a43a --- /dev/null +++ b/docs/specs/017-router-silent-failure-hardening.md @@ -0,0 +1,182 @@ +--- +id: 017 +slug: router-silent-failure-hardening +level: spec +title: Router-side silent-failure hardening +created: 2026-04-29 +status: draft +--- + +# 017: Router-side silent-failure hardening + +## Problem & Motivation + +A 24-hour audit of cascade-router production logs and webhook decision history on 2026-04-29 surfaced three distinct silent-failure modes producing 128 combined WARN entries per day with no functional escalation to operators. 
Each represents a different class of degradation — missing UX feedback on the PM card, a security-gate bypass, log noise that masks real signal — but they share a structural pattern: the router code path "succeeds" from its caller's perspective while a behavioral guarantee is silently broken, and the operator's only signal is a WARN line buried in normal traffic. + +The audit was triggered by the same investigation that produced PR #1220 (`respond-to-review` and `respond-to-pr-comment` runs persisting NULL `agent_runs.work_item_id`, hiding them from the dashboard's work-item page). After fixing that bug, a sweep of the previous 24 hours of cascade-router logs surfaced these three additional silent failures, each independently actionable and each rooted in either parallel-path drift between two helpers that should have been one, or a missing wrapper around an `AsyncLocalStorage` scope. + +This spec consolidates the three fixes under one motivation. They are independent enough to ship as separate plans/PRs but related enough that solving them together — with shared regression-prevention strategy and shared "silent fail no longer permitted" policy — is cheaper than three separate spec rounds. + +### Failure mode A — Linear PM-ack silently skipped on PM-focused agents + +24 occurrences in 24h. PM-focused agents (e.g. `backlog-manager`) triggered from a GitHub webhook need to post their acknowledgment comment to the PM tool, not the GitHub PR. The router-side helper that dispatches that ack handles only Trello and JIRA; for Linear-based projects, control flow falls through to an "unknown PM type" branch, no ack is posted, and a `WARN: Unknown PM type for PM-focused agent ack, skipping` is logged. A parallel helper elsewhere in the codebase already has the Linear branch — this is missed-migration drift, the same shape of bug as PR #1220. User-visible impact: Linear-based projects (the most active being `ucho`) never see the "agent is working on it" comment that Trello/JIRA-based projects get on the same agent type. The silent skip masks any future case where a PM type is added to the registry without being wired through every ack-posting site. + +### Failure mode B — Pipeline-capacity gate fails open on every PM `status-changed` trigger + +32 occurrences in 24h. The pipeline-capacity gate is a hard cap on the active pipeline (TODO + IN_PROGRESS + IN_REVIEW work items) introduced in response to a prior incident where three implementation runs fired concurrently against a project pinned at `maxInFlightItems: 1` after a human moved three cards into TODO simultaneously. The gate calls a PM-provider getter from within `AsyncLocalStorage` to count in-flight items; on scope miss it conservatively returns "allow" and emits `WARN: pipeline-capacity-gate: PM provider unavailable, allowing run`. Today, that conservative branch fires on every PM-source `status-changed` trigger because the PM router adapters wrap dispatch in PM credentials but not in the PM-provider scope. The GitHub router adapter does both correctly. So the gate's only consumer dispatches outside the scope it requires, the gate is silently no-op for the only triggers that need it, and `maxInFlightItems` is essentially unenforced for the entire PM-source path. User-visible impact: a human moving N work items into TODO simultaneously still produces N parallel implementation runs, exactly the scenario the gate was added to prevent. + +### Failure mode C — Progress-comment double-delete race produces 404 log spam + +72 occurrences in 24h. 
The router's post-agent success hook deletes the GitHub progress/ack comment for non-implementation agents. Separately, in-run gadgets (notably the create-PR-review gadget) delete the same comment mid-run when they have a more contextually-appropriate moment to clean it up, then clear the comment id from session state. The post-agent hook reads session state but falls back to the agent-input copy of the comment id when session state is empty — and "session state cleared by the gadget" is indistinguishable from "session state never populated", so the fallback fires and re-deletes the already-deleted comment. GitHub returns 404, `WARN: Failed to delete progress comment after agent success` is logged, and the operator triaging real failures has 72 false-positive entries to filter out per day. There is no functional impact (the comment IS deleted, just by the gadget rather than the hook), but the log noise dominates WARN volume on cascade-router and obscures the failures that DO matter. + +--- + +## Goals + +1. PM-focused agents triggered from a GitHub webhook post their acknowledgment comment on the PM-side work item for every PM type registered in the manifest registry — Linear included, plus any future provider — without per-PM-type literal-string branching in the dispatch helper. +2. The pipeline-capacity gate enforces `maxInFlightItems` for PM `status-changed` triggers exactly as it does for GitHub-source triggers today: when a human moves N cards into TODO simultaneously and the project's cap is M, no more than M implementation runs proceed. +3. After an in-run gadget deletes the progress comment and signals consumption, the post-agent success hook does not issue a redundant DELETE against GitHub for the same comment id; 24h log volume of `Failed to delete progress comment after agent success` drops to near-zero under normal operation. +4. Each of the three failure modes converts from "silent WARN that operators learn to ignore" to "loud Sentry-captured error that operators can act on" once the routine path is fixed, so the WARN-vs-real-failure ratio in cascade-router logs drops to a level where operator triage stays fast. +5. A static guard or conformance-harness extension prevents future PM provider additions or future PM router-adapter additions from regressing any of the three invariants without a CI failure. + +--- + +## Non-goals + +- Migrating any parallel-helper code paths beyond the two specifically identified in failure mode A. If a third-instance audit surfaces another duplicate dispatch helper, track separately. +- Refactoring the GitHub router adapter's existing `dispatchWithCredentials` shape. The wrapping there is correct and load-bearing. The shared helper introduced for PM adapters is additive and does not reshape the GitHub side. +- Changing the user-visible message content or formatting of PM-side ack comments. The fix is "the comment is posted at all on Linear" — message wording is unchanged. +- Adding a dashboard view of router log noise or a generalized log-noise budget. Operator-side observability tooling is a separate effort. +- Backfilling Linear-side ack comments for past PM-focused agent runs. Runs that already finished without their ack comment stay finished. +- Reworking the capacity-gate threshold itself or the in-flight counting query. The gate's logic is correct; the bug is that it never runs in scope. +- Tightening the warn-vs-error policy for unrelated WARN call sites in cascade-router. Only the three identified here are in scope. 
+ +--- + +## Constraints + +- The PM-ack consolidation must not change the wire shape of the manifest registry's `platformClientFactory` or any provider's adapter API. Dispatch consumes those interfaces; adapters do not change. +- The capacity-gate adapter wrapping must not introduce a per-request HTTP round-trip to resolve the PM provider. Provider resolution happens in process via the manifest registry; the wrapping just establishes `AsyncLocalStorage` scope for the duration of trigger dispatch. +- The progress-comment lifecycle change must be backward-compatible with paths that never populate session state — the legacy fallback to the agent-input copy of the comment id continues to work in those cases. +- All three fixes ship as independent plans/PRs that can be merged in any order. None blocks another. +- Sentry capture additions follow the existing tag-naming conventions on cascade-router (per the spec-015 `wedged_lock_canary` precedent). New tag names go through the same review the existing tags went through. +- After all three fixes are deployed, 24h log volume on cascade-router for the three identified WARN messages drops below an operator-defined noise floor (target: < 1 occurrence per 24h under normal operation, with anything above representing a genuine anomaly worth investigating). + +--- + +## Requirements + +Grouped by failure mode. + +### PM-ack coverage (failure mode A) + +A1. Every PM type registered in the manifest registry is reachable from the router-side PM-ack dispatch path that runs after PM-focused agents triggered by a GitHub webhook complete. Adding a new PM provider to the registry without editing the dispatch path must not regress this invariant. + +A2. The dispatch path that today exists in two near-identical copies (one in the router's GitHub adapter, one in the shared trigger helpers) consolidates into a single helper. The helper consumes the manifest registry directly and does not branch on `pmType` literal strings. + +A3. When the consolidated helper encounters a PM type that is genuinely not in the registry (configuration error or a project pinned to a deleted provider), the call site emits an error-level log, captures to Sentry under a stable tag, and skips. Silent warn-and-skip is removed from this path. + +### Capacity-gate scope (failure mode B) + +B1. Every PM router adapter (Trello, JIRA, Linear, plus any future PM router adapter) wraps trigger dispatch in both its credential `AsyncLocalStorage` scope AND the PM-provider `AsyncLocalStorage` scope. Adding a new PM router adapter without wrapping in PM-provider scope must produce a CI failure. + +B2. The capacity gate, when called from inside the wrapping, finds a PM provider in scope and runs its in-flight-count check. The conservative "allow when no provider" branch exists only for genuinely-unscoped contexts (e.g. a future caller from a different surface) and is treated as an error there too. + +B3. End-to-end behavior under the original incident scenario: a human moves N work items into TODO simultaneously against a project with `maxInFlightItems: M` (M < N). Exactly M implementation runs proceed; the rest are blocked at the gate with an info-level `pipeline-at-capacity` decision log. + +B4. After deployment, hitting the "PM provider unavailable" branch on the routine path produces a Sentry-captured error, not a steady-state warn. The operator alarm is real. + +### Progress-comment lifecycle (failure mode C) + +C1. 
After an in-run gadget deletes the progress/ack comment and signals consumption, the post-agent success hook does not issue a second DELETE against the GitHub API for the same comment id. + +C2. The legacy fallback chain — "use session state if populated, else use the agent-input copy of the comment id" — continues to work for paths that never populate session state. The fallback is gated explicitly on "the comment has not already been consumed" rather than "session state happens to be empty." + +C3. The post-agent hook is idempotent at the API layer too: if a 404 is returned (comment already deleted by any path, including external manual deletion), the response is logged at DEBUG and not WARN. Other HTTP errors (5xx, auth, throttling) continue to log at WARN. + +C4. After deployment, the 24h volume of `Failed to delete progress comment after agent success` on cascade-router drops below the operator noise floor for normal operation. + +--- + +## Research Notes + +- The cascade codebase already establishes precedent for both prevention patterns. The PM manifest conformance harness (introduced in spec 009 as the sanctioned regression-prevention mechanism for parallel-path drift in PM dispatch) is the obvious extension point for failure mode A's static guard. The single-entrypoint cross-surface test is the sanctioned mechanism for "every router surface registers integrations consistently"; failure mode B's adapter-wrapping invariant is conceptually adjacent and may extend the same test or sit alongside it. +- `AsyncLocalStorage` scope-leakage is a well-documented Node.js failure mode; the mitigation (always wrap dispatch in the same scope the inner function reads from) is standard and is already done correctly in the GitHub router adapter. The PM router adapters are the outliers. +- Idempotency-on-DELETE for HTTP APIs is RFC-7231 baseline behavior; treating 404 on DELETE as success-equivalent is established practice across cloud SDKs (AWS SDK retry middleware, Octokit's own request retry behavior, Stripe's idempotency model). The DEBUG-not-WARN downgrade for 404 on DELETE in failure mode C aligns with that convention. +- Spec 015 (router job dispatch failure recovery) introduced the convention that silent failures in cascade-router must be replaced with grep-stable structured log lines AND must escalate to Sentry where appropriate. All three fixes here follow that convention. The `wedged_lock_canary` tag from spec 015 is the model for the new Sentry tags introduced here (one per failure mode). +- No academic prior art is cited because all three fixes are well-understood engineering hygiene applied to existing infrastructure. The relevant prior art is the cascade codebase's own precedents. + +--- + +## Open Source Decisions + +| Tool | Solves | Decision | Reason | +|------|--------|----------|--------| +| _(none)_ | _(none)_ | _(none)_ | Three hardening fixes against existing infrastructure. The conformance-harness pattern, `AsyncLocalStorage`, and Sentry capture are all already in use. The work is using them correctly, not adopting new ones. | + +--- + +## Strategic decisions + +1. **PM-ack consolidation over minimal patch.** Chose to merge the two duplicate dispatch helpers into a single registry-consuming path rather than just add the missing Linear branch. The duplication is what created the drift; same effort kills the bug class for any future PM provider. Alternative considered: minimal patch (faster to ship, leaves the second copy as a future-bug surface). + +2. 
**Conformance-harness extension over TypeScript exhaustive switch for the PM-ack invariant.** Chose to extend the existing PM manifest conformance harness to assert every registered manifest is reachable from the consolidated PM-ack dispatch path. Alternative considered: discriminated-union exhaustive switch on `PMType` literal. The harness is the established cascade pattern, catches "registry vs dispatch" drift without type-system gymnastics, and naturally accommodates future providers. + +3. **Shared adapter helper over per-adapter wrapping for capacity-gate scope.** Chose to lift the credential + PM-provider scope wrapping into a shared helper that PM router adapters consume, rather than asking each adapter to wrap independently. Same effort, makes the invariant self-documenting, and prevents future PM router adapters from making the same omission. + +4. **Fail-closed for capacity-gate on PM-provider miss.** Chose to block runs and emit error + Sentry capture when the gate cannot find a PM provider in scope, rather than continue allowing-with-warn for backward compat. Once the routine path establishes scope, hitting the miss branch is a real anomaly. The gate's purpose is capacity protection; failing open silently re-introduces the original incident class. + +5. **Explicit boolean flag for progress-comment consumption state.** Chose a named boolean on session state (e.g. `initialCommentIdConsumed: true`), separate from the comment-id field. Alternatives considered: sentinel value on the existing id field (null/undefined ambiguity), consume-once getter that atomically clears (more invasive change to session state API). The boolean is clearest, most testable, and lowest-risk. + +6. **Gate the legacy fallback on the consumed-flag, not remove it.** Chose to keep the existing fallback to the agent-input copy of the comment id but gate it on "not yet consumed". Removing the fallback entirely would force every code path that creates an ack comment to populate session state correctly — more invasive than the bug warrants and risks breaking older paths. + +7. **Defense in depth: 404-on-DELETE downgraded to DEBUG.** In addition to the state-machine fix in #5/#6, the API-layer call also treats 404 on DELETE as success-equivalent and logs at DEBUG. Belt-and-suspenders; if a future regression of similar shape recurs, the WARN log volume stays clean while a DEBUG breadcrumb still exists for audit. Other HTTP errors continue to log at WARN. + +8. **Three independent plans downstream.** Chose to decompose this spec into three plans corresponding to the three failure modes rather than bundle into one plan. Each plan can be reviewed, merged, and reverted independently; none blocks another. Sequencing is deferred to `/plan` rather than fixed in this spec. + +--- + +## Acceptance Criteria (outcome-level) + +PM-ack coverage: + +1. A PM-focused agent (e.g. `backlog-manager`) triggered from a GitHub webhook on a Linear-based project produces a visible acknowledgment comment on the Linear work item with the same content shape that Trello and JIRA projects get today. + +2. Adding a new PM provider to the manifest registry without wiring it through the consolidated PM-ack dispatch path produces a CI failure with a message that names the missing provider and the dispatch path that needs updating. + +3. The `Unknown PM type for PM-focused agent ack, skipping` WARN message is no longer emitted on the routine path. 
If it does emit, it represents a genuine misconfiguration that operators should investigate, and is captured to Sentry under a stable tag. + +Capacity-gate scope: + +4. Under the canonical incident scenario (a human moves N work items into TODO simultaneously against a project with `maxInFlightItems: M < N`), exactly M implementation runs proceed and the remaining N-M trigger evaluations log an info-level `pipeline-at-capacity` decision. + +5. Adding a new PM router adapter to the codebase without establishing PM-provider scope around dispatch produces a CI failure that names the missing wrapping. + +6. The `pipeline-capacity-gate: PM provider unavailable, allowing run` WARN message is no longer emitted on the routine path. If it does emit, it represents a real scope leak and is captured to Sentry under a stable tag. + +Progress-comment lifecycle: + +7. After the create-PR-review gadget deletes the progress comment mid-run, the post-agent success hook does not call DELETE on the same comment id. No 404 from the GitHub comments API is generated by cascade-router on this path. + +8. A code path that creates a progress comment but never populates session state (legacy or future edge case) still has its progress comment deleted by the post-agent hook via the agent-input fallback. + +9. If the GitHub comments DELETE API returns 404 from any path (e.g. a user manually deleted the comment), the response logs at DEBUG, not WARN, and a single Sentry breadcrumb is preserved for audit without escalating. + +10. After deployment, the 24h volume of the three identified WARN messages on cascade-router drops below 1 occurrence per 24h under normal operation. Anything above that threshold is investigable as a real anomaly rather than steady-state noise. + +--- + +## Documentation Impact (high-level) + +- `src/integrations/README.md` — add a "PM-type dispatch coverage invariant" callout in the section that explains how a new PM provider is registered, pointing at the conformance harness as the regression net for PM-ack dispatch. +- `CLAUDE.md` — add a small subsection adjacent to the existing "PM Integration Architecture" pointer describing the capacity-gate invariant: "Every PM router adapter must wrap trigger dispatch in PM-provider `AsyncLocalStorage` scope, otherwise the in-flight cap is silently disabled. This is enforced by the capacity-gate scope test." This is a load-bearing operator-facing rule that does not derive from reading the code in isolation. + +--- + +## Out of Scope + +- The stray org-level GitHub webhook traffic that the same audit also surfaced (~11 unprovisioned-repo PR webhooks in the sampled window). That is operator-side cleanup at the GitHub side, not a cascade code change. +- The Claude-Code 401 `authentication_failed` errors observed during the audit (~3 in 24h). Token rotation / persona configuration is operator-side. +- The Anthropic group usage cap of $0 observed during the audit. Account-side configuration is operator-side. +- The 25-of-203 `review`-agent runs with NULL `agent_runs.work_item_id`. Likely a PR↔work-item linking race, separate spec when chased. +- Watchdog timeouts observed during the audit (2 in 24h). Two long-running agents hit the watchdog and were force-exited; per-agent investigation is separate. 
From 26717c38af3ec232eff37eded7fe450a6fdc3e69 Mon Sep 17 00:00:00 2001 From: Zbigniew Sobiecki Date: Wed, 29 Apr 2026 10:22:18 +0000 Subject: [PATCH 06/22] chore(plan): 017/3 lock --- ...double-delete.md => 3-progress-comment-double-delete.md.wip} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename docs/plans/017-router-silent-failure-hardening/{3-progress-comment-double-delete.md => 3-progress-comment-double-delete.md.wip} (99%) diff --git a/docs/plans/017-router-silent-failure-hardening/3-progress-comment-double-delete.md b/docs/plans/017-router-silent-failure-hardening/3-progress-comment-double-delete.md.wip similarity index 99% rename from docs/plans/017-router-silent-failure-hardening/3-progress-comment-double-delete.md rename to docs/plans/017-router-silent-failure-hardening/3-progress-comment-double-delete.md.wip index 334b7021..a35d9251 100644 --- a/docs/plans/017-router-silent-failure-hardening/3-progress-comment-double-delete.md +++ b/docs/plans/017-router-silent-failure-hardening/3-progress-comment-double-delete.md.wip @@ -6,7 +6,7 @@ plan_slug: progress-comment-double-delete level: plan parent_spec: docs/specs/017-router-silent-failure-hardening.md depends_on: [] -status: pending +status: wip --- # 017/3: Progress-comment double-delete race From ddb90ea425649a817193574a011462ffe961bc2a Mon Sep 17 00:00:00 2001 From: Cascade Bot Date: Wed, 29 Apr 2026 10:25:48 +0000 Subject: [PATCH 07/22] test: add coverage for buildExecutionPlan in secretOrchestrator --- .../unit/backends/secretOrchestrator.test.ts | 148 +++++++++++++++++- 1 file changed, 147 insertions(+), 1 deletion(-) diff --git a/tests/unit/backends/secretOrchestrator.test.ts b/tests/unit/backends/secretOrchestrator.test.ts index 923fca17..74e76f2f 100644 --- a/tests/unit/backends/secretOrchestrator.test.ts +++ b/tests/unit/backends/secretOrchestrator.test.ts @@ -1,14 +1,75 @@ +/* biome-ignore lint/suspicious/noExplicitAny: test mocks */ import { beforeEach, describe, expect, it, vi } from 'vitest'; vi.mock('../../../src/utils/runLink.js', () => ({ getDashboardUrl: vi.fn(), })); -import { injectRunLinkSecrets } from '../../../src/backends/secretOrchestrator.js'; +// Mock everything that buildExecutionPlan might call +vi.mock('../../../src/agents/shared/modelResolution.js', () => ({ + resolveModelConfig: vi.fn().mockResolvedValue({ + systemPrompt: 'system', + taskPrompt: 'task', + model: 'claude', + maxIterations: 10, + }), +})); + +vi.mock('../../../src/agents/shared/promptContext.js', () => ({ + buildPromptContext: vi.fn().mockReturnValue({}), +})); + +vi.mock('../../../src/db/repositories/partialsRepository.js', () => ({ + loadPartials: vi.fn().mockResolvedValue(new Map()), +})); + +vi.mock('../../../src/sentry/integration.js', () => ({ + getSentryIntegrationConfig: vi.fn(), +})); + +vi.mock('../../../src/agents/definitions/profiles.js', () => ({ + getAgentProfile: vi.fn().mockReturnValue({ + fetchContext: vi.fn().mockResolvedValue({}), + finishHooks: {}, + filterTools: vi.fn().mockReturnValue([]), + }), +})); + +vi.mock('../../../src/agents/definitions/toolManifests.js', () => ({ + getToolManifests: vi.fn().mockReturnValue([]), +})); + +vi.mock('../../../src/backends/registry.js', () => ({ + isNativeToolEngineDefinition: vi.fn().mockReturnValue(false), +})); + +vi.mock('../../../src/agents/definitions/index.js', () => ({ + needsGitStateStopHooks: vi.fn().mockReturnValue(false), +})); + +vi.mock('../../../src/backends/secretBuilder.js', () => ({ + augmentProjectSecrets: vi.fn().mockResolvedValue({}), + 
resolveGitHubToken: vi.fn(), + injectGitHubAckCommentId: vi.fn(), + injectProgressCommentId: vi.fn(), +})); + +vi.mock('../../../src/backends/sidecarManager.js', () => ({ + createCompletionArtifacts: vi.fn().mockReturnValue({}), +})); + +import { buildPromptContext } from '../../../src/agents/shared/promptContext.js'; +import { + buildExecutionPlan, + injectRunLinkSecrets, +} from '../../../src/backends/secretOrchestrator.js'; +import { getSentryIntegrationConfig } from '../../../src/sentry/integration.js'; import type { ProjectConfig } from '../../../src/types/index.js'; import { getDashboardUrl } from '../../../src/utils/runLink.js'; const mockGetDashboardUrl = vi.mocked(getDashboardUrl); +const mockGetSentryIntegrationConfig = vi.mocked(getSentryIntegrationConfig); +const mockBuildPromptContext = vi.mocked(buildPromptContext); function makeProject(overrides?: Partial): ProjectConfig { return { @@ -24,6 +85,91 @@ function makeProject(overrides?: Partial): ProjectConfig { beforeEach(() => { mockGetDashboardUrl.mockReturnValue(undefined); + vi.clearAllMocks(); +}); + +describe('buildExecutionPlan', () => { + it('fetches sentry config for alerting agent', async () => { + mockGetSentryIntegrationConfig.mockResolvedValueOnce({ + organizationSlug: 'org', + resultsContainerId: 'sentry-container-123', + }); + + const project = makeProject(); + await buildExecutionPlan( + 'alerting', + { project, config: {}, triggerType: 'sentry:issue-created' } as unknown as any, + '/repo', + {} as unknown as any, + {} as unknown as any, + 'token', + false, + 'claude-code', + {} as unknown as any, + ); + + expect(mockGetSentryIntegrationConfig).toHaveBeenCalledWith('test-project'); + expect(mockBuildPromptContext).toHaveBeenCalledWith( + undefined, + project, + 'sentry:issue-created', + undefined, + undefined, + 'sentry-container-123', + ); + }); + + it('does not fetch sentry config for non-alerting agent', async () => { + const project = makeProject(); + await buildExecutionPlan( + 'implementation', + { project, config: {}, triggerType: 'manual' } as unknown as any, + '/repo', + {} as unknown as any, + {} as unknown as any, + 'token', + false, + 'claude-code', + {} as unknown as any, + ); + + expect(mockGetSentryIntegrationConfig).not.toHaveBeenCalled(); + expect(mockBuildPromptContext).toHaveBeenCalledWith( + undefined, + project, + 'manual', + undefined, + undefined, + undefined, + ); + }); + + it('handles sentry config failure gracefully', async () => { + mockGetSentryIntegrationConfig.mockRejectedValueOnce(new Error('DB failure')); + + const project = makeProject(); + await buildExecutionPlan( + 'alerting', + { project, config: {}, triggerType: 'sentry:issue-created' } as unknown as any, + '/repo', + {} as unknown as any, + {} as unknown as any, + 'token', + false, + 'claude-code', + {} as unknown as any, + ); + + expect(mockGetSentryIntegrationConfig).toHaveBeenCalledWith('test-project'); + expect(mockBuildPromptContext).toHaveBeenCalledWith( + undefined, + project, + 'sentry:issue-created', + undefined, + undefined, + undefined, + ); + }); }); describe('injectRunLinkSecrets', () => { From 8a7261655061d7dac087fc8d956da88df8b9e2e5 Mon Sep 17 00:00:00 2001 From: Zbigniew Sobiecki Date: Wed, 29 Apr 2026 10:28:44 +0000 Subject: [PATCH 08/22] fix(triggers): suppress redundant progress-comment DELETE after gadget mid-run cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Plan 017/3 (progress-comment-double-delete). 
Closes failure mode C from spec 017's 24h log audit on 2026-04-29. Root cause: deleteProgressCommentOnSuccess (post-agent hook) read sessionState.initialCommentId, fell back to result.agentInput.ackCommentId when session state was empty, and issued a DELETE. But "session state cleared by an in-run gadget" was indistinguishable from "session state never populated", so the fallback fired and re-deleted comments that gadgets had already disposed of mid-run. GitHub returned 404 and produced 72 WARN entries per day on cascade-router (no functional impact — the comment WAS deleted — but the noise dominated WARN volume and obscured real failures). Three changes: 1. New `initialCommentIdConsumed: boolean` flag on SessionStateData (default false). Both `deleteInitialComment` (gadget-driven) and `clearInitialComment` (sidecar-driven) set it to true after disposing of the comment. Distinguishes "had a comment, now gone" from "never had one". 2. `deleteProgressCommentOnSuccess` reads the flag first; if true, skips the entire deletion path including the legacy `agentInput.ackCommentId` fallback. The legacy fallback continues to work for paths that never populated session state (consumed stays false). 3. `githubClient.deletePRComment` treats HTTP 404 as success-equivalent under RFC-7231 idempotency: returns without throwing, logs at DEBUG with the comment id. Other HTTP errors (5xx, 401, network) continue to throw. Defense in depth: even if a future regression of similar shape recurs, the WARN volume stays clean while a DEBUG breadcrumb persists for audit. Test coverage: +7 tests in sessionState (new `initialCommentIdConsumed flag` describe block), +3 in ack-comments (consumed-bypass + headline regression pin + legacy-fallback preservation), +4 in client (404 → DEBUG + 5xx/401/network rejection regression pins). Full suite: 470 files / 8649 tests passing (+14 from baseline). Co-Authored-By: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 1 + ... 3-progress-comment-double-delete.md.done} | 22 +++--- src/gadgets/sessionState.ts | 21 ++++- src/github/client.ts | 27 +++++-- src/triggers/github/ack-comments.ts | 9 ++- tests/unit/gadgets/sessionState.test.ts | 79 +++++++++++++++++++ tests/unit/github/client.test.ts | 55 +++++++++++++ .../unit/triggers/github/ack-comments.test.ts | 58 ++++++++++++++ 8 files changed, 254 insertions(+), 18 deletions(-) rename docs/plans/017-router-silent-failure-hardening/{3-progress-comment-double-delete.md.wip => 3-progress-comment-double-delete.md.done} (92%) diff --git a/CHANGELOG.md b/CHANGELOG.md index c57b92da..f016d67f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ All notable user-visible changes to CASCADE are documented here. The format is l ### Changed +- **Progress-comment lifecycle: post-agent cleanup hook now skips when an in-run gadget already deleted the comment** (spec 017, plan 3 of 3). The post-agent `deleteProgressCommentOnSuccess` hook used to read `sessionState.initialCommentId`, fall back to `result.agentInput.ackCommentId` when session state was empty, and issue a redundant DELETE — but "session state cleared by a gadget" was indistinguishable from "session state never populated", so the fallback fired and re-deleted comments that were already gone. GitHub returned 404 and `WARN: Failed to delete progress comment after agent success` was logged 72 times per day on cascade-router (live audit on 2026-04-29). Adds an explicit `initialCommentIdConsumed: boolean` flag on `SessionStateData`. 
Both `deleteInitialComment` (gadget-driven) and `clearInitialComment` (sidecar-driven) now set the flag to `true` after disposing of the comment. The post-agent hook checks the flag first and skips the entire deletion path — including the legacy `agentInput.ackCommentId` fallback — when consumed. As defense in depth, `githubClient.deletePRComment` now treats HTTP 404 as success (RFC-7231 idempotency) and logs at DEBUG instead of letting the error bubble as a WARN; other HTTP errors (5xx, 401, network) continue to throw. The legacy fallback to `agentInput.ackCommentId` continues to work for code paths that never populate session state. See [spec 017](docs/specs/017-router-silent-failure-hardening.md). - **PM image delivery: Linear GraphQL fixture + extraction-coverage regression test** (spec 016, plan 3 of 3). Captures a reconstructed Linear `Issue` GraphQL payload at `tests/fixtures/linear-issue-with-screenshot.json` containing extension-less and extensioned inline-pasted images (description + comment bodies) plus formal Attachment records (Slack/GitHub/Sentry link previews) that must NOT be mistaken for inline images. The unit test at `tests/unit/pm/linear/extraction-coverage.test.ts` pins the contract and fails loudly with a specific URL-missing message if Linear ever changes its payload shape in a way that loses inline images. Documents the conclusion in `src/integrations/README.md`: `Issue.description` markdown is canonical for Linear inline images; `Issue.attachments` is the wrong surface (formal Attachment records, not pastes). No production code change — this plan ships the regression net for the contract Plans 1+2 established. See [spec 016](docs/specs/016-pm-image-delivery-reliability.md). - **PM image delivery: runtime `cascade-tools pm read-work-item` gadget now delivers images on disk** (spec 016, plan 2 of 3). The runtime gadget that agents call mid-run for a work item used to return text only — its "Pre-fetched Images" section listed URL refs but no local file paths, so an agent that needed to re-read a work item (e.g. after a teammate added a screenshot) had no way to actually see the new image. After this plan, the gadget downloads any image media present and writes it to `.cascade/context/images/work-item--img-.` (extension derived from the resolved Content-Type MIME), then returns text whose new "Local Image Files" section lists actual file paths the agent's file-read tool can consume. Failed downloads are surfaced in a "Failed Image Downloads" subsection so they're never silently dropped. Same diagnostic log line as the boot path (`[image-pipeline] work-item-fetch summary`) — operators see consistent shape across boot and runtime fetches. Closes the mid-run pickup gap. See [spec 016](docs/specs/016-pm-image-delivery-reliability.md). - **PM image delivery: extension-less Linear pasted-image URLs are no longer dropped at the pre-download MIME filter** (spec 016, plan 1 of 3). Linear's `https://uploads.linear.app/` URLs (with no file extension in the pathname) used to fall through `mimeTypeFromUrl` to `application/octet-stream` and were silently filtered out by `filterImageMedia` before the download loop ran. The fix introduces an `image/*` wildcard sentinel for trusted PM-provider upload hosts (allowlisted by hostname); `isImageMimeType` now accepts the wildcard, and the download response's `Content-Type` header resolves it to a concrete MIME (`image/png`, etc.) before any image is written. 
The shared `downloadAndPrepareImages` helper consolidates the per-provider download dispatch (jira/linear/trello) so both the boot-path and the runtime gadget (spec 016 plan 2) share one code path. Adds AC#5's grep-stable diagnostic line — `[image-pipeline] work-item-fetch summary` — emitted once per work-item-fetch with stable fields (`provider`, `workItemId`, `urlsDetected`, `urlsAfterFilter`, `urlsDownloaded`, `urlsFailed`, `urlsByMimeType`). Closes the silent screenshot-drop bug class verified live on 2026-04-26 (ucho/MNG-357). See [spec 016](docs/specs/016-pm-image-delivery-reliability.md). diff --git a/docs/plans/017-router-silent-failure-hardening/3-progress-comment-double-delete.md.wip b/docs/plans/017-router-silent-failure-hardening/3-progress-comment-double-delete.md.done similarity index 92% rename from docs/plans/017-router-silent-failure-hardening/3-progress-comment-double-delete.md.wip rename to docs/plans/017-router-silent-failure-hardening/3-progress-comment-double-delete.md.done index a35d9251..1991c9df 100644 --- a/docs/plans/017-router-silent-failure-hardening/3-progress-comment-double-delete.md.wip +++ b/docs/plans/017-router-silent-failure-hardening/3-progress-comment-double-delete.md.done @@ -6,7 +6,7 @@ plan_slug: progress-comment-double-delete level: plan parent_spec: docs/specs/017-router-silent-failure-hardening.md depends_on: [] -status: wip +status: done --- # 017/3: Progress-comment double-delete race @@ -198,13 +198,13 @@ n/a — all ACs auto-tested. ## Progress -- [ ] AC #1 -- [ ] AC #2 -- [ ] AC #3 -- [ ] AC #4 -- [ ] AC #5 -- [ ] AC #6 -- [ ] AC #7 -- [ ] AC #8 -- [ ] AC #9 -- [ ] AC #10 +- [x] AC #1 — `SessionStateData.initialCommentIdConsumed: boolean` exists, default `false` (verified by `tests/unit/gadgets/sessionState.test.ts > initialCommentIdConsumed flag > fresh state has initialCommentIdConsumed: false`). +- [x] AC #2 — `deleteInitialComment` sets consumed on 200/404, leaves it false on other errors (verified by sessionState tests "on success" and "on error"). +- [x] AC #3 — `deleteProgressCommentOnSuccess` skips entirely when consumed=true (verified by `tests/unit/triggers/github/ack-comments.test.ts > skips entirely when initialCommentIdConsumed=true` + the headline "consumed=true wins over agentInput.ackCommentId fallback" regression pin). +- [x] AC #4 — Legacy fallback preserved when consumed=false (verified by `consumed=false preserves the legacy fallback to agentInput.ackCommentId` test). +- [x] AC #5 — `deletePRComment` treats 404 as success (verified by `tests/unit/github/client.test.ts > treats HTTP 404 as success` + 5xx/401/network rejection regression pins). +- [x] AC #6 — Routine gadget-then-hook flow produces exactly one DELETE (covered by tests 3+5 in combination; no separate integration test required given per-component coverage). +- [x] AC #7 — All new/modified code has tests (sessionState +7 tests, ack-comments +3, client +4 = 14 new tests). +- [x] AC #8 — `npm run typecheck` passes (no errors). +- [x] AC #9 — `npm test` (full unit suite) passes (470 files / 8649 tests, +14 from prior baseline). +- [x] AC #10 — `npm run lint` passes (Biome clean). 
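Before the file-by-file diffs, a condensed sketch of how the three pieces are meant to interact. This is illustrative only: `AckState`, `gadgetDeleteInitialComment`, and `cleanupAfterAgent` are stand-in names for this sketch, not the real module surfaces, and the delete function is passed in rather than imported.

type AckState = { initialCommentId: number | null; initialCommentIdConsumed: boolean };
type DeleteComment = (owner: string, repo: string, commentId: number) => Promise<void>;

// Mid-run gadget path: dispose of the ack comment and record that fact.
async function gadgetDeleteInitialComment(
  state: AckState,
  del: DeleteComment,
  owner: string,
  repo: string,
): Promise<void> {
  if (state.initialCommentId == null) return; // never had a comment
  await del(owner, repo, state.initialCommentId); // the real client now treats 404 as success
  state.initialCommentId = null;
  state.initialCommentIdConsumed = true; // "had a comment, now gone"
}

// Post-agent cleanup: consult the flag before any DELETE, including the legacy fallback.
async function cleanupAfterAgent(
  state: AckState,
  del: DeleteComment,
  owner: string,
  repo: string,
  legacyAckCommentId?: number,
): Promise<void> {
  if (state.initialCommentIdConsumed) return; // a gadget already disposed of it mid-run: skip entirely
  const id = state.initialCommentId ?? legacyAckCommentId; // fallback preserved when nothing was consumed
  if (id != null) await del(owner, repo, id);
}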
diff --git a/src/gadgets/sessionState.ts b/src/gadgets/sessionState.ts index 0d59ecb6..bb17390d 100644 --- a/src/gadgets/sessionState.ts +++ b/src/gadgets/sessionState.ts @@ -47,6 +47,14 @@ interface SessionStateData { reviewBody: string | null; reviewEvent: string | null; initialCommentId: number | null; + /** + * Set to `true` after a gadget mid-run delete (or sidecar-driven clear) has + * disposed of the initial ack comment. The post-agent cleanup hook reads + * this and skips its DELETE entirely, including the legacy fallback to + * `agentInput.ackCommentId`. Distinguishes "we never had a comment" (false) + * from "we had one but it's already gone" (true). + */ + initialCommentIdConsumed: boolean; } /** @@ -73,6 +81,7 @@ export class SessionState { reviewBody: null, reviewEvent: null, initialCommentId: null, + initialCommentIdConsumed: false, }; init(options: InitSessionStateOptions): void { @@ -105,6 +114,7 @@ export class SessionState { reviewBody: null, reviewEvent: null, initialCommentId: null, + initialCommentIdConsumed: false, }; } @@ -157,6 +167,9 @@ export class SessionState { */ clearInitialComment(): void { this.state.initialCommentId = null; + // Mark consumed so the post-agent callback skips even if agentInput + // still carries the original ackCommentId as a legacy fallback. + this.state.initialCommentIdConsumed = true; } /** @@ -176,8 +189,14 @@ export class SessionState { try { const { githubClient } = await import('../github/client.js'); await githubClient.deletePRComment(owner, repo, commentId); + // Mark consumed so the post-agent callback's legacy fallback to + // `agentInput.ackCommentId` does not re-issue a DELETE for the + // same id. `deletePRComment` swallows 404 internally, so reaching + // here without throwing covers both 200/204 and 404 outcomes. + this.state.initialCommentIdConsumed = true; } catch { - // Best-effort: restore the id so post-agent callback can retry + // Best-effort: restore the id so post-agent callback can retry. + // Consumed flag stays false — the comment may still be live. this.state.initialCommentId = commentId; } } diff --git a/src/github/client.ts b/src/github/client.ts index 00bacfe3..ea98769e 100644 --- a/src/github/client.ts +++ b/src/github/client.ts @@ -234,11 +234,28 @@ export const githubClient = { async deletePRComment(owner: string, repo: string, commentId: number): Promise { logger.debug('Deleting PR comment', { owner, repo, commentId }); - await getClient().issues.deleteComment({ - owner, - repo, - comment_id: commentId, - }); + try { + await getClient().issues.deleteComment({ + owner, + repo, + comment_id: commentId, + }); + } catch (err) { + // 404 is success-equivalent under RFC-7231 idempotency (the comment is gone). + // Any path — gadget mid-run delete, sidecar-driven clear, user manual delete — + // can have already removed the comment. The post-agent cleanup hook used to + // log this as a WARN 72 times/day in prod; downgrade to DEBUG so the noise + // doesn't drown out real failures while preserving an audit breadcrumb. 
+ if ((err as { status?: number })?.status === 404) { + logger.debug('PR comment already deleted (404 on DELETE)', { + owner, + repo, + commentId, + }); + return; + } + throw err; + } }, async getPRReviews(owner: string, repo: string, prNumber: number): Promise { diff --git a/src/triggers/github/ack-comments.ts b/src/triggers/github/ack-comments.ts index bb15e8be..997cafa0 100644 --- a/src/triggers/github/ack-comments.ts +++ b/src/triggers/github/ack-comments.ts @@ -43,7 +43,14 @@ export async function deleteProgressCommentOnSuccess( const { getSessionState } = await import('../../gadgets/sessionState.js'); const sessionState = getSessionState(); - const { initialCommentId } = sessionState; + const { initialCommentId, initialCommentIdConsumed } = sessionState; + + // If a gadget mid-run delete (or sidecar-driven clear) has already disposed + // of the initial ack comment, the post-agent hook must not re-issue a DELETE — + // even if `agentInput.ackCommentId` still carries the original id as a legacy + // fallback. Without this gate, the duplicate DELETE returns 404 and produced + // ~72 WARN entries per day on cascade-router (live audit, 2026-04-29). + if (initialCommentIdConsumed) return; // Fall back to ackCommentId stored in agentInput if sessionState wasn't populated const ackCommentId = diff --git a/tests/unit/gadgets/sessionState.test.ts b/tests/unit/gadgets/sessionState.test.ts index 2dbbeb35..704543fa 100644 --- a/tests/unit/gadgets/sessionState.test.ts +++ b/tests/unit/gadgets/sessionState.test.ts @@ -317,6 +317,85 @@ describe('deleteInitialComment', () => { }); }); +// --------------------------------------------------------------------------- +// initialCommentIdConsumed flag (spec 017 / plan 3 — progress-comment-double-delete) +// --------------------------------------------------------------------------- + +describe('initialCommentIdConsumed flag', () => { + beforeEach(() => { + vi.resetAllMocks(); + initSessionState({ agentType: 'implementation' }); + }); + + it('fresh state has initialCommentIdConsumed: false', () => { + const state = getSessionState(); + expect(state.initialCommentIdConsumed).toBe(false); + }); + + it('recordInitialComment does NOT flip the consumed flag', () => { + recordInitialComment(123); + const state = getSessionState(); + expect(state.initialCommentId).toBe(123); + expect(state.initialCommentIdConsumed).toBe(false); + }); + + it('deleteInitialComment on success: clears initialCommentId AND sets initialCommentIdConsumed=true', async () => { + recordInitialComment(99); + mockDeletePRComment.mockResolvedValue(undefined); + + await deleteInitialComment('owner', 'repo'); + + const state = getSessionState(); + expect(state.initialCommentId).toBeNull(); + expect(state.initialCommentIdConsumed).toBe(true); + }); + + it('deleteInitialComment on error: does NOT set consumed (post-agent hook may retry)', async () => { + recordInitialComment(99); + mockDeletePRComment.mockRejectedValue(new Error('5xx server error')); + + await deleteInitialComment('owner', 'repo'); + + const state = getSessionState(); + expect(state.initialCommentId).toBe(99); // restored for retry + expect(state.initialCommentIdConsumed).toBe(false); + }); + + it('deleteInitialComment when initialCommentId is null: no-op, consumed stays false', async () => { + // Don't call recordInitialComment — id stays null + await deleteInitialComment('owner', 'repo'); + + const state = getSessionState(); + expect(state.initialCommentId).toBeNull(); + expect(state.initialCommentIdConsumed).toBe(false); + 
expect(mockDeletePRComment).not.toHaveBeenCalled(); + }); + + it('clearInitialComment also sets initialCommentIdConsumed=true', () => { + // The sidecar-driven path: backend adapter signals subprocess already deleted. + // The post-agent callback must NOT redundantly delete after this signal. + recordInitialComment(456); + clearInitialComment(); + + const state = getSessionState(); + expect(state.initialCommentId).toBeNull(); + expect(state.initialCommentIdConsumed).toBe(true); + }); + + it('initSessionState resets initialCommentIdConsumed to false', () => { + // Set up a state where consumed=true, then re-init: the new run starts fresh. + recordInitialComment(789); + clearInitialComment(); + expect(getSessionState().initialCommentIdConsumed).toBe(true); + + initSessionState({ agentType: 'review' }); + + const state = getSessionState(); + expect(state.initialCommentId).toBeNull(); + expect(state.initialCommentIdConsumed).toBe(false); + }); +}); + describe('getSessionState', () => { beforeEach(() => { initSessionState({ diff --git a/tests/unit/github/client.test.ts b/tests/unit/github/client.test.ts index a6e8423d..bf859cba 100644 --- a/tests/unit/github/client.test.ts +++ b/tests/unit/github/client.test.ts @@ -324,6 +324,61 @@ describe('githubClient', () => { comment_id: 200, }); }); + + // --------------------------------------------------------------------- + // 404 idempotency (spec 017 / plan 3 — progress-comment-double-delete) + // --------------------------------------------------------------------- + + it('treats HTTP 404 as success: returns without throwing, logs at DEBUG with commentId', async () => { + const { logger } = await import('../../../src/utils/logging.js'); + const debugSpy = vi.mocked(logger.debug); + const warnSpy = vi.mocked(logger.warn); + const errorSpy = vi.mocked(logger.error); + + // Octokit's RequestError shape on 404 + const notFoundErr = Object.assign(new Error('Not Found'), { status: 404 }); + mockIssues.deleteComment.mockRejectedValueOnce(notFoundErr); + + await expect( + withGitHubToken('test-token', () => + githubClient.deletePRComment('owner', 'repo', 4341389855), + ), + ).resolves.toBeUndefined(); + + expect(debugSpy).toHaveBeenCalledWith( + expect.stringMatching(/already deleted|404/i), + expect.objectContaining({ commentId: 4341389855 }), + ); + expect(warnSpy).not.toHaveBeenCalled(); + expect(errorSpy).not.toHaveBeenCalled(); + }); + + it('rethrows on HTTP 5xx (server error)', async () => { + const serverErr = Object.assign(new Error('Server Error'), { status: 503 }); + mockIssues.deleteComment.mockRejectedValueOnce(serverErr); + + await expect( + withGitHubToken('test-token', () => githubClient.deletePRComment('owner', 'repo', 200)), + ).rejects.toThrow('Server Error'); + }); + + it('rethrows on HTTP 401 (auth failure)', async () => { + const authErr = Object.assign(new Error('Bad credentials'), { status: 401 }); + mockIssues.deleteComment.mockRejectedValueOnce(authErr); + + await expect( + withGitHubToken('test-token', () => githubClient.deletePRComment('owner', 'repo', 200)), + ).rejects.toThrow('Bad credentials'); + }); + + it('rethrows non-HTTP errors (no status field)', async () => { + const networkErr = new Error('ECONNRESET'); + mockIssues.deleteComment.mockRejectedValueOnce(networkErr); + + await expect( + withGitHubToken('test-token', () => githubClient.deletePRComment('owner', 'repo', 200)), + ).rejects.toThrow('ECONNRESET'); + }); }); describe('getPRReviews', () => { diff --git a/tests/unit/triggers/github/ack-comments.test.ts 
b/tests/unit/triggers/github/ack-comments.test.ts index 3ef80edd..48f0c6b2 100644 --- a/tests/unit/triggers/github/ack-comments.test.ts +++ b/tests/unit/triggers/github/ack-comments.test.ts @@ -168,6 +168,64 @@ describe('deleteProgressCommentOnSuccess', () => { deleteProgressCommentOnSuccess(result, makeAgentResult()), ).resolves.toBeUndefined(); }); + + // ------------------------------------------------------------------------- + // initialCommentIdConsumed gating (spec 017 / plan 3 — progress-comment-double-delete) + // ------------------------------------------------------------------------- + + it('skips entirely when initialCommentIdConsumed=true (no DELETE issued)', async () => { + // A gadget already deleted the comment mid-run and signalled consumption. + // The post-agent hook must not issue a redundant DELETE. + mockGetSessionState.mockReturnValue({ + initialCommentId: null, + initialCommentIdConsumed: true, + } as ReturnType); + + const result = makeResult(); + await deleteProgressCommentOnSuccess(result, makeAgentResult()); + + expect(mockGithubClient.deletePRComment).not.toHaveBeenCalled(); + }); + + it('consumed=true wins over agentInput.ackCommentId fallback (the headline regression pin)', async () => { + // Live incident shape: gadget cleared session state but agentInput still + // holds the original ackCommentId. Without the consumed gate, the hook + // re-deletes the already-gone comment and produces a 404 WARN. + mockGetSessionState.mockReturnValue({ + initialCommentId: null, + initialCommentIdConsumed: true, + } as ReturnType); + + const result = makeResult({ + agentInput: { + repoFullName: 'acme/myapp', + ackCommentId: 4341389855, // a real comment id from the prod incident + }, + } as Partial); + + await deleteProgressCommentOnSuccess(result, makeAgentResult()); + + expect(mockGithubClient.deletePRComment).not.toHaveBeenCalled(); + }); + + it('consumed=false preserves the legacy fallback to agentInput.ackCommentId (Spec AC #8)', async () => { + // Paths that never populate session state (older code paths) must still work. 
+ mockGetSessionState.mockReturnValue({ + initialCommentId: null, + initialCommentIdConsumed: false, + } as ReturnType); + + const result = makeResult({ + agentInput: { + repoFullName: 'acme/myapp', + ackCommentId: 999, + }, + } as Partial); + + await deleteProgressCommentOnSuccess(result, makeAgentResult()); + + expect(mockGithubClient.deletePRComment).toHaveBeenCalledWith('acme', 'myapp', 999); + }); }); describe('updateInitialCommentWithError', () => { From b2ba6a39ebfefa1b4744425e95cb9244c3cf9f9c Mon Sep 17 00:00:00 2001 From: Zbigniew Sobiecki Date: Wed, 29 Apr 2026 10:34:10 +0000 Subject: [PATCH 09/22] chore(plan): 017/1 lock --- .../{1-pm-ack-coverage.md => 1-pm-ack-coverage.md.wip} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename docs/plans/017-router-silent-failure-hardening/{1-pm-ack-coverage.md => 1-pm-ack-coverage.md.wip} (99%) diff --git a/docs/plans/017-router-silent-failure-hardening/1-pm-ack-coverage.md b/docs/plans/017-router-silent-failure-hardening/1-pm-ack-coverage.md.wip similarity index 99% rename from docs/plans/017-router-silent-failure-hardening/1-pm-ack-coverage.md rename to docs/plans/017-router-silent-failure-hardening/1-pm-ack-coverage.md.wip index ba4be3b2..daa21b7f 100644 --- a/docs/plans/017-router-silent-failure-hardening/1-pm-ack-coverage.md +++ b/docs/plans/017-router-silent-failure-hardening/1-pm-ack-coverage.md.wip @@ -6,7 +6,7 @@ plan_slug: pm-ack-coverage level: plan parent_spec: docs/specs/017-router-silent-failure-hardening.md depends_on: [] -status: pending +status: wip --- # 017/1: PM-ack dispatch coverage (Linear branch missing) From cfb0afa95fb3f3034de709e4e88c4547fa7e2b9a Mon Sep 17 00:00:00 2001 From: Zbigniew Sobiecki Date: Wed, 29 Apr 2026 10:42:48 +0000 Subject: [PATCH 10/22] fix(router): consolidate PM-ack dispatch via manifest registry, restore Linear coverage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Plan 017/1 (pm-ack-coverage). Closes failure mode A from spec 017's 24h log audit on 2026-04-29: PM-focused agents (e.g. backlog-manager) triggered from a GitHub webhook against Linear-based projects silently skipped their PM-side ack. The router-adapter's local `postPMAck` helper had `if (pmType === 'trello')` / `if (pmType === 'jira')` literal branches but no Linear case, so control fell through to `WARN: Unknown PM type for PM-focused agent ack, skipping` — 24× per day on prod cascade-router, all from `ucho`. A near-identical helper at `src/triggers/shared/pm-ack.ts:postPMAckComment` had the Linear branch, so this was pure parallel-path drift between two helpers that should have been one (same shape as PR #1220). Three changes: 1. New consolidated helper `dispatchPMAck` at `src/router/pm-ack-dispatch.ts`. Indexes the manifest registry directly via `getPMProvider(pmType).platformClientFactory(projectId).postComment(...)`. Zero per-PM-type literal branching anywhere on the dispatch surface. Adding a future PM provider lands the dispatch path for free. 2. Both legacy call sites delegate: - `src/router/adapters/github.ts:postPMAck` (the buggy one) - `src/triggers/shared/pm-ack.ts:postPMAckComment` (had Linear; preserves the existing `string | null` return contract via String() normalization) 3. The "Unknown PM type" branch converts from silent WARN+skip to ERROR-level log + Sentry capture under stable tag `pm_ack_unknown_pm_type`. Mirrors the spec-015 `wedged_lock_canary` precedent. 
Once the consolidation ships, hitting that branch represents a real configuration error (project pinned to a deleted provider), not steady-state noise. Regression nets: - Per-provider conformance assertion: the existing PM manifest harness gains one new `it()` inside its `describe.each` block — `dispatchPMAck reaches this provider without throwing`. Adding a future provider whose `platformClientFactory` is misconfigured fails CI loudly. - Static guard against future literal-branching drift: `tests/unit/router/pm-ack-dispatch.test.ts` reads each of the three call sites' source and asserts no `pmType === 'trello' | 'jira' | 'linear'` patterns appear within their bodies. Modeled on `trigger-event-consistency.test.ts`. A future maintainer who hand-codes a branch fails this guard with a precise file:line citation. Test coverage: +6 tests on `dispatchPMAck` (Trello/JIRA/Linear happy paths, null-from-postComment, unknown-pmType Sentry path, undefined-pmType same), +4 on `postPMAckComment` delegation contract, +1 per-provider assertion in the conformance harness (4 currently-registered providers including the test fixture), +1 Linear regression pin on the github router adapter's `postAck` public surface, plus 2 existing github adapter tests migrated from mocking `postTrelloAck` to mocking `dispatchPMAck` (the dependency direction changed). Full suite: 472 files / 8668 tests (+19 from baseline). Doc updates: - `src/integrations/README.md` gains a "PM-ack dispatch coverage invariant" subsection in the conformance harness section, documenting the no-literal- branching rule, the Sentry-tag escalation, and pointing at both regression nets. - `CHANGELOG.md` entry under spec 017. Co-Authored-By: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 1 + ...erage.md.wip => 1-pm-ack-coverage.md.done} | 24 +- src/integrations/README.md | 14 +- src/router/adapters/github.ts | 29 +-- src/router/pm-ack-dispatch.ts | 69 ++++++ src/triggers/shared/pm-ack.ts | 36 +-- .../unit/integrations/pm-conformance.test.ts | 36 +++ tests/unit/router/adapters/github.test.ts | 81 ++++++- tests/unit/router/pm-ack-dispatch.test.ts | 213 ++++++++++++++++++ tests/unit/triggers/shared/pm-ack.test.ts | 61 +++++ 10 files changed, 502 insertions(+), 62 deletions(-) rename docs/plans/017-router-silent-failure-hardening/{1-pm-ack-coverage.md.wip => 1-pm-ack-coverage.md.done} (90%) create mode 100644 src/router/pm-ack-dispatch.ts create mode 100644 tests/unit/router/pm-ack-dispatch.test.ts create mode 100644 tests/unit/triggers/shared/pm-ack.test.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index f016d67f..c3723177 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ All notable user-visible changes to CASCADE are documented here. The format is l ### Changed +- **PM-ack dispatch consolidation: Linear-based PM-focused agents now post their PM-side ack comment** (spec 017, plan 1 of 3). PM-focused agents (e.g. `backlog-manager`) triggered from a GitHub webhook used to silently skip their PM-side ack on Linear projects: the router-adapter's local `postPMAck` helper had `if (pmType === 'trello')` / `if (pmType === 'jira')` branches but no Linear branch, so Linear-based projects fell through to a `WARN: Unknown PM type for PM-focused agent ack, skipping` and never saw the "🔧 On it" comment that Trello/JIRA projects got (24 silent skips per day on cascade-router, all from `ucho`, verified 2026-04-29). A near-identical helper at `src/triggers/shared/pm-ack.ts` already had the Linear branch — pure parallel-path drift. 
The fix introduces a single consolidated helper `dispatchPMAck` at `src/router/pm-ack-dispatch.ts` that indexes the manifest registry directly and invokes `manifest.platformClientFactory(projectId).postComment(...)` — no per-PM-type literal branching anywhere on the dispatch surface. Both legacy call sites delegate. The PM manifest conformance harness gains a per-provider `dispatchPMAck reaches this provider without throwing` assertion, and a static-guard test pins "no `pmType === ''` branching" against all three call sites; adding a future PM provider to the registry lands the dispatch path for free. Genuinely-unknown PM types (configuration error: project pinned to a deleted provider) now log at ERROR + capture to Sentry under stable tag `pm_ack_unknown_pm_type` instead of a silent WARN. See [spec 017](docs/specs/017-router-silent-failure-hardening.md). - **Progress-comment lifecycle: post-agent cleanup hook now skips when an in-run gadget already deleted the comment** (spec 017, plan 3 of 3). The post-agent `deleteProgressCommentOnSuccess` hook used to read `sessionState.initialCommentId`, fall back to `result.agentInput.ackCommentId` when session state was empty, and issue a redundant DELETE — but "session state cleared by a gadget" was indistinguishable from "session state never populated", so the fallback fired and re-deleted comments that were already gone. GitHub returned 404 and `WARN: Failed to delete progress comment after agent success` was logged 72 times per day on cascade-router (live audit on 2026-04-29). Adds an explicit `initialCommentIdConsumed: boolean` flag on `SessionStateData`. Both `deleteInitialComment` (gadget-driven) and `clearInitialComment` (sidecar-driven) now set the flag to `true` after disposing of the comment. The post-agent hook checks the flag first and skips the entire deletion path — including the legacy `agentInput.ackCommentId` fallback — when consumed. As defense in depth, `githubClient.deletePRComment` now treats HTTP 404 as success (RFC-7231 idempotency) and logs at DEBUG instead of letting the error bubble as a WARN; other HTTP errors (5xx, 401, network) continue to throw. The legacy fallback to `agentInput.ackCommentId` continues to work for code paths that never populate session state. See [spec 017](docs/specs/017-router-silent-failure-hardening.md). - **PM image delivery: Linear GraphQL fixture + extraction-coverage regression test** (spec 016, plan 3 of 3). Captures a reconstructed Linear `Issue` GraphQL payload at `tests/fixtures/linear-issue-with-screenshot.json` containing extension-less and extensioned inline-pasted images (description + comment bodies) plus formal Attachment records (Slack/GitHub/Sentry link previews) that must NOT be mistaken for inline images. The unit test at `tests/unit/pm/linear/extraction-coverage.test.ts` pins the contract and fails loudly with a specific URL-missing message if Linear ever changes its payload shape in a way that loses inline images. Documents the conclusion in `src/integrations/README.md`: `Issue.description` markdown is canonical for Linear inline images; `Issue.attachments` is the wrong surface (formal Attachment records, not pastes). No production code change — this plan ships the regression net for the contract Plans 1+2 established. See [spec 016](docs/specs/016-pm-image-delivery-reliability.md). - **PM image delivery: runtime `cascade-tools pm read-work-item` gadget now delivers images on disk** (spec 016, plan 2 of 3). 
The runtime gadget that agents call mid-run for a work item used to return text only — its "Pre-fetched Images" section listed URL refs but no local file paths, so an agent that needed to re-read a work item (e.g. after a teammate added a screenshot) had no way to actually see the new image. After this plan, the gadget downloads any image media present and writes it to `.cascade/context/images/work-item--img-.` (extension derived from the resolved Content-Type MIME), then returns text whose new "Local Image Files" section lists actual file paths the agent's file-read tool can consume. Failed downloads are surfaced in a "Failed Image Downloads" subsection so they're never silently dropped. Same diagnostic log line as the boot path (`[image-pipeline] work-item-fetch summary`) — operators see consistent shape across boot and runtime fetches. Closes the mid-run pickup gap. See [spec 016](docs/specs/016-pm-image-delivery-reliability.md). diff --git a/docs/plans/017-router-silent-failure-hardening/1-pm-ack-coverage.md.wip b/docs/plans/017-router-silent-failure-hardening/1-pm-ack-coverage.md.done similarity index 90% rename from docs/plans/017-router-silent-failure-hardening/1-pm-ack-coverage.md.wip rename to docs/plans/017-router-silent-failure-hardening/1-pm-ack-coverage.md.done index daa21b7f..89eb435f 100644 --- a/docs/plans/017-router-silent-failure-hardening/1-pm-ack-coverage.md.wip +++ b/docs/plans/017-router-silent-failure-hardening/1-pm-ack-coverage.md.done @@ -6,7 +6,7 @@ plan_slug: pm-ack-coverage level: plan parent_spec: docs/specs/017-router-silent-failure-hardening.md depends_on: [] -status: wip +status: done --- # 017/1: PM-ack dispatch coverage (Linear branch missing) @@ -197,14 +197,14 @@ n/a — all ACs auto-tested. ## Progress -- [ ] AC #1 -- [ ] AC #2 -- [ ] AC #3 -- [ ] AC #4 -- [ ] AC #5 -- [ ] AC #6 -- [ ] AC #7 -- [ ] AC #8 -- [ ] AC #9 -- [ ] AC #10 -- [ ] AC #11 +- [x] AC #1 — `dispatchPMAck` exists at `src/router/pm-ack-dispatch.ts`, indexes the manifest registry, no `pmType` literal branching (verified by `tests/unit/router/pm-ack-dispatch.test.ts > dispatchPMAck` happy-paths for all three PM types + the static no-literal-branching guard). +- [x] AC #2 — `postPMAck` in `src/router/adapters/github.ts` delegates to `dispatchPMAck`; static guard rejects literal branching (verified by the static guard test in `pm-ack-dispatch.test.ts`). +- [x] AC #3 — `postPMAckComment` in `src/triggers/shared/pm-ack.ts` delegates to `dispatchPMAck` and preserves the `string | null` return contract (verified by `tests/unit/triggers/shared/pm-ack.test.ts`). +- [x] AC #4 — Linear-based PM-focused agents reach the dispatch helper (verified by the new `routes ack to PM tool for Linear-based projects (spec 017 plan 1 regression pin)` test in `tests/unit/router/adapters/github.test.ts` + the pm-ack-dispatch Linear happy-path). +- [x] AC #5 — Unknown PM types log ERROR + Sentry capture under `pm_ack_unknown_pm_type` (verified by the dispatch helper's "unknown pmType" + "undefined pmType" tests). +- [x] AC #6 — Conformance harness asserts dispatch coverage per registered provider (verified by the new per-provider `dispatchPMAck reaches this provider without throwing` assertion in `pm-conformance.test.ts`'s `describe.each` block). +- [x] AC #7 — `src/integrations/README.md` documents the dispatch coverage invariant (new "PM-ack dispatch coverage invariant (spec 017 plan 1)" subsection added). 
+- [x] AC #8 — All new/modified code has tests (+6 in pm-ack-dispatch, +4 in pm-ack, +1 per-provider in conformance harness, +1 Linear regression pin in github adapter test, plus 2 existing github adapter tests migrated from postTrelloAck to dispatchPMAck mocking). +- [x] AC #9 — `npm run typecheck` passes. +- [x] AC #10 — `npm test` (full unit suite) passes (472 files / 8668 tests, +19 from baseline). +- [x] AC #11 — `npm run lint` passes (Biome clean). diff --git a/src/integrations/README.md b/src/integrations/README.md index b037bf37..55b9d947 100644 --- a/src/integrations/README.md +++ b/src/integrations/README.md @@ -152,8 +152,20 @@ All fields are optional; legacy manifests that don't declare them skip the corre - `triggerHandlers` have unique names - `platformClientFactory(projectId)` returns an object with `postComment` + `deleteComment` - `pmIntegration.type` is wired +- `dispatchPMAck` (the consolidated PM-ack helper at `src/router/pm-ack-dispatch.ts`) reaches this provider without throwing — pinned by the per-provider assertion added in spec 017 plan 1 -A `TestProvider` fixture in `tests/helpers/testPMProvider.ts` is the minimal reference implementation — copy its shape when starting a new provider. The harness runs against TestProvider + Trello + JIRA + Linear (44 assertions total). +A `TestProvider` fixture in `tests/helpers/testPMProvider.ts` is the minimal reference implementation — copy its shape when starting a new provider. The harness runs against TestProvider + Trello + JIRA + Linear. + +### PM-ack dispatch coverage invariant (spec 017 plan 1) + +Router-side PM acknowledgment posting (the comment that says "🔧 On it" on the PM card when a PM-focused agent like `backlog-manager` starts work, triggered from a GitHub webhook) goes through **one** code path: `dispatchPMAck` in `src/router/pm-ack-dispatch.ts`. That helper looks up the provider in the manifest registry and invokes `manifest.platformClientFactory(projectId).postComment(workItemId, message)` directly — **no `pmType` literal branching anywhere on the dispatch surface**. + +The consolidation closed a parallel-path drift incident verified live on 2026-04-29 (`ucho`): the router-adapter's local helper had Trello + JIRA branches but no Linear branch, so PM-focused agents triggered against Linear-based projects silently skipped their ack with `WARN: Unknown PM type for PM-focused agent ack, skipping` (24× per day in prod). A sibling helper at `src/triggers/shared/pm-ack.ts` had all three branches; both now delegate to `dispatchPMAck`. + +A new PM provider lands the dispatch path **for free** the moment its manifest is registered — no edits to `pm-ack-dispatch.ts` or to either of the call sites. Failure modes: +- Provider's `platformClientFactory` returns a client whose `postComment` throws → conformance harness's `dispatchPMAck reaches this provider without throwing` assertion fails in CI with a precise per-provider message. +- A future maintainer adds `if (pmType === 'asana')` branching to either call site → the static guard at `tests/unit/router/pm-ack-dispatch.test.ts` (PM-ack dispatch surface: no literal pm-type branching) fails loudly with a file:line citation. +- Project pinned to a `pm.type` that's no longer in the registry (configuration error) → `dispatchPMAck` logs at ERROR + captures Sentry under tag `pm_ack_unknown_pm_type` (no longer a silent WARN). 
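To make the registration-is-enough claim concrete, a hypothetical provider might be wired as in the sketch below. The manifest shape is abbreviated to the two members the dispatch path actually touches; `asana`, the interface names, and the `deleteComment` signature are assumptions for illustration, not the project's real types.

interface PlatformCommentClient {
  postComment(workItemId: string, message: string): Promise<string | null>;
  deleteComment(workItemId: string, commentId: string): Promise<void>;
}

interface PMProviderManifest {
  id: string;
  platformClientFactory(projectId: string): PlatformCommentClient;
}

// Registering the manifest is the whole ack integration surface: dispatchPMAck,
// both call sites, and the conformance harness pick the provider up without edits.
const asanaManifest: PMProviderManifest = {
  id: 'asana',
  platformClientFactory: (projectId: string) => ({
    postComment: async (workItemId, message) => {
      // Call the provider's comment API with workItemId and message here;
      // return the new comment id, or null on failure.
      return null;
    },
    deleteComment: async () => {},
  }),
};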
### Provider migration status (plan 009 — PM integration hardening) diff --git a/src/router/adapters/github.ts b/src/router/adapters/github.ts index e187f5cb..303b9ef8 100644 --- a/src/router/adapters/github.ts +++ b/src/router/adapters/github.ts @@ -24,15 +24,11 @@ import type { ProjectConfig, TriggerContext, TriggerResult } from '../../types/i import { logger } from '../../utils/logging.js'; import { buildWorkItemRunsLink, getDashboardUrl } from '../../utils/runLink.js'; import { extractGitHubContext, generateAckMessage } from '../ackMessageGenerator.js'; -import { - postGitHubAck, - postJiraAck, - postTrelloAck, - resolveGitHubTokenForAckByAgent, -} from '../acknowledgments.js'; +import { postGitHubAck, resolveGitHubTokenForAckByAgent } from '../acknowledgments.js'; import { loadProjectConfig, type RouterProjectConfig } from '../config.js'; import { extractPRNumber } from '../notifications.js'; import type { AckResult, ParsedWebhookEvent, RouterPlatformAdapter } from '../platform-adapter.js'; +import { dispatchPMAck } from '../pm-ack-dispatch.js'; import { addEyesReactionToPR } from '../pre-actions.js'; import type { CascadeJob, GitHubJob } from '../queue.js'; import { sendAcknowledgeReaction } from '../reactions.js'; @@ -42,8 +38,11 @@ import { sendAcknowledgeReaction } from '../reactions.js'; // --------------------------------------------------------------------------- /** - * Post an acknowledgment comment to the PM tool (Trello/JIRA) for PM-focused agents. - * Returns an AckResult with the comment ID and message, or undefined on failure. + * Post an acknowledgment comment to the PM tool for PM-focused agents. + * + * Delegates to the consolidated `dispatchPMAck` helper which indexes the + * manifest registry — no per-PM-type literal branching. See spec 017, + * failure mode A. */ async function postPMAck( projectId: string, @@ -52,17 +51,9 @@ async function postPMAck( agentType: string, message: string, ): Promise { - let commentId: string | null = null; - if (pmType === 'trello') { - commentId = await postTrelloAck(projectId, workItemId, message); - } else if (pmType === 'jira') { - commentId = await postJiraAck(projectId, workItemId, message); - } else { - logger.warn('Unknown PM type for PM-focused agent ack, skipping', { agentType, pmType }); - return undefined; - } - if (commentId) return { commentId, message }; - return undefined; + const result = await dispatchPMAck({ projectId, workItemId, pmType, message, agentType }); + if (!result) return undefined; + return { commentId: result.commentId, message: result.message }; } /** diff --git a/src/router/pm-ack-dispatch.ts b/src/router/pm-ack-dispatch.ts new file mode 100644 index 00000000..ec9e88fb --- /dev/null +++ b/src/router/pm-ack-dispatch.ts @@ -0,0 +1,69 @@ +/** + * Consolidated PM-ack dispatch helper. + * + * Replaces the parallel-path drift between two near-identical helpers: + * + * - `src/router/adapters/github.ts:postPMAck` (had Trello + JIRA only; + * silently skipped Linear, + * producing 24 WARN/day on + * prod cascade-router) + * - `src/triggers/shared/pm-ack.ts:postPMAckComment` (had all three branches) + * + * Both call sites now delegate to this helper. The helper indexes the manifest + * registry directly via `getPMProvider(pmType).platformClientFactory(projectId)` + * — no per-PM-type literal branching. Adding a future PM provider to the + * registry is automatically reachable from the dispatch path; the conformance + * harness asserts this on every CI run. 
+ *
+ * On a genuinely-unknown PM type (project pinned to a deleted provider, or a
+ * configuration error), the helper logs at ERROR and captures to Sentry under
+ * the stable tag `pm_ack_unknown_pm_type` — silent warn-and-skip is removed.
+ *
+ * See spec 017 (router-side silent-failure hardening), failure mode A.
+ */
+
+import { getPMProvider } from '../integrations/pm/registry.js';
+import { captureException } from '../sentry.js';
+import { logger } from '../utils/logging.js';
+
+export interface DispatchPMAckArgs {
+  projectId: string;
+  workItemId: string;
+  pmType: string | undefined;
+  message: string;
+  agentType?: string;
+}
+
+export interface PMAckResult {
+  commentId: string | number;
+  message: string;
+}
+
+/**
+ * Post a PM-side acknowledgment comment via the manifest registry.
+ *
+ * Returns `{ commentId, message }` on success, or `undefined` when:
+ * - the underlying `platformClientFactory.postComment` returned `null`
+ *   (existing failure-shape contract on `PlatformCommentClient`), OR
+ * - the `pmType` is not registered in the manifest registry (logs ERROR
+ *   + captures Sentry under tag `pm_ack_unknown_pm_type`).
+ */
+export async function dispatchPMAck(args: DispatchPMAckArgs): Promise<PMAckResult | undefined> {
+  const { projectId, workItemId, pmType, message, agentType } = args;
+
+  const manifest = pmType ? getPMProvider(pmType) : null;
+  if (!manifest) {
+    const err = new Error('Unknown PM type for PM-focused agent ack');
+    logger.error('Unknown PM type for PM-focused agent ack', { pmType, agentType, projectId });
+    captureException(err, {
+      tags: { source: 'pm_ack_unknown_pm_type' },
+      extra: { pmType, agentType, projectId, workItemId },
+    });
+    return undefined;
+  }
+
+  const client = manifest.platformClientFactory(projectId);
+  const commentId = await client.postComment(workItemId, message);
+  if (commentId == null) return undefined;
+  return { commentId, message };
+}
diff --git a/src/triggers/shared/pm-ack.ts b/src/triggers/shared/pm-ack.ts
index 34db80e1..4cb04bf7 100644
--- a/src/triggers/shared/pm-ack.ts
+++ b/src/triggers/shared/pm-ack.ts
@@ -8,18 +8,20 @@
  * Used by:
  * - Worker-side: `triggers/github/webhook-handler.ts` (maybePostPmAckComment)
  *
- * Note: `router/adapters/github.ts` has its own local `postPMAck` function
- * and does not use this shared utility.
+ * After spec 017 / plan 1, this delegates to `dispatchPMAck` in
+ * `src/router/pm-ack-dispatch.ts` — the single source of truth for PM-ack
+ * dispatch. No per-PM-type literal branching here. The legacy `string | null`
+ * return contract is preserved for the existing call site in
+ * `src/triggers/github/webhook-handler.ts:maybePostPmAckComment`.
  */
-import { postJiraAck, postLinearAck, postTrelloAck } from '../../router/acknowledgments.js';
-import { logger } from '../../utils/logging.js';
+import { dispatchPMAck } from '../../router/pm-ack-dispatch.js';

 /**
- * Post a PM acknowledgment comment to Trello, JIRA, or Linear.
+ * Post a PM acknowledgment comment via the consolidated dispatch helper.
  *
- * Returns the comment ID if successfully posted, or null if the PM type
- * is not supported or posting failed.
+ * Returns the comment ID as a string if successfully posted, or `null` if
+ * the PM type is not supported or posting failed.
  *
  * @param projectId The project ID for credential resolution.
  * @param workItemId The work item ID to post the comment on (card ID / issue key).
@@ -34,21 +36,7 @@ export async function postPMAckComment( message: string, agentType?: string, ): Promise { - if (pmType === 'trello') { - return postTrelloAck(projectId, workItemId, message); - } - - if (pmType === 'jira') { - return postJiraAck(projectId, workItemId, message); - } - - if (pmType === 'linear') { - return postLinearAck(projectId, workItemId, message); - } - - logger.warn('Unknown PM type for PM-focused agent ack, skipping', { - agentType, - pmType, - }); - return null; + const result = await dispatchPMAck({ projectId, workItemId, pmType, message, agentType }); + if (!result) return null; + return String(result.commentId); } diff --git a/tests/unit/integrations/pm-conformance.test.ts b/tests/unit/integrations/pm-conformance.test.ts index 5e2dbe1a..5b5534b7 100644 --- a/tests/unit/integrations/pm-conformance.test.ts +++ b/tests/unit/integrations/pm-conformance.test.ts @@ -154,6 +154,42 @@ describe('PM provider conformance (every registered provider)', () => { expect(typeof client.deleteComment).toBe('function'); }); + // Spec 017 / plan 1: every registered manifest must be reachable from + // the consolidated PM-ack dispatch helper. Adding a future provider + // without a working `platformClientFactory` produces a CI failure here. + // This is the regression net for failure mode A from the 2026-04-29 + // audit, where Linear-based PM-focused agents silently skipped their + // PM-side ack because the github router adapter's local `postPMAck` + // helper had only Trello + JIRA branches (no `linear`). + // + // Strategy: dispatch against the real registry. `postComment` may + // resolve credentials and return null when none are configured (the + // expected outcome in this test environment). What is NOT acceptable + // is the dispatch helper throwing or hitting its unknown-PM-type + // Sentry path — both indicate the provider is unreachable from the + // dispatch surface. We therefore assert the result is either undefined + // or has the AckResult shape, and that no exception propagates. + it('dispatchPMAck reaches this provider without throwing', async () => { + const { dispatchPMAck } = await import('../../../src/router/pm-ack-dispatch.js'); + + const result = await dispatchPMAck({ + projectId: 'proj-conformance', + workItemId: 'item-conformance', + pmType: id, + message: 'conformance check', + agentType: 'backlog-manager', + }); + + if (result !== undefined) { + expect(result).toEqual( + expect.objectContaining({ + commentId: expect.anything(), + message: 'conformance check', + }), + ); + } + }); + it('pmIntegration is wired (type matches id)', () => { // Confirms the manifest plumbs the PMIntegration. Actual behavior of // parseWebhookPayload on the integration is tested per-provider; the diff --git a/tests/unit/router/adapters/github.test.ts b/tests/unit/router/adapters/github.test.ts index a7a92407..40d36adb 100644 --- a/tests/unit/router/adapters/github.test.ts +++ b/tests/unit/router/adapters/github.test.ts @@ -22,6 +22,13 @@ vi.mock('../../../../src/router/acknowledgments.js', () => ({ postJiraAck: vi.fn(), resolveGitHubTokenForAckByAgent: vi.fn(), })); +// Spec 017 / plan 1: postPMAck now delegates to dispatchPMAck (the consolidated +// helper that consumes the manifest registry). Mock at the dispatch level so +// the test exercises the github router adapter's wiring without pulling the +// real provider registry into the assertion. 
+vi.mock('../../../../src/router/pm-ack-dispatch.js', () => ({ + dispatchPMAck: vi.fn(), +})); vi.mock('../../../../src/agents/definitions/loader.js', () => ({ isPMFocusedAgent: vi.fn().mockResolvedValue(false), @@ -86,6 +93,7 @@ import { GitHubRouterAdapter, injectEventType } from '../../../../src/router/ada import type { RouterProjectConfig } from '../../../../src/router/config.js'; import { loadProjectConfig } from '../../../../src/router/config.js'; import { extractPRNumber } from '../../../../src/router/notifications.js'; +import { dispatchPMAck } from '../../../../src/router/pm-ack-dispatch.js'; import { addEyesReactionToPR } from '../../../../src/router/pre-actions.js'; import type { GitHubJob } from '../../../../src/router/queue.js'; import { sendAcknowledgeReaction } from '../../../../src/router/reactions.js'; @@ -366,9 +374,12 @@ describe('GitHubRouterAdapter', () => { expect(ackResult?.message).toBe('Starting implementation...'); }); - it('routes ack to PM tool (Trello) for PM-focused agents (backlog-manager)', async () => { + it('routes ack to PM tool for PM-focused agents (backlog-manager)', async () => { vi.mocked(isPMFocusedAgent).mockResolvedValue(true); - vi.mocked(postTrelloAck).mockResolvedValue('trello-comment-id'); + vi.mocked(dispatchPMAck).mockResolvedValue({ + commentId: 'pm-comment-id', + message: 'Starting...', + }); const ackResult = await adapter.postAck( { @@ -385,14 +396,25 @@ describe('GitHubRouterAdapter', () => { { agentType: 'backlog-manager', agentInput: {}, workItemId: 'card-123' }, ); - expect(postTrelloAck).toHaveBeenCalledWith('p1', 'card-123', expect.any(String)); + expect(dispatchPMAck).toHaveBeenCalledWith( + expect.objectContaining({ + projectId: 'p1', + workItemId: 'card-123', + pmType: 'trello', + agentType: 'backlog-manager', + message: expect.any(String), + }), + ); expect(postGitHubAck).not.toHaveBeenCalled(); - expect(ackResult?.commentId).toBe('trello-comment-id'); + expect(ackResult?.commentId).toBe('pm-comment-id'); }); it('uses triggerResult.workItemId over event.workItemId for PM-focused agents', async () => { vi.mocked(isPMFocusedAgent).mockResolvedValue(true); - vi.mocked(postTrelloAck).mockResolvedValue('comment-from-trigger'); + vi.mocked(dispatchPMAck).mockResolvedValue({ + commentId: 'comment-from-trigger', + message: 'Starting...', + }); await adapter.postAck( { @@ -409,7 +431,54 @@ describe('GitHubRouterAdapter', () => { { agentType: 'backlog-manager', agentInput: {}, workItemId: 'trigger-card-id' }, ); - expect(postTrelloAck).toHaveBeenCalledWith('p1', 'trigger-card-id', expect.any(String)); + expect(dispatchPMAck).toHaveBeenCalledWith( + expect.objectContaining({ + projectId: 'p1', + workItemId: 'trigger-card-id', + }), + ); + }); + + it('routes ack to PM tool for Linear-based projects (spec 017 plan 1 regression pin)', async () => { + // Failure mode A from spec 017's 2026-04-29 audit: Linear-based + // projects had no PM ack posted because `postPMAck` only handled + // Trello + JIRA. Now `postPMAck` delegates to `dispatchPMAck` which + // indexes the manifest registry — Linear is reachable for free. 
+ vi.mocked(isPMFocusedAgent).mockResolvedValue(true); + vi.mocked(dispatchPMAck).mockResolvedValue({ + commentId: 'linear-comment-uuid', + message: 'Starting...', + }); + + const linearProject: RouterProjectConfig = { + id: 'ucho', + repo: 'org/ucho', + pmType: 'linear', + }; + + const ackResult = await adapter.postAck( + { + projectIdentifier: 'org/ucho', + eventType: 'pull_request', + workItemId: 'MNG-100', + isCommentEvent: false, + // @ts-expect-error extended field + repoFullName: 'org/ucho', + }, + {}, + linearProject, + 'backlog-manager', + { agentType: 'backlog-manager', agentInput: {}, workItemId: 'MNG-100' }, + ); + + expect(dispatchPMAck).toHaveBeenCalledWith( + expect.objectContaining({ + projectId: 'ucho', + workItemId: 'MNG-100', + pmType: 'linear', + }), + ); + expect(ackResult?.commentId).toBe('linear-comment-uuid'); }); it('uses triggerResult.workItemId over event.workItemId for GitHub PR run link', async () => { diff --git a/tests/unit/router/pm-ack-dispatch.test.ts b/tests/unit/router/pm-ack-dispatch.test.ts new file mode 100644 index 00000000..57650a1b --- /dev/null +++ b/tests/unit/router/pm-ack-dispatch.test.ts @@ -0,0 +1,213 @@ +/** + * Tests for the consolidated PM-ack dispatch helper at + * `src/router/pm-ack-dispatch.ts`. Replaces the parallel-path drift between + * `src/router/adapters/github.ts:postPMAck` (which lacked the Linear branch + * and silently skipped Linear-based projects) and + * `src/triggers/shared/pm-ack.ts:postPMAckComment`. The new helper indexes + * the manifest registry directly — no per-PM-type literal branching — so + * adding a future provider to the registry is automatically reachable from + * the dispatch path. + */ +import { readFileSync } from 'node:fs'; +import { join } from 'node:path'; +import { beforeEach, describe, expect, it, vi } from 'vitest'; + +const { mockGetPMProvider, mockCaptureException } = vi.hoisted(() => ({ + mockGetPMProvider: vi.fn(), + mockCaptureException: vi.fn(), +})); + +vi.mock('../../../src/integrations/pm/registry.js', () => ({ + getPMProvider: mockGetPMProvider, + listPMProviders: () => [], +})); + +vi.mock('../../../src/sentry.js', () => ({ + captureException: mockCaptureException, +})); + +vi.mock('../../../src/utils/logging.js', () => ({ + logger: { + debug: vi.fn(), + info: vi.fn(), + warn: vi.fn(), + error: vi.fn(), + }, +})); + +import { dispatchPMAck } from '../../../src/router/pm-ack-dispatch.js'; +import { logger } from '../../../src/utils/logging.js'; + +const mockLoggerError = vi.mocked(logger.error); +const mockLoggerWarn = vi.mocked(logger.warn); + +function makeManifest(id: string, postCommentImpl: () => Promise) { + return { + id, + platformClientFactory: vi.fn(() => ({ + postComment: vi.fn(postCommentImpl), + deleteComment: vi.fn(), + })), + }; +} + +describe('dispatchPMAck', () => { + beforeEach(() => { + vi.resetAllMocks(); + }); + + it('Trello: invokes platformClientFactory(projectId).postComment(workItemId, message) and returns AckResult', async () => { + const manifest = makeManifest('trello', async () => 'comment-trello-123'); + mockGetPMProvider.mockReturnValue(manifest); + + const result = await dispatchPMAck({ + projectId: 'proj-1', + workItemId: 'card-abc', + pmType: 'trello', + message: 'Working on it', + agentType: 'backlog-manager', + }); + + expect(mockGetPMProvider).toHaveBeenCalledWith('trello'); + expect(manifest.platformClientFactory).toHaveBeenCalledWith('proj-1'); + expect(result).toEqual({ commentId: 'comment-trello-123', message: 'Working on it' }); + }); + + it('JIRA: 
same shape, returns the id from platformClientFactory', async () => { + const manifest = makeManifest('jira', async () => 'jira-789'); + mockGetPMProvider.mockReturnValue(manifest); + + const result = await dispatchPMAck({ + projectId: 'proj-2', + workItemId: 'PROJ-1', + pmType: 'jira', + message: 'Working on it', + }); + + expect(result).toEqual({ commentId: 'jira-789', message: 'Working on it' }); + }); + + it('Linear: same shape (the failure mode A regression pin)', async () => { + // This is the assertion that today's broken `postPMAck` in github.ts + // fails for Linear-based projects. After consolidation, Linear is + // reachable through the same dispatch path as Trello/JIRA. + const manifest = makeManifest('linear', async () => 'linear-id-uuid'); + mockGetPMProvider.mockReturnValue(manifest); + + const result = await dispatchPMAck({ + projectId: 'ucho', + workItemId: 'MNG-100', + pmType: 'linear', + message: 'On it', + agentType: 'backlog-manager', + }); + + expect(result).toEqual({ commentId: 'linear-id-uuid', message: 'On it' }); + }); + + it('returns undefined when platformClientFactory.postComment returns null', async () => { + const manifest = makeManifest('trello', async () => null); + mockGetPMProvider.mockReturnValue(manifest); + + const result = await dispatchPMAck({ + projectId: 'proj-1', + workItemId: 'card-abc', + pmType: 'trello', + message: 'msg', + }); + + expect(result).toBeUndefined(); + }); + + it('unknown pmType (not in registry): logs at ERROR, captures Sentry under tag pm_ack_unknown_pm_type, returns undefined', async () => { + mockGetPMProvider.mockReturnValue(null); // not registered + + const result = await dispatchPMAck({ + projectId: 'proj-x', + workItemId: 'item', + pmType: 'asana', + message: 'msg', + agentType: 'backlog-manager', + }); + + expect(result).toBeUndefined(); + expect(mockLoggerError).toHaveBeenCalledWith( + expect.stringMatching(/Unknown PM type for PM-focused agent ack/i), + expect.objectContaining({ pmType: 'asana', agentType: 'backlog-manager' }), + ); + expect(mockLoggerWarn).not.toHaveBeenCalled(); + expect(mockCaptureException).toHaveBeenCalledWith( + expect.any(Error), + expect.objectContaining({ + tags: expect.objectContaining({ source: 'pm_ack_unknown_pm_type' }), + extra: expect.objectContaining({ pmType: 'asana', agentType: 'backlog-manager' }), + }), + ); + }); + + it('undefined pmType (project not configured): same Sentry-captured error path', async () => { + mockGetPMProvider.mockReturnValue(null); + + const result = await dispatchPMAck({ + projectId: 'proj-x', + workItemId: 'item', + pmType: undefined, + message: 'msg', + }); + + expect(result).toBeUndefined(); + expect(mockCaptureException).toHaveBeenCalledWith( + expect.any(Error), + expect.objectContaining({ + tags: expect.objectContaining({ source: 'pm_ack_unknown_pm_type' }), + }), + ); + }); +}); + +// --------------------------------------------------------------------------- +// Static guard: no PM-type literal branching on the consolidated dispatch path. +// +// The structural invariant of plan 017/1: dispatch consumes the manifest +// registry directly. A future maintainer who adds `if (pmType === 'asana')` +// branch should fail this guard. Modeled on the trigger-event-consistency.ts +// pattern. 
+// --------------------------------------------------------------------------- + +describe('PM-ack dispatch surface: no literal pm-type branching (regression guard)', () => { + const root = join(__dirname, '..', '..', '..', 'src'); + + const surfaces = [ + { file: 'router/pm-ack-dispatch.ts', label: 'dispatchPMAck helper' }, + { file: 'router/adapters/github.ts', label: 'GitHub router adapter postPMAck' }, + { file: 'triggers/shared/pm-ack.ts', label: 'shared postPMAckComment' }, + ]; + + for (const { file, label } of surfaces) { + it(`${file} does not branch on pmType literal strings (${label})`, () => { + const path = join(root, file); + const src = readFileSync(path, 'utf-8'); + + // Match patterns like `pmType === 'trello'` / `pmType === "jira"` / + // case-style comparisons. Whitelist comments so doc references can + // still mention PM type names. + const codeOnly = src + .split('\n') + .filter((line) => !line.trim().startsWith('//') && !line.trim().startsWith('*')) + .join('\n'); + + const violations: string[] = []; + for (const pmType of ['trello', 'jira', 'linear']) { + const re = new RegExp(`pmType\\s*===?\\s*['"\`]${pmType}['"\`]`, 'g'); + const matches = codeOnly.match(re); + if (matches) violations.push(`pmType === '${pmType}' (${matches.length} occurrence(s))`); + } + + expect( + violations, + `${file} contains pmType literal branching: ${violations.join(', ')}. ` + + `The consolidated dispatch path must index the manifest registry, not branch on literal pm-type strings.`, + ).toEqual([]); + }); + } +}); diff --git a/tests/unit/triggers/shared/pm-ack.test.ts b/tests/unit/triggers/shared/pm-ack.test.ts new file mode 100644 index 00000000..6a3a5c3a --- /dev/null +++ b/tests/unit/triggers/shared/pm-ack.test.ts @@ -0,0 +1,61 @@ +/** + * Tests for the shared `postPMAckComment` helper. After plan 017/1, it + * delegates to `dispatchPMAck` from `src/router/pm-ack-dispatch.ts` and + * preserves its existing `string | null` return contract for backward + * compatibility with `src/triggers/github/webhook-handler.ts:maybePostPmAckComment`. 
+ */ +import { beforeEach, describe, expect, it, vi } from 'vitest'; + +const { mockDispatchPMAck } = vi.hoisted(() => ({ + mockDispatchPMAck: vi.fn(), +})); + +vi.mock('../../../../src/router/pm-ack-dispatch.js', () => ({ + dispatchPMAck: mockDispatchPMAck, +})); + +import { postPMAckComment } from '../../../../src/triggers/shared/pm-ack.js'; + +describe('postPMAckComment (shared)', () => { + beforeEach(() => { + vi.resetAllMocks(); + }); + + it('delegates to dispatchPMAck with the same arg shape', async () => { + mockDispatchPMAck.mockResolvedValue({ commentId: 'abc', message: 'msg' }); + + await postPMAckComment('proj-1', 'item-1', 'linear', 'msg', 'backlog-manager'); + + expect(mockDispatchPMAck).toHaveBeenCalledWith({ + projectId: 'proj-1', + workItemId: 'item-1', + pmType: 'linear', + message: 'msg', + agentType: 'backlog-manager', + }); + }); + + it('returns the unwrapped commentId as a string when dispatch succeeds (string id)', async () => { + mockDispatchPMAck.mockResolvedValue({ commentId: 'comment-123', message: 'msg' }); + + const result = await postPMAckComment('proj-1', 'item-1', 'trello', 'msg'); + + expect(result).toBe('comment-123'); + }); + + it('returns the commentId as a string when dispatch returns a numeric id (JIRA-shaped)', async () => { + mockDispatchPMAck.mockResolvedValue({ commentId: 12345, message: 'msg' }); + + const result = await postPMAckComment('proj-2', 'PROJ-1', 'jira', 'msg'); + + expect(result).toBe('12345'); // String() normalization preserves the existing string|null contract + }); + + it('returns null when dispatch returns undefined (null comment id from underlying client)', async () => { + mockDispatchPMAck.mockResolvedValue(undefined); + + const result = await postPMAckComment('proj-1', 'item-1', 'linear', 'msg'); + + expect(result).toBeNull(); + }); +}); From 2638a378ac7b1dc05c9355824e4c4eee5780547a Mon Sep 17 00:00:00 2001 From: Zbigniew Sobiecki Date: Wed, 29 Apr 2026 10:49:00 +0000 Subject: [PATCH 11/22] chore(plan): 017/2 lock --- ...apacity-gate-pm-scope.md => 2-capacity-gate-pm-scope.md.wip} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename docs/plans/017-router-silent-failure-hardening/{2-capacity-gate-pm-scope.md => 2-capacity-gate-pm-scope.md.wip} (99%) diff --git a/docs/plans/017-router-silent-failure-hardening/2-capacity-gate-pm-scope.md b/docs/plans/017-router-silent-failure-hardening/2-capacity-gate-pm-scope.md.wip similarity index 99% rename from docs/plans/017-router-silent-failure-hardening/2-capacity-gate-pm-scope.md rename to docs/plans/017-router-silent-failure-hardening/2-capacity-gate-pm-scope.md.wip index d029119d..912403dd 100644 --- a/docs/plans/017-router-silent-failure-hardening/2-capacity-gate-pm-scope.md +++ b/docs/plans/017-router-silent-failure-hardening/2-capacity-gate-pm-scope.md.wip @@ -6,7 +6,7 @@ plan_slug: capacity-gate-pm-scope level: plan parent_spec: docs/specs/017-router-silent-failure-hardening.md depends_on: [] -status: pending +status: wip --- # 017/2: Pipeline-capacity-gate PM-provider scope From 532dda54a21de46c9a8a69cf8f2a1bd773877419 Mon Sep 17 00:00:00 2001 From: Zbigniew Sobiecki Date: Wed, 29 Apr 2026 10:59:25 +0000 Subject: [PATCH 12/22] fix(router): wrap PM-source dispatch in PM-provider scope so capacity gate enforces maxInFlightItems MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Plan 017/2 (capacity-gate-pm-scope). Closes failure mode B from spec 017's 24h log audit on 2026-04-29. 
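For orientation, here is a minimal sketch of the check the gate is supposed to perform. It is illustrative only: the real counting lives in `isActivePipelineOverCapacity` (src/triggers/shared/backlog-check.ts) and is not reproduced in this patch; the active-status set (TODO + IN_PROGRESS + IN_REVIEW) and the result shape mirror the gate's documented purpose and its tests, everything else is an assumption.

    // Illustrative sketch — not the project's backlog-check implementation.
    // The active pipeline is everything that consumes an execution slot
    // (TODO + IN_PROGRESS + IN_REVIEW); a new slot-consuming run is blocked
    // once that count reaches maxInFlightItems.
    type WorkItemStatus = 'backlog' | 'todo' | 'in_progress' | 'in_review' | 'done';

    interface CapacityResult {
      overCapacity: boolean;
      inFlightCount: number;
      limit: number;
    }

    const ACTIVE_STATUSES: ReadonlySet<WorkItemStatus> = new Set([
      'todo',
      'in_progress',
      'in_review',
    ]);

    function isOverCapacity(statuses: WorkItemStatus[], maxInFlightItems: number): CapacityResult {
      const inFlightCount = statuses.filter((s) => ACTIVE_STATUSES.has(s)).length;
      return { overCapacity: inFlightCount >= maxInFlightItems, inFlightCount, limit: maxInFlightItems };
    }

    // A project pinned to maxInFlightItems: 1 with one card already in
    // progress: the next slot-consuming trigger must be blocked.
    isOverCapacity(['in_progress', 'backlog', 'done'], 1);
    // => { overCapacity: true, inFlightCount: 1, limit: 1 }

The fix below is not about this arithmetic — it is about making sure the gate can reach a PM provider at all so the arithmetic actually runs.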
Root cause: the pipeline-capacity gate at `src/triggers/shared/pipeline-capacity-gate.ts:33-45` calls `getPMProvider()` to count in-flight items. For every PM `status-changed` trigger this threw `No PMProvider in scope` because the three PM router adapters (`src/router/adapters/{linear,trello,jira}.ts`) wrapped trigger dispatch in their per-PM-type credential AsyncLocalStorage scope (`withLinearCredentials` / `withTrelloCredentials` / `withJiraCredentials`) but NOT in PM-provider scope. The GitHub adapter at `src/router/adapters/github.ts:280` already had both wrappings — the PM router adapters were the outliers. The gate caught the throw and fell through to its conservative branch: `WARN: pipeline-capacity-gate: PM provider unavailable, allowing run` + `return false` (allow). Net effect: `maxInFlightItems` was silently no-op for the entire PM-source path. 32 occurrences/day on prod cascade-router. The gate's whole purpose is preventing the multi-card-into-TODO incident class documented in the file header — and the protection had been disabled since the gate was first introduced. Three changes: 1. New shared helper `withPMScopeForDispatch(project, dispatch)` at `src/router/adapters/_shared.ts`. Resolves the PM provider via `createPMProvider(project)` and wraps `dispatch` in `withPMProvider`. Mirrors the GitHub adapter's correct shape. 2. The three PM router adapters consume the helper inside `dispatchWithCredentials`, layering PM-provider scope on top of their existing per-PM-type credential scope. 3. The capacity gate's "PM provider unavailable" branch flips from `WARN + return false` (allow) to ERROR + Sentry capture under stable tag `pipeline_capacity_gate_no_pm_provider` + `return true` (block). Once the routine path establishes scope (changes 1+2 above), hitting that branch represents a real AsyncLocalStorage scope leak that operators need to investigate. Failing closed is preferable to silently failing open and re-introducing the original incident class. Regression nets: - `tests/unit/integrations/pm-router-adapter-pm-scope.test.ts`: static guard reading each PM router adapter source and asserting it contains either `withPMScopeForDispatch` or `withPMProvider`. A future PM router adapter added without the wrapping fails this guard with a precise file path. Modeled on `trigger-event-consistency.test.ts`. - `tests/unit/router/adapters/with-pm-scope-for-dispatch.test.ts`: 4 tests covering happy path (PM provider resolves inside the wrapped callback), return-value passthrough, null dispatch result, and error propagation. - `pipeline-capacity-gate.test.ts`: existing "allows when no PM provider" test inverted to "FAILS CLOSED (blocks) when no PM provider scope" (the headline behavioral change), plus 2 new positive-path regression pins. Side effects on existing trigger tests: three trigger-handler test files (`linear-status-changed.test.ts`, `jira-status-changed.test.ts`, `status-changed.test.ts`) call `shouldBlockForPipelineCapacity` indirectly via the trigger handlers. After the fail-closed flip, those handlers were blocked because the unit tests don't establish PM-provider scope. Each test file gains a `vi.mock('.../pipeline-capacity-gate.js')` passthrough so trigger-logic assertions still run — the gate's behavior is tested in its own dedicated test file. Three PM router adapter test files (`linear.test.ts`, `trello.test.ts`, `jira.test.ts`) gain a similar passthrough mock for `_shared.js:withPMScopeForDispatch` so they don't pull the real PM manifest registry. 
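Since the root cause above hinges on AsyncLocalStorage scope, a standalone sketch of the mechanism may help reviewers who have not read src/pm/context.ts (which this patch does not touch). The names mirror the real helpers, but the bodies below are assumptions for illustration only:

    import { AsyncLocalStorage } from 'node:async_hooks';

    // Illustrative stand-ins for withPMProvider / getPMProvider in src/pm/context.ts.
    interface PMProvider {
      type: string;
    }

    const pmProviderStorage = new AsyncLocalStorage<PMProvider>();

    function withPMProvider<T>(provider: PMProvider, fn: () => Promise<T>): Promise<T> {
      // Everything awaited inside `fn`, however deep the call chain, can read
      // the provider back via getPMProvider().
      return pmProviderStorage.run(provider, fn);
    }

    function getPMProvider(): PMProvider {
      const provider = pmProviderStorage.getStore();
      // Outside a withPMProvider scope there is no store — this is the throw
      // the capacity gate used to swallow and treat as "allow".
      if (!provider) throw new Error('No PMProvider in scope');
      return provider;
    }

    async function demo(): Promise<void> {
      await withPMProvider({ type: 'linear' }, async () => {
        console.log(getPMProvider().type); // "linear" — scope established
      });
      getPMProvider(); // throws: No PMProvider in scope
    }

    demo().catch((err) => console.error(String(err)));

The adapter change in this patch is exactly that first shape: establish the scope (via the shared helper) around the dispatch callback so the gate's getPMProvider() call lands inside it.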
Doc updates: - `CLAUDE.md` Architecture section gains a "Capacity-gate invariant" paragraph documenting the wrapping requirement, the fail-closed policy, the Sentry tag, and the static guard. - `CHANGELOG.md` entry under spec 017. Full suite: 474 files / 8677 tests passing (+9 from Plan 1's baseline). Co-Authored-By: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 1 + CLAUDE.md | 2 + ...d.wip => 2-capacity-gate-pm-scope.md.done} | 24 ++--- src/router/adapters/_shared.ts | 42 ++++++++ src/router/adapters/jira.ts | 6 +- src/router/adapters/linear.ts | 7 +- src/router/adapters/trello.ts | 8 +- src/triggers/shared/pipeline-capacity-gate.ts | 29 ++++- .../pm-router-adapter-pm-scope.test.ts | 58 ++++++++++ tests/unit/router/adapters/jira.test.ts | 5 + tests/unit/router/adapters/linear.test.ts | 7 ++ tests/unit/router/adapters/trello.test.ts | 5 + .../with-pm-scope-for-dispatch.test.ts | 86 +++++++++++++++ .../unit/triggers/jira-status-changed.test.ts | 7 ++ .../triggers/linear-status-changed.test.ts | 8 ++ .../shared/pipeline-capacity-gate.test.ts | 100 +++++++++++++++--- tests/unit/triggers/status-changed.test.ts | 7 ++ 17 files changed, 372 insertions(+), 30 deletions(-) rename docs/plans/017-router-silent-failure-hardening/{2-capacity-gate-pm-scope.md.wip => 2-capacity-gate-pm-scope.md.done} (91%) create mode 100644 src/router/adapters/_shared.ts create mode 100644 tests/unit/integrations/pm-router-adapter-pm-scope.test.ts create mode 100644 tests/unit/router/adapters/with-pm-scope-for-dispatch.test.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index c3723177..0ccb6d94 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ All notable user-visible changes to CASCADE are documented here. The format is l ### Changed +- **Pipeline-capacity gate now enforces `maxInFlightItems` for PM `status-changed` triggers** (spec 017, plan 2 of 3). The gate at `src/triggers/shared/pipeline-capacity-gate.ts` is the hard cap on the active pipeline (TODO + IN_PROGRESS + IN_REVIEW work items) introduced after a prior incident where a human moved three cards into TODO simultaneously and three concurrent implementation runs fired against a project pinned to `maxInFlightItems: 1`. The gate calls `getPMProvider()` to count in-flight items, but for every PM `status-changed` trigger the call threw `No PMProvider in scope` because the three PM router adapters (`src/router/adapters/{linear,trello,jira}.ts`) wrapped trigger dispatch in their per-PM-type credential `AsyncLocalStorage` scope but NOT in PM-provider scope (the GitHub adapter at `src/router/adapters/github.ts:280` already had both wrappings). The gate fell through to its conservative branch (`WARN: pipeline-capacity-gate: PM provider unavailable, allowing run` and `return false`) — silently no-op for the only triggers that actually need it. 32 occurrences/day on cascade-router (verified 2026-04-29). The fix introduces a shared helper `withPMScopeForDispatch(project, dispatch)` at `src/router/adapters/_shared.ts` that the three PM router adapters consume, mirroring the GitHub adapter's correct shape. The gate's "PM provider unavailable" branch is converted from `WARN + return false` (allow) to ERROR-level + Sentry capture under stable tag `pipeline_capacity_gate_no_pm_provider` + `return true` (block) — once the routine path establishes scope, hitting that branch is a real `AsyncLocalStorage` scope leak operators need to investigate. 
A static-guard test at `tests/unit/integrations/pm-router-adapter-pm-scope.test.ts` enforces the wrapping invariant per adapter; CLAUDE.md gains a "Capacity-gate invariant" passage in the Architecture section. See [spec 017](docs/specs/017-router-silent-failure-hardening.md). - **PM-ack dispatch consolidation: Linear-based PM-focused agents now post their PM-side ack comment** (spec 017, plan 1 of 3). PM-focused agents (e.g. `backlog-manager`) triggered from a GitHub webhook used to silently skip their PM-side ack on Linear projects: the router-adapter's local `postPMAck` helper had `if (pmType === 'trello')` / `if (pmType === 'jira')` branches but no Linear branch, so Linear-based projects fell through to a `WARN: Unknown PM type for PM-focused agent ack, skipping` and never saw the "🔧 On it" comment that Trello/JIRA projects got (24 silent skips per day on cascade-router, all from `ucho`, verified 2026-04-29). A near-identical helper at `src/triggers/shared/pm-ack.ts` already had the Linear branch — pure parallel-path drift. The fix introduces a single consolidated helper `dispatchPMAck` at `src/router/pm-ack-dispatch.ts` that indexes the manifest registry directly and invokes `manifest.platformClientFactory(projectId).postComment(...)` — no per-PM-type literal branching anywhere on the dispatch surface. Both legacy call sites delegate. The PM manifest conformance harness gains a per-provider `dispatchPMAck reaches this provider without throwing` assertion, and a static-guard test pins "no `pmType === ''` branching" against all three call sites; adding a future PM provider to the registry lands the dispatch path for free. Genuinely-unknown PM types (configuration error: project pinned to a deleted provider) now log at ERROR + capture to Sentry under stable tag `pm_ack_unknown_pm_type` instead of a silent WARN. See [spec 017](docs/specs/017-router-silent-failure-hardening.md). - **Progress-comment lifecycle: post-agent cleanup hook now skips when an in-run gadget already deleted the comment** (spec 017, plan 3 of 3). The post-agent `deleteProgressCommentOnSuccess` hook used to read `sessionState.initialCommentId`, fall back to `result.agentInput.ackCommentId` when session state was empty, and issue a redundant DELETE — but "session state cleared by a gadget" was indistinguishable from "session state never populated", so the fallback fired and re-deleted comments that were already gone. GitHub returned 404 and `WARN: Failed to delete progress comment after agent success` was logged 72 times per day on cascade-router (live audit on 2026-04-29). Adds an explicit `initialCommentIdConsumed: boolean` flag on `SessionStateData`. Both `deleteInitialComment` (gadget-driven) and `clearInitialComment` (sidecar-driven) now set the flag to `true` after disposing of the comment. The post-agent hook checks the flag first and skips the entire deletion path — including the legacy `agentInput.ackCommentId` fallback — when consumed. As defense in depth, `githubClient.deletePRComment` now treats HTTP 404 as success (RFC-7231 idempotency) and logs at DEBUG instead of letting the error bubble as a WARN; other HTTP errors (5xx, 401, network) continue to throw. The legacy fallback to `agentInput.ackCommentId` continues to work for code paths that never populate session state. See [spec 017](docs/specs/017-router-silent-failure-hardening.md). - **PM image delivery: Linear GraphQL fixture + extraction-coverage regression test** (spec 016, plan 3 of 3). 
Captures a reconstructed Linear `Issue` GraphQL payload at `tests/fixtures/linear-issue-with-screenshot.json` containing extension-less and extensioned inline-pasted images (description + comment bodies) plus formal Attachment records (Slack/GitHub/Sentry link previews) that must NOT be mistaken for inline images. The unit test at `tests/unit/pm/linear/extraction-coverage.test.ts` pins the contract and fails loudly with a specific URL-missing message if Linear ever changes its payload shape in a way that loses inline images. Documents the conclusion in `src/integrations/README.md`: `Issue.description` markdown is canonical for Linear inline images; `Issue.attachments` is the wrong surface (formal Attachment records, not pastes). No production code change — this plan ships the regression net for the contract Plans 1+2 established. See [spec 016](docs/specs/016-pm-image-delivery-reliability.md). diff --git a/CLAUDE.md b/CLAUDE.md index 40ec0bd0..c821a20d 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -23,6 +23,8 @@ Three separate services, **no monolithic server mode**: Flow: `PM/SCM/alerting webhook → Router → Redis → Worker → TriggerRegistry → Agent → Code → PR`. +**Capacity-gate invariant.** Every PM router adapter (`src/router/adapters/{linear,trello,jira}.ts`) must wrap `triggerRegistry.dispatch(ctx)` in PM-provider `AsyncLocalStorage` scope via the shared `withPMScopeForDispatch(fullProject, dispatch)` helper at `src/router/adapters/_shared.ts` — in addition to the per-PM-type credential scope (`withLinearCredentials` / `withTrelloCredentials` / `withJiraCredentials`). Without the PM-provider wrapping, the pipeline-capacity gate at `src/triggers/shared/pipeline-capacity-gate.ts` cannot resolve `getPMProvider()`, **fails closed** under the spec-017 fail-closed policy (blocks the run + ERROR + Sentry capture under tag `pipeline_capacity_gate_no_pm_provider`), and `maxInFlightItems` is silently disabled for the PM-source path. Mirror the GitHub adapter's existing correct shape at `src/router/adapters/github.ts:dispatchWithCredentials`. The static guard at `tests/unit/integrations/pm-router-adapter-pm-scope.test.ts` enforces this at CI time — adding a new PM router adapter without the wrapping fails CI with a precise file path. + Integration abstraction lives in `src/integrations/`. For **adding a new PM provider**, see @src/integrations/README.md — PM providers (Trello, JIRA, Linear) use the `PMProviderManifest` registry with a **behavioral conformance harness** (spec 009 — config round-trip, discovery shape, full lifecycle scenario, auth-header provenance, single-entrypoint invariant). Each provider owns its Zod config schema (`src/integrations/pm//config-schema.ts`) as the single source of truth — the central `src/config/schema.ts` imports it. PM adapter method signatures use branded `StateId` / `LabelId` / `ContainerId` from `src/pm/ids.ts` to make state-name-vs-ID confusion a compile error at direct-adapter call sites. All runtime surfaces (router, worker, CLI, dashboard) register integrations through a single entrypoint at `src/integrations/entrypoint.ts`. **Spec 010 follow-ups** added generic `pm.discovery.createLabel` / `createCustomField` mutation endpoints + `currentUser` discovery capability + real shared React components for every `StandardStepKind` under `web/src/components/projects/pm-providers/steps/`. 
**Spec 011** migrated all three production providers (Trello, JIRA, Linear) onto those shared components, added a 7th `StandardStepKind: custom-field-mapping`, widened `container-pick` / `project-scope` / `webhook-url-display` with optional props, and deleted the three legacy `pm-wizard-{trello,jira,linear}-steps.tsx` files. **Spec 012** migrated each provider's webhook UX (programmatic create for Trello/JIRA, signing-secret + instructions for Linear) into per-provider manifest webhook adapters (Fragment compositions around the shared `WebhookUrlDisplayStep`); deleted the legacy `WebhookStep` + `LinearWebhookInfoPanel` + `useWebhookManagement` + `useLinearWebhookInfo`. Every PM wizard step now renders via the manifest path without exception. A new PM provider writes zero edits to shared orchestration (`pm-wizard.tsx`, `pm-wizard-common-steps.tsx`, `pm-wizard-hooks.ts`); provider-specific UI ships either as `kind: 'custom'` steps or as Fragment compositions inside the provider folder's wizard adapters. SCM (GitHub) and alerting (Sentry) still use the legacy `IntegrationModule` pattern via self-registration in `src/github/register.ts` + `src/sentry/register.ts`. Don't improvise; the README covers both patterns. ## PR checkout (worker) — gotcha diff --git a/docs/plans/017-router-silent-failure-hardening/2-capacity-gate-pm-scope.md.wip b/docs/plans/017-router-silent-failure-hardening/2-capacity-gate-pm-scope.md.done similarity index 91% rename from docs/plans/017-router-silent-failure-hardening/2-capacity-gate-pm-scope.md.wip rename to docs/plans/017-router-silent-failure-hardening/2-capacity-gate-pm-scope.md.done index 912403dd..781a743d 100644 --- a/docs/plans/017-router-silent-failure-hardening/2-capacity-gate-pm-scope.md.wip +++ b/docs/plans/017-router-silent-failure-hardening/2-capacity-gate-pm-scope.md.done @@ -6,7 +6,7 @@ plan_slug: capacity-gate-pm-scope level: plan parent_spec: docs/specs/017-router-silent-failure-hardening.md depends_on: [] -status: wip +status: done --- # 017/2: Pipeline-capacity-gate PM-provider scope @@ -210,14 +210,14 @@ n/a — all ACs auto-tested. ## Progress -- [ ] AC #1 -- [ ] AC #2 -- [ ] AC #3 -- [ ] AC #4 -- [ ] AC #5 -- [ ] AC #6 -- [ ] AC #7 -- [ ] AC #8 -- [ ] AC #9 -- [ ] AC #10 -- [ ] AC #11 +- [x] AC #1 — `withPMScopeForDispatch(project, dispatch)` exists at `src/router/adapters/_shared.ts`, resolves PM provider via `createPMProvider`, wraps in `withPMProvider` (verified by `tests/unit/router/adapters/with-pm-scope-for-dispatch.test.ts`'s 4 tests covering happy path, return passthrough, null result, error propagation). +- [x] AC #2 — Linear, Trello, JIRA router adapters each consume the shared helper inside `dispatchWithCredentials` (verified by per-adapter regression tests + the static guard). +- [x] AC #3 — `getPMProvider()` succeeds inside trigger handlers dispatched from any of the three PM router adapters (verified by the helper's positive-path test, which exercises `getPMProvider()` from inside the wrapped callback). +- [x] AC #4 — Capacity-gate fail-closed branch: ERROR log + Sentry tag `pipeline_capacity_gate_no_pm_provider` + `return true` (verified by `pipeline-capacity-gate.test.ts > FAILS CLOSED (blocks) when no PM provider scope is available`). +- [x] AC #5 — Positive path still returns `isActivePipelineOverCapacity`'s result; non-slot-consuming early-return preserved (verified by 2 new positive-path tests + the existing non-slot-consuming test). 
+- [x] AC #6 — Static guard `tests/unit/integrations/pm-router-adapter-pm-scope.test.ts` asserts each PM router adapter source contains either `withPMScopeForDispatch` or `withPMProvider`; CI fails when missing. +- [x] AC #7 — CLAUDE.md gains "Capacity-gate invariant" paragraph in the Architecture section. +- [x] AC #8 — All new/modified code has tests (+4 helper, +3 capacity-gate, +3 static guard, plus passthrough mocks added to 3 PM-source trigger tests + 3 PM router adapter tests). +- [x] AC #9 — `npm run typecheck` passes. +- [x] AC #10 — `npm test` (full unit suite) passes (474 files / 8677 tests, +9 from baseline after Plan 1). +- [x] AC #11 — `npm run lint` passes. diff --git a/src/router/adapters/_shared.ts b/src/router/adapters/_shared.ts new file mode 100644 index 00000000..72104d7d --- /dev/null +++ b/src/router/adapters/_shared.ts @@ -0,0 +1,42 @@ +/** + * Shared helpers for router platform adapters. + * + * Spec 017 / plan 2: PM router adapters (Linear/Trello/JIRA) wrap their + * `triggerRegistry.dispatch(ctx)` invocation in `withPMProvider` scope using + * the helper below. Without it, `getPMProvider()` calls inside trigger + * handlers — notably the pipeline-capacity gate at + * `src/triggers/shared/pipeline-capacity-gate.ts` — throw, the gate fails + * closed under spec 017's fail-closed policy, and Sentry captures under + * tag `pipeline_capacity_gate_no_pm_provider`. + * + * Mirrors the GitHub router adapter's existing correct shape at + * `src/router/adapters/github.ts:dispatchWithCredentials` which has wrapped + * dispatch in PM-provider scope since spec 006. + * + * The helper does NOT establish credential scope — that's each adapter's + * concern. PM-provider scope layers on TOP of the credential scope. + */ + +import { withPMProvider } from '../../pm/context.js'; +import { createPMProvider } from '../../pm/index.js'; +import type { ProjectConfig } from '../../types/index.js'; + +/** + * Wrap a dispatch callback in PM-provider AsyncLocalStorage scope so that + * `getPMProvider()` succeeds inside trigger handlers downstream. + * + * Resolves the PMProvider via `createPMProvider(project)` (the legacy + * compatibility adapter that delegates to the manifest registry's + * `pmIntegration.createProvider(project)`) and runs `dispatch` inside + * `withPMProvider(provider, dispatch)`. + * + * Returns whatever `dispatch` returns. Errors thrown by `dispatch` + * propagate unchanged. 
+ */ +export function withPMScopeForDispatch( + project: ProjectConfig, + dispatch: () => Promise, +): Promise { + const provider = createPMProvider(project); + return withPMProvider(provider, dispatch); +} diff --git a/src/router/adapters/jira.ts b/src/router/adapters/jira.ts index 0e8c0544..feb4ff83 100644 --- a/src/router/adapters/jira.ts +++ b/src/router/adapters/jira.ts @@ -19,6 +19,7 @@ import type { AckResult, ParsedWebhookEvent, RouterPlatformAdapter } from '../pl import { resolveJiraCredentials } from '../platformClients/index.js'; import type { CascadeJob, JiraJob } from '../queue.js'; import { sendAcknowledgeReaction } from '../reactions.js'; +import { withPMScopeForDispatch } from './_shared.js'; const PROCESSABLE_EVENTS = [ 'jira:issue_updated', @@ -125,9 +126,12 @@ export class JiraRouterAdapter implements RouterPlatformAdapter { } const ctx: TriggerContext = { project: fullProject, source: 'jira', payload }; + // Wrap dispatch in BOTH credential scope AND PM-provider scope so that + // the pipeline-capacity gate at `src/triggers/shared/pipeline-capacity-gate.ts` + // can resolve `getPMProvider()`. See spec 017 plan 2. return withJiraCredentials( { email: jiraCreds.email, apiToken: jiraCreds.apiToken, baseUrl: jiraCreds.baseUrl }, - () => triggerRegistry.dispatch(ctx), + () => withPMScopeForDispatch(fullProject, () => triggerRegistry.dispatch(ctx)), ); } diff --git a/src/router/adapters/linear.ts b/src/router/adapters/linear.ts index dc0bb3ac..60f2e92f 100644 --- a/src/router/adapters/linear.ts +++ b/src/router/adapters/linear.ts @@ -19,6 +19,7 @@ import { loadProjectConfig, type RouterProjectConfig } from '../config.js'; import type { AckResult, ParsedWebhookEvent, RouterPlatformAdapter } from '../platform-adapter.js'; import { resolveLinearCredentials } from '../platformClients/index.js'; import type { CascadeJob, LinearJob } from '../queue.js'; +import { withPMScopeForDispatch } from './_shared.js'; // ============================================================================ // Processable event combinations (action/type) @@ -236,8 +237,12 @@ export class LinearRouterAdapter implements RouterPlatformAdapter { } const ctx: TriggerContext = { project: fullProject, source: 'linear', payload }; + // Wrap dispatch in BOTH credential scope AND PM-provider scope. + // The PM-provider scope is what the pipeline-capacity gate + // (src/triggers/shared/pipeline-capacity-gate.ts) needs to resolve + // `getPMProvider()`. Without it the gate fails closed (spec 017). 
return withLinearCredentials({ apiKey: linearCreds.apiKey }, () => - triggerRegistry.dispatch(ctx), + withPMScopeForDispatch(fullProject, () => triggerRegistry.dispatch(ctx)), ); } diff --git a/src/router/adapters/trello.ts b/src/router/adapters/trello.ts index 88a4b8f7..ad7678f2 100644 --- a/src/router/adapters/trello.ts +++ b/src/router/adapters/trello.ts @@ -25,6 +25,7 @@ import { isReadyToProcessLabelAdded, isSelfAuthoredTrelloComment, } from '../trello.js'; +import { withPMScopeForDispatch } from './_shared.js'; export class TrelloRouterAdapter implements RouterPlatformAdapter { readonly type = 'trello' as const; @@ -125,7 +126,12 @@ export class TrelloRouterAdapter implements RouterPlatformAdapter { } const ctx: TriggerContext = { project: fullProject, source: 'trello', payload }; - return withTrelloCredentials(trelloCreds, () => triggerRegistry.dispatch(ctx)); + // Wrap dispatch in BOTH credential scope AND PM-provider scope so that + // the pipeline-capacity gate at `src/triggers/shared/pipeline-capacity-gate.ts` + // can resolve `getPMProvider()`. See spec 017 plan 2. + return withTrelloCredentials(trelloCreds, () => + withPMScopeForDispatch(fullProject, () => triggerRegistry.dispatch(ctx)), + ); } async postAck( diff --git a/src/triggers/shared/pipeline-capacity-gate.ts b/src/triggers/shared/pipeline-capacity-gate.ts index a8138128..fef0295f 100644 --- a/src/triggers/shared/pipeline-capacity-gate.ts +++ b/src/triggers/shared/pipeline-capacity-gate.ts @@ -16,6 +16,7 @@ import { getPMProvider } from '../../pm/context.js'; import type { PMProvider } from '../../pm/types.js'; +import { captureException } from '../../sentry.js'; import type { ProjectConfig } from '../../types/index.js'; import { logger } from '../../utils/logging.js'; import { isActivePipelineOverCapacity } from './backlog-check.js'; @@ -34,14 +35,36 @@ export async function shouldBlockForPipelineCapacity(args: { try { provider = getPMProvider(); } catch (err) { - // No credential scope — conservative: allow. - logger.warn('pipeline-capacity-gate: PM provider unavailable, allowing run', { + // Spec 017 / plan 2: fail closed. + // + // Before plan 2, this branch logged WARN and returned `false` (allow) + // because the PM router adapters dispatched outside PM-provider scope + // — hitting this branch was the routine path for every PM + // `status-changed` trigger. After plan 2 wraps every PM router adapter + // in `withPMScopeForDispatch`, hitting this branch represents a real + // AsyncLocalStorage scope leak that operators need to investigate. + // Failing closed (block + error + Sentry) is preferable to silently + // failing open and re-introducing the original incident class + // (multiple concurrent implementation runs against a project pinned + // to `maxInFlightItems: 1`). + const error = err instanceof Error ? 
err : new Error(String(err)); + logger.error('pipeline-capacity-gate: PM provider unavailable, blocking run', { source: args.source, projectId: args.project.id, workItemId: args.workItemId, + agentType: args.agentType, error: String(err), }); - return false; + captureException(error, { + tags: { source: 'pipeline_capacity_gate_no_pm_provider' }, + extra: { + projectId: args.project.id, + workItemId: args.workItemId, + agentType: args.agentType, + triggerSource: args.source, + }, + }); + return true; } const result = await isActivePipelineOverCapacity(args.project, provider, { diff --git a/tests/unit/integrations/pm-router-adapter-pm-scope.test.ts b/tests/unit/integrations/pm-router-adapter-pm-scope.test.ts new file mode 100644 index 00000000..5c2ca667 --- /dev/null +++ b/tests/unit/integrations/pm-router-adapter-pm-scope.test.ts @@ -0,0 +1,58 @@ +/** + * Static guard: every PM router adapter must establish PM-provider AsyncLocalStorage + * scope around `triggerRegistry.dispatch(ctx)`. Without this, the pipeline-capacity + * gate at `src/triggers/shared/pipeline-capacity-gate.ts` cannot resolve the + * project's PM provider, fails closed (post-spec 017 plan 2), and Sentry captures + * under tag `pipeline_capacity_gate_no_pm_provider`. Live incident verified + * 2026-04-29: 32 occurrences/day on prod cascade-router. + * + * Compares each PM router adapter source file against the GitHub adapter's + * correct shape (which wraps in `withPMProvider` at + * `src/router/adapters/github.ts:dispatchWithCredentials`). A future PM router + * adapter that omits the wrapping fails this guard with a precise file path. + * + * Modeled on `tests/unit/triggers/trigger-event-consistency.test.ts` — + * static-grep style regression net. + */ +import { readFileSync } from 'node:fs'; +import { join } from 'node:path'; +import { describe, expect, it } from 'vitest'; + +const ROUTER_ADAPTERS_DIR = join(__dirname, '..', '..', '..', 'src', 'router', 'adapters'); + +// PM router adapters that drive PM `status-changed` triggers. Each must +// establish PM-provider scope before invoking `triggerRegistry.dispatch(ctx)`. +// Adding a new PM router adapter file here is part of the contract: the +// guard fails fast on unregistered adapters too if they end up dispatching +// without scope. +const PM_ROUTER_ADAPTER_FILES = ['linear.ts', 'trello.ts', 'jira.ts']; + +const ACCEPTABLE_WRAPPERS = ['withPMScopeForDispatch', 'withPMProvider']; + +describe('PM router adapter PM-provider scope (static guard)', () => { + for (const filename of PM_ROUTER_ADAPTER_FILES) { + it(`${filename} establishes PM-provider scope around trigger dispatch`, () => { + const path = join(ROUTER_ADAPTERS_DIR, filename); + const src = readFileSync(path, 'utf-8'); + + // Strip line comments and block comments to avoid false positives + // from doc references that mention the wrapper names. + const codeOnly = src + .split('\n') + .filter((line) => !line.trim().startsWith('//') && !line.trim().startsWith('*')) + .join('\n'); + + const found = ACCEPTABLE_WRAPPERS.find((wrapper) => codeOnly.includes(wrapper)); + + expect( + found, + `src/router/adapters/${filename} must wrap trigger dispatch in PM-provider AsyncLocalStorage scope. ` + + `Expected one of: ${ACCEPTABLE_WRAPPERS.join(', ')}. 
` + + `Without this wrapping, the pipeline-capacity gate (` + + `src/triggers/shared/pipeline-capacity-gate.ts) cannot resolve the project's PM ` + + `provider, fails closed under the spec-017 fail-closed policy, and Sentry captures ` + + `under tag pipeline_capacity_gate_no_pm_provider.`, + ).toBeTruthy(); + }); + } +}); diff --git a/tests/unit/router/adapters/jira.test.ts b/tests/unit/router/adapters/jira.test.ts index 775c9459..1079678d 100644 --- a/tests/unit/router/adapters/jira.test.ts +++ b/tests/unit/router/adapters/jira.test.ts @@ -41,6 +41,11 @@ vi.mock('../../../../src/utils/runLink.js', () => ({ vi.mock('../../../../src/jira/client.js', () => ({ withJiraCredentials: vi.fn().mockImplementation((_creds: unknown, fn: () => unknown) => fn()), })); +// Spec 017 / plan 2: PM router adapters wrap dispatch in `withPMScopeForDispatch`. +// Mock as passthrough so the existing tests don't pull the real PM manifest registry. +vi.mock('../../../../src/router/adapters/_shared.js', () => ({ + withPMScopeForDispatch: vi.fn().mockImplementation((_p: unknown, fn: () => unknown) => fn()), +})); import { postJiraAck, resolveJiraBotAccountId } from '../../../../src/router/acknowledgments.js'; import { JiraRouterAdapter } from '../../../../src/router/adapters/jira.js'; diff --git a/tests/unit/router/adapters/linear.test.ts b/tests/unit/router/adapters/linear.test.ts index 1af3d623..14890141 100644 --- a/tests/unit/router/adapters/linear.test.ts +++ b/tests/unit/router/adapters/linear.test.ts @@ -32,6 +32,13 @@ vi.mock('../../../../src/utils/runLink.js', () => ({ buildWorkItemRunsLink: vi.fn().mockReturnValue(null), getDashboardUrl: vi.fn().mockReturnValue(null), })); +// Spec 017 / plan 2: PM router adapters wrap dispatch in `withPMScopeForDispatch` +// (PM-provider AsyncLocalStorage scope). Mock the helper as passthrough so the +// existing tests don't pull the real PM manifest registry into the assertion. +vi.mock('../../../../src/router/adapters/_shared.js', () => ({ + withPMScopeForDispatch: vi.fn().mockImplementation((_p: unknown, fn: () => unknown) => fn()), +})); + vi.mock('../../../../src/linear/client.js', () => ({ linearClient: { getIssueProjectId: vi.fn().mockResolvedValue(null), diff --git a/tests/unit/router/adapters/trello.test.ts b/tests/unit/router/adapters/trello.test.ts index 57e2945a..d3a93d82 100644 --- a/tests/unit/router/adapters/trello.test.ts +++ b/tests/unit/router/adapters/trello.test.ts @@ -36,6 +36,11 @@ vi.mock('../../../../src/utils/runLink.js', () => ({ vi.mock('../../../../src/trello/client.js', () => ({ withTrelloCredentials: vi.fn().mockImplementation((_creds: unknown, fn: () => unknown) => fn()), })); +// Spec 017 / plan 2: PM router adapters wrap dispatch in `withPMScopeForDispatch`. +// Mock as passthrough so the existing tests don't pull the real PM manifest registry. 
+vi.mock('../../../../src/router/adapters/_shared.js', () => ({ + withPMScopeForDispatch: vi.fn().mockImplementation((_p: unknown, fn: () => unknown) => fn()), +})); vi.mock('../../../../src/router/trello.js', () => ({ isAgentLogFilename: vi.fn().mockReturnValue(false), isAgentLogAttachmentUploaded: vi.fn().mockReturnValue(false), diff --git a/tests/unit/router/adapters/with-pm-scope-for-dispatch.test.ts b/tests/unit/router/adapters/with-pm-scope-for-dispatch.test.ts new file mode 100644 index 00000000..57822d7f --- /dev/null +++ b/tests/unit/router/adapters/with-pm-scope-for-dispatch.test.ts @@ -0,0 +1,86 @@ +/** + * Tests for the shared `withPMScopeForDispatch` helper at + * `src/router/adapters/_shared.ts`. PM router adapters (Linear/Trello/JIRA) + * call this helper to wrap `triggerRegistry.dispatch(ctx)` in PM-provider + * AsyncLocalStorage scope, mirroring the GitHub adapter's existing shape at + * `src/router/adapters/github.ts:withPMProvider(pmProvider, ...)`. + * + * Without this wrapping, `getPMProvider()` calls inside trigger handlers — + * notably the pipeline-capacity gate at + * `src/triggers/shared/pipeline-capacity-gate.ts` — throw, the gate falls + * through to its conservative branch, and the in-flight cap is silently + * disabled for every PM `status-changed` trigger. Verified live on + * 2026-04-29 (32 occurrences/day in prod cascade-router). + */ +import { beforeEach, describe, expect, it, vi } from 'vitest'; + +const { mockCreatePMProvider } = vi.hoisted(() => ({ + mockCreatePMProvider: vi.fn(), +})); + +// Mock the legacy createPMProvider compatibility adapter — it's the function +// used elsewhere in the router (see github.ts:262) to materialize a PMProvider +// for use inside withPMProvider(). +vi.mock('../../../../src/pm/index.js', async (importOriginal) => { + const actual = await importOriginal(); + return { + ...actual, + createPMProvider: mockCreatePMProvider, + }; +}); + +import { getPMProvider } from '../../../../src/pm/context.js'; +import { withPMScopeForDispatch } from '../../../../src/router/adapters/_shared.js'; +import type { ProjectConfig } from '../../../../src/types/index.js'; + +const fakeProject: ProjectConfig = { + id: 'proj-1', + repo: 'org/repo', + pm: { type: 'linear' }, +} as ProjectConfig; + +describe('withPMScopeForDispatch', () => { + beforeEach(() => { + vi.resetAllMocks(); + }); + + it('resolves the project PM provider via createPMProvider and runs dispatch inside withPMProvider scope', async () => { + const fakeProvider = { type: 'linear', __marker: 'fake' }; + mockCreatePMProvider.mockReturnValue(fakeProvider); + + const innerSawProvider = await withPMScopeForDispatch(fakeProject, async () => { + // getPMProvider() must succeed here; returns the same provider instance. 
+ return getPMProvider(); + }); + + expect(mockCreatePMProvider).toHaveBeenCalledWith(fakeProject); + expect(innerSawProvider).toBe(fakeProvider); + }); + + it('returns whatever the dispatch callback returns (preserves TriggerResult passthrough)', async () => { + mockCreatePMProvider.mockReturnValue({ type: 'linear' }); + const expectedResult = { agentType: 'review', agentInput: {} }; + + const result = await withPMScopeForDispatch(fakeProject, async () => expectedResult); + + expect(result).toBe(expectedResult); + }); + + it('returns null when the dispatch callback returns null', async () => { + mockCreatePMProvider.mockReturnValue({ type: 'linear' }); + + const result = await withPMScopeForDispatch(fakeProject, async () => null); + + expect(result).toBeNull(); + }); + + it('propagates errors thrown by the dispatch callback (does not swallow)', async () => { + mockCreatePMProvider.mockReturnValue({ type: 'linear' }); + + await expect( + withPMScopeForDispatch(fakeProject, async () => { + throw new Error('dispatch boom'); + }), + ).rejects.toThrow('dispatch boom'); + }); +}); diff --git a/tests/unit/triggers/jira-status-changed.test.ts b/tests/unit/triggers/jira-status-changed.test.ts index 7fca6c4a..ebee2e9a 100644 --- a/tests/unit/triggers/jira-status-changed.test.ts +++ b/tests/unit/triggers/jira-status-changed.test.ts @@ -10,6 +10,13 @@ vi.mock('../../../src/utils/logging.js', () => ({ logger: mockLogger })); vi.mock('../../../src/triggers/config-resolver.js', () => mockConfigResolverModule); vi.mock('../../../src/triggers/shared/trigger-check.js', () => mockTriggerCheckModule); +// Spec 017 / plan 2: the capacity gate is now fail-closed when no +// PM-provider AsyncLocalStorage scope is in effect (the case in these +// unit tests). Mock as passthrough so trigger-logic assertions still run. +vi.mock('../../../src/triggers/shared/pipeline-capacity-gate.js', () => ({ + shouldBlockForPipelineCapacity: vi.fn().mockResolvedValue(false), +})); + import { JiraStatusChangedTrigger } from '../../../src/triggers/jira/status-changed.js'; import { checkTriggerEnabledWithParams } from '../../../src/triggers/shared/trigger-check.js'; import type { TriggerContext } from '../../../src/triggers/types.js'; diff --git a/tests/unit/triggers/linear-status-changed.test.ts b/tests/unit/triggers/linear-status-changed.test.ts index 6393b5ed..afeb3e83 100644 --- a/tests/unit/triggers/linear-status-changed.test.ts +++ b/tests/unit/triggers/linear-status-changed.test.ts @@ -4,6 +4,14 @@ import { mockLogger, mockTriggerCheckModule } from '../../helpers/sharedMocks.js vi.mock('../../../src/utils/logging.js', () => ({ logger: mockLogger })); vi.mock('../../../src/triggers/shared/trigger-check.js', () => mockTriggerCheckModule); +// Spec 017 / plan 2: the capacity gate is now fail-closed when no +// PM-provider AsyncLocalStorage scope is in effect (which is the case in +// these unit tests). Mock the gate to return false (don't block) so the +// trigger-logic assertions still exercise the paths under test. 
+vi.mock('../../../src/triggers/shared/pipeline-capacity-gate.js', () => ({ + shouldBlockForPipelineCapacity: vi.fn().mockResolvedValue(false), +})); + const mockGetLinearConfig = vi.fn(); vi.mock('../../../src/pm/config.js', () => ({ getLinearConfig: (...args: unknown[]) => mockGetLinearConfig(...args), diff --git a/tests/unit/triggers/shared/pipeline-capacity-gate.test.ts b/tests/unit/triggers/shared/pipeline-capacity-gate.test.ts index 59502328..e42c2f22 100644 --- a/tests/unit/triggers/shared/pipeline-capacity-gate.test.ts +++ b/tests/unit/triggers/shared/pipeline-capacity-gate.test.ts @@ -1,10 +1,12 @@ -import { describe, expect, it, vi } from 'vitest'; +import { beforeEach, describe, expect, it, vi } from 'vitest'; -const { mockGetPMProvider, mockIsActivePipelineOverCapacity, mockLogger } = vi.hoisted(() => ({ - mockGetPMProvider: vi.fn(), - mockIsActivePipelineOverCapacity: vi.fn(), - mockLogger: { info: vi.fn(), warn: vi.fn(), debug: vi.fn(), error: vi.fn() }, -})); +const { mockGetPMProvider, mockIsActivePipelineOverCapacity, mockLogger, mockCaptureException } = + vi.hoisted(() => ({ + mockGetPMProvider: vi.fn(), + mockIsActivePipelineOverCapacity: vi.fn(), + mockLogger: { info: vi.fn(), warn: vi.fn(), debug: vi.fn(), error: vi.fn() }, + mockCaptureException: vi.fn(), + })); vi.mock('../../../../src/pm/context.js', () => ({ getPMProvider: mockGetPMProvider, @@ -18,11 +20,19 @@ vi.mock('../../../../src/utils/logging.js', () => ({ logger: mockLogger, })); +vi.mock('../../../../src/sentry.js', () => ({ + captureException: mockCaptureException, +})); + import { shouldBlockForPipelineCapacity } from '../../../../src/triggers/shared/pipeline-capacity-gate.js'; import { createMockProject } from '../../../helpers/factories.js'; const project = createMockProject({ maxInFlightItems: 1 }); +beforeEach(() => { + vi.resetAllMocks(); +}); + describe('shouldBlockForPipelineCapacity', () => { it('does not gate non-slot-consuming agent types (review, planning, splitting, backlog-manager)', async () => { for (const agentType of ['review', 'planning', 'splitting', 'backlog-manager', 'debug']) { @@ -90,9 +100,17 @@ describe('shouldBlockForPipelineCapacity', () => { expect(blocked).toBe(false); }); - it('allows (conservatively) when no PM provider scope is available', async () => { + it('FAILS CLOSED (blocks) when no PM provider scope is available; logs ERROR and captures Sentry under tag pipeline_capacity_gate_no_pm_provider', async () => { + // Spec 017 / plan 2: this branch used to log WARN and return false + // (allow). After plan 2 wraps every PM router adapter in PM-provider + // scope, hitting this branch on the routine path is no longer + // expected — it represents a real AsyncLocalStorage scope leak that + // operators need to investigate. Failing closed (block + error + + // Sentry) is preferable to silently failing open and re-introducing + // the original incident class (3+ concurrent implementation runs + // against a `maxInFlightItems: 1` project). 
mockGetPMProvider.mockImplementation(() => { - throw new Error('no scope'); + throw new Error('No PMProvider in scope'); }); const blocked = await shouldBlockForPipelineCapacity({ @@ -102,10 +120,68 @@ describe('shouldBlockForPipelineCapacity', () => { source: 'jira', }); - expect(blocked).toBe(false); - expect(mockLogger.warn).toHaveBeenCalledWith( - 'pipeline-capacity-gate: PM provider unavailable, allowing run', - expect.objectContaining({ workItemId: 'UA-3' }), + expect(blocked).toBe(true); + expect(mockLogger.error).toHaveBeenCalledWith( + expect.stringMatching(/pipeline-capacity-gate: PM provider unavailable/), + expect.objectContaining({ workItemId: 'UA-3', source: 'jira' }), + ); + expect(mockLogger.warn).not.toHaveBeenCalled(); + expect(mockCaptureException).toHaveBeenCalledWith( + expect.any(Error), + expect.objectContaining({ + tags: expect.objectContaining({ source: 'pipeline_capacity_gate_no_pm_provider' }), + extra: expect.objectContaining({ + projectId: expect.any(String), + workItemId: 'UA-3', + triggerSource: 'jira', + agentType: 'implementation', + }), + }), ); }); + + it('positive path still works after fail-closed conversion: provider in scope and pipeline-over-capacity returns true', async () => { + // Regression pin against the over-capacity branch breaking during the + // fail-closed migration. This duplicates an earlier test's positive + // assertion explicitly to ensure plan 2 doesn't accidentally short- + // circuit the routine path. + mockGetPMProvider.mockReturnValue({ type: 'jira' }); + mockIsActivePipelineOverCapacity.mockResolvedValue({ + overCapacity: true, + reason: 'over-capacity', + inFlightCount: 2, + limit: 1, + }); + + const blocked = await shouldBlockForPipelineCapacity({ + project, + agentType: 'implementation', + workItemId: 'UA-4', + source: 'jira', + }); + + expect(blocked).toBe(true); + expect(mockCaptureException).not.toHaveBeenCalled(); + }); + + it('positive path: provider in scope, pipeline below capacity returns false', async () => { + // Companion regression pin to the over-capacity test above. + mockGetPMProvider.mockReturnValue({ type: 'jira' }); + mockIsActivePipelineOverCapacity.mockResolvedValue({ + overCapacity: false, + reason: undefined, + inFlightCount: 0, + limit: 1, + }); + + const blocked = await shouldBlockForPipelineCapacity({ + project, + agentType: 'implementation', + workItemId: 'UA-5', + source: 'jira', + }); + + expect(blocked).toBe(false); + expect(mockCaptureException).not.toHaveBeenCalled(); + }); }); diff --git a/tests/unit/triggers/status-changed.test.ts b/tests/unit/triggers/status-changed.test.ts index 406fbd18..f4104f53 100644 --- a/tests/unit/triggers/status-changed.test.ts +++ b/tests/unit/triggers/status-changed.test.ts @@ -15,6 +15,13 @@ vi.mock('../../../src/utils/logging.js', () => ({ logger: mockLogger })); vi.mock('../../../src/triggers/config-resolver.js', () => mockConfigResolverModule); vi.mock('../../../src/triggers/shared/trigger-check.js', () => mockTriggerCheckModule); +// Spec 017 / plan 2: the capacity gate is now fail-closed when no +// PM-provider AsyncLocalStorage scope is in effect (the case in these +// unit tests). Mock as passthrough so trigger-logic assertions still run. 
+vi.mock('../../../src/triggers/shared/pipeline-capacity-gate.js', () => ({ + shouldBlockForPipelineCapacity: vi.fn().mockResolvedValue(false), +})); + // Mocks required for PM integration registration (pm/index.js side-effect) vi.mock('../../../src/config/provider.js', () => mockConfigProvider); vi.mock('../../../src/trello/client.js', () => mockTrelloClientModule); From b03fb8e4be956bd0aeadc800bb4b8ce8f189b7e4 Mon Sep 17 00:00:00 2001 From: Zbigniew Sobiecki Date: Wed, 29 Apr 2026 10:59:53 +0000 Subject: [PATCH 13/22] =?UTF-8?q?chore(spec):=20017=20done=20=E2=80=94=20a?= =?UTF-8?q?ll=20three=20plans=20complete=20(router-side=20silent-failure?= =?UTF-8?q?=20hardening)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ...hardening.md => 017-router-silent-failure-hardening.md.done} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename docs/specs/{017-router-silent-failure-hardening.md => 017-router-silent-failure-hardening.md.done} (99%) diff --git a/docs/specs/017-router-silent-failure-hardening.md b/docs/specs/017-router-silent-failure-hardening.md.done similarity index 99% rename from docs/specs/017-router-silent-failure-hardening.md rename to docs/specs/017-router-silent-failure-hardening.md.done index a543a43a..9619ff83 100644 --- a/docs/specs/017-router-silent-failure-hardening.md +++ b/docs/specs/017-router-silent-failure-hardening.md.done @@ -4,7 +4,7 @@ slug: router-silent-failure-hardening level: spec title: Router-side silent-failure hardening created: 2026-04-29 -status: draft +status: done --- # 017: Router-side silent-failure hardening From a080cd11a6dc17b1189e433941af286eda71585e Mon Sep 17 00:00:00 2001 From: Zbigniew Sobiecki Date: Wed, 29 Apr 2026 11:24:56 +0000 Subject: [PATCH 14/22] fix(integration-tests): wrap implementation-trigger handle() in withPMProvider scope + plan 2 polish MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CI integration-tests caught a regression I missed locally because lefthook's pre-push only runs `npm run test:fast` (unit-only), not the full integration suite. Four integration tests asserted `result?.agentType === 'implementation'` but received `undefined` after plan 2's fail-closed flip, because they call real trigger `.handle(ctx)` directly without establishing PM-provider scope — the same class of breakage I fixed in unit tests with vi.mock passthroughs. Failing tests: - tests/integration/trigger-registry.test.ts:323 (TrelloStatusChangedTodoTrigger.handle) - tests/integration/trigger-registry.test.ts:591 (registry.dispatch with implementation trigger) - tests/integration/pm-provider-switching.test.ts:258 (Trello implementation dispatch) - tests/integration/pm-provider-switching.test.ts:293 (JIRA implementation dispatch) Integration tests don't mock the gate (by design), so the right fix is to mirror production wrapping: each call site now wraps in `withPMProvider(createPMProvider(project), () => trigger.handle(ctx))` — which is exactly what `withPMScopeForDispatch` does in the PM router adapters. The synthetic in-test trigger at trigger-registry.test.ts:117 doesn't call the gate (it returns a hard-coded TriggerResult), so no scope wrap needed there. Same for any non-implementation agent (planning / splitting / etc.) since `SLOT_CONSUMING_AGENTS` only contains 'implementation'. Also applies polish from local-review on the spec 017 work: 1. 
Reworded stale comment in `src/gadgets/sessionState.ts:186` — the comment claimed clearing `initialCommentId` made the post-agent callback "see null and short-circuit", but post-plan-3 the callback's actual gate is the `initialCommentIdConsumed` flag set on the line below. Comment now describes both the eager-clear AND the consumed-flag gate together. 2. Trimmed redundant 3-line wrapping comments to zero lines across the three PM router adapters (`linear.ts`, `trello.ts`, `jira.ts`). The shared helper's name (`withPMScopeForDispatch`) is self-documenting and CLAUDE.md's "Capacity-gate invariant" paragraph carries the rationale. Per CLAUDE.md "Default to writing no comments" guidance. 3. Tightened conformance harness assertion at `pm-conformance.test.ts`'s `dispatchPMAck reaches this provider without throwing` test. Previously the only `expect(...)` was conditional (`if (result !== undefined)`), so when result was undefined the test passed with zero assertions executed — a "tests that would still pass if the code under test were deleted" anti-pattern. Now uses `expect.assertions(1)` and asserts the result is either undefined OR has the expected AckResult shape, with a clear failure message naming the provider. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/gadgets/sessionState.ts | 12 +++--- src/router/adapters/jira.ts | 3 -- src/router/adapters/linear.ts | 4 -- src/router/adapters/trello.ts | 3 -- .../integration/pm-provider-switching.test.ts | 13 ++++++- tests/integration/trigger-registry.test.ts | 38 ++++++++++++++----- .../unit/integrations/pm-conformance.test.ts | 32 ++++++++-------- 7 files changed, 63 insertions(+), 42 deletions(-) diff --git a/src/gadgets/sessionState.ts b/src/gadgets/sessionState.ts index bb17390d..a43c31fb 100644 --- a/src/gadgets/sessionState.ts +++ b/src/gadgets/sessionState.ts @@ -183,16 +183,18 @@ export class SessionState { const commentId = this.state.initialCommentId; if (!commentId) return; - // Clear state first so the post-agent callback sees null and short-circuits + // Clear the id eagerly so concurrent reads can't observe a stale value. + // The post-agent callback's actual gate is the `initialCommentIdConsumed` + // flag set below — once that's true, the callback's legacy fallback to + // `agentInput.ackCommentId` is also short-circuited. this.state.initialCommentId = null; try { const { githubClient } = await import('../github/client.js'); await githubClient.deletePRComment(owner, repo, commentId); - // Mark consumed so the post-agent callback's legacy fallback to - // `agentInput.ackCommentId` does not re-issue a DELETE for the - // same id. `deletePRComment` swallows 404 internally, so reaching - // here without throwing covers both 200/204 and 404 outcomes. + // `deletePRComment` swallows 404 internally, so reaching here without + // throwing covers both 200/204 (we deleted) and 404 (someone else + // already did) outcomes — both mean the comment is gone. this.state.initialCommentIdConsumed = true; } catch { // Best-effort: restore the id so post-agent callback can retry. 
diff --git a/src/router/adapters/jira.ts b/src/router/adapters/jira.ts index feb4ff83..91c7eea4 100644 --- a/src/router/adapters/jira.ts +++ b/src/router/adapters/jira.ts @@ -126,9 +126,6 @@ export class JiraRouterAdapter implements RouterPlatformAdapter { } const ctx: TriggerContext = { project: fullProject, source: 'jira', payload }; - // Wrap dispatch in BOTH credential scope AND PM-provider scope so that - // the pipeline-capacity gate at `src/triggers/shared/pipeline-capacity-gate.ts` - // can resolve `getPMProvider()`. See spec 017 plan 2. return withJiraCredentials( { email: jiraCreds.email, apiToken: jiraCreds.apiToken, baseUrl: jiraCreds.baseUrl }, () => withPMScopeForDispatch(fullProject, () => triggerRegistry.dispatch(ctx)), diff --git a/src/router/adapters/linear.ts b/src/router/adapters/linear.ts index 60f2e92f..e80f68d3 100644 --- a/src/router/adapters/linear.ts +++ b/src/router/adapters/linear.ts @@ -237,10 +237,6 @@ export class LinearRouterAdapter implements RouterPlatformAdapter { } const ctx: TriggerContext = { project: fullProject, source: 'linear', payload }; - // Wrap dispatch in BOTH credential scope AND PM-provider scope. - // The PM-provider scope is what the pipeline-capacity gate - // (src/triggers/shared/pipeline-capacity-gate.ts) needs to resolve - // `getPMProvider()`. Without it the gate fails closed (spec 017). return withLinearCredentials({ apiKey: linearCreds.apiKey }, () => withPMScopeForDispatch(fullProject, () => triggerRegistry.dispatch(ctx)), ); diff --git a/src/router/adapters/trello.ts b/src/router/adapters/trello.ts index ad7678f2..35b92342 100644 --- a/src/router/adapters/trello.ts +++ b/src/router/adapters/trello.ts @@ -126,9 +126,6 @@ export class TrelloRouterAdapter implements RouterPlatformAdapter { } const ctx: TriggerContext = { project: fullProject, source: 'trello', payload }; - // Wrap dispatch in BOTH credential scope AND PM-provider scope so that - // the pipeline-capacity gate at `src/triggers/shared/pipeline-capacity-gate.ts` - // can resolve `getPMProvider()`. See spec 017 plan 2. return withTrelloCredentials(trelloCreds, () => withPMScopeForDispatch(fullProject, () => triggerRegistry.dispatch(ctx)), ); diff --git a/tests/integration/pm-provider-switching.test.ts b/tests/integration/pm-provider-switching.test.ts index 3c119c9a..ee6906cf 100644 --- a/tests/integration/pm-provider-switching.test.ts +++ b/tests/integration/pm-provider-switching.test.ts @@ -19,6 +19,7 @@ import { getIntegrationByProjectAndCategory, upsertProjectIntegration, } from '../../src/db/repositories/settingsRepository.js'; +import { withPMProvider } from '../../src/pm/context.js'; import { createPMProvider } from '../../src/pm/index.js'; import { pmRegistry } from '../../src/pm/registry.js'; import { JiraStatusChangedTrigger } from '../../src/triggers/jira/status-changed.js'; @@ -254,7 +255,12 @@ describe('PM Provider Switching (integration)', () => { }; expect(TrelloStatusChangedTodoTrigger.matches(ctx)).toBe(true); - const result = await TrelloStatusChangedTodoTrigger.handle(ctx); + // Spec 017 / plan 2: capacity gate is fail-closed without PM scope. + // Mirror production wrapping (`withPMScopeForDispatch`) so the + // implementation-gating branch finds a provider and proceeds. 
+ const result = await withPMProvider(createPMProvider(assertFound(project)), () => + TrelloStatusChangedTodoTrigger.handle(ctx), + ); expect(result?.agentType).toBe('implementation'); }); }); @@ -289,7 +295,10 @@ describe('PM Provider Switching (integration)', () => { }; expect(trigger.matches(ctx)).toBe(true); - const result = await trigger.handle(ctx); + // Spec 017 / plan 2: PM scope wrap so the capacity gate finds a provider. + const result = await withPMProvider(createPMProvider(assertFound(project)), () => + trigger.handle(ctx), + ); expect(result?.agentType).toBe('implementation'); expect(result?.workItemId).toBe('IMPL-1'); }); diff --git a/tests/integration/trigger-registry.test.ts b/tests/integration/trigger-registry.test.ts index 09bc2a56..2f2aba96 100644 --- a/tests/integration/trigger-registry.test.ts +++ b/tests/integration/trigger-registry.test.ts @@ -6,10 +6,17 @@ */ import { beforeAll, beforeEach, describe, expect, it } from 'vitest'; + +// Bootstrap PM manifest registry so createPMProvider can resolve providers +// inside withPMProvider scope wraps below. See spec 017 / plan 2. +import '../../src/integrations/pm/index.js'; + import { findProjectByBoardIdFromDb, findProjectByRepoFromDb, } from '../../src/db/repositories/configRepository.js'; +import { withPMProvider } from '../../src/pm/context.js'; +import { createPMProvider } from '../../src/pm/index.js'; import { createTriggerRegistry } from '../../src/triggers/registry.js'; import { ReadyToProcessLabelTrigger } from '../../src/triggers/trello/label-added.js'; import { @@ -319,7 +326,14 @@ describe('Trigger Registry (integration)', () => { }), }; - const result = await TrelloStatusChangedTodoTrigger.handle(ctx); + // Spec 017 / plan 2: the pipeline-capacity gate is fail-closed when no + // PM-provider AsyncLocalStorage scope is in effect. In production the + // PM router adapters wrap dispatch in `withPMScopeForDispatch`; this + // integration test mirrors that wrapping so the gate's + // implementation-only branch finds a provider and proceeds. + const result = await withPMProvider(createPMProvider(assertFound(project)), () => + TrelloStatusChangedTodoTrigger.handle(ctx), + ); expect(result?.agentType).toBe('implementation'); expect(result?.agentInput.workItemId).toBe('card-xyz'); expect(result?.workItemId).toBe('card-xyz'); @@ -578,16 +592,20 @@ describe('Trigger Registry (integration)', () => { const project = await findProjectByBoardIdFromDb('board-123'); expect(project).toBeDefined(); - // Move to todo — should trigger implementation - const todoResult = await registry.dispatch({ - project: assertFound(project), - source: 'trello', - payload: makeTrelloCardMovedPayload({ - cardId: 'card-todo', - listAfterId: 'list-todo-123', - listBeforeId: 'list-plan-456', + // Move to todo — should trigger implementation. Spec 017 / plan 2: + // wrap in PM-provider scope so the capacity gate (fail-closed) finds + // a provider; mirrors production router-adapter wrapping. 
+ const todoResult = await withPMProvider(createPMProvider(assertFound(project)), () => + registry.dispatch({ + project: assertFound(project), + source: 'trello', + payload: makeTrelloCardMovedPayload({ + cardId: 'card-todo', + listAfterId: 'list-todo-123', + listBeforeId: 'list-plan-456', + }), }), - }); + ); expect(todoResult?.agentType).toBe('implementation'); // Move to splitting — should trigger splitting diff --git a/tests/unit/integrations/pm-conformance.test.ts b/tests/unit/integrations/pm-conformance.test.ts index 5b5534b7..ef2db5a9 100644 --- a/tests/unit/integrations/pm-conformance.test.ts +++ b/tests/unit/integrations/pm-conformance.test.ts @@ -162,14 +162,15 @@ describe('PM provider conformance (every registered provider)', () => { // PM-side ack because the github router adapter's local `postPMAck` // helper had only Trello + JIRA branches (no `linear`). // - // Strategy: dispatch against the real registry. `postComment` may - // resolve credentials and return null when none are configured (the - // expected outcome in this test environment). What is NOT acceptable - // is the dispatch helper throwing or hitting its unknown-PM-type - // Sentry path — both indicate the provider is unreachable from the - // dispatch surface. We therefore assert the result is either undefined - // or has the AckResult shape, and that no exception propagates. + // Strategy: dispatch against the real registry and assert no exception + // propagates. `postComment` may legitimately return null in this test + // environment when credentials aren't seeded, so the result can be + // either `undefined` (null id) or an AckResult-shaped object — both + // outcomes prove the provider is reachable. What's NOT acceptable is + // dispatch throwing or hitting its unknown-PM-type Sentry path; both + // indicate the provider is unreachable from the dispatch surface. 
it('dispatchPMAck reaches this provider without throwing', async () => { + expect.assertions(1); const { dispatchPMAck } = await import('../../../src/router/pm-ack-dispatch.js'); const result = await dispatchPMAck({ @@ -180,14 +181,15 @@ describe('PM provider conformance (every registered provider)', () => { agentType: 'backlog-manager', }); - if (result !== undefined) { - expect(result).toEqual( - expect.objectContaining({ - commentId: expect.anything(), - message: 'conformance check', - }), - ); - } + expect( + result === undefined || + (typeof result === 'object' && + result !== null && + result.message === 'conformance check' && + result.commentId !== undefined), + `dispatchPMAck must reach provider '${id}' and return undefined or a well-shaped AckResult; ` + + `got ${JSON.stringify(result)}`, + ).toBe(true); }); it('pmIntegration is wired (type matches id)', () => { From 8e68ff99a78f3d9901aa5df0f6275185596b031b Mon Sep 17 00:00:00 2001 From: Cascade Bot Date: Wed, 29 Apr 2026 11:36:04 +0000 Subject: [PATCH 15/22] fix(router): replace in-memory PM coalesce window with BullMQ delayed-job coalescing --- CLAUDE.md | 2 +- src/pm/coalesce-config.ts | 16 ++ src/pm/create-coalesce-window.ts | 95 --------- src/router/queue.ts | 53 +++++ src/router/webhook-processor.ts | 86 +++++--- src/triggers/jira/status-changed.ts | 1 - src/triggers/linear/status-changed.ts | 1 - src/types/index.ts | 12 +- src/worker-entry.ts | 83 +++++++- tests/integration/coalesce-bullmq.test.ts | 145 +++++++++++++ tests/unit/pm/create-coalesce-window.test.ts | 77 ------- tests/unit/router/queue.test.ts | 124 ++++++++++++ tests/unit/router/webhook-processor.test.ts | 190 ++++++++++++++---- .../unit/triggers/jira-status-changed.test.ts | 8 +- .../triggers/linear-status-changed.test.ts | 8 +- 15 files changed, 644 insertions(+), 257 deletions(-) create mode 100644 src/pm/coalesce-config.ts delete mode 100644 src/pm/create-coalesce-window.ts create mode 100644 tests/integration/coalesce-bullmq.test.ts delete mode 100644 tests/unit/pm/create-coalesce-window.test.ts create mode 100644 tests/unit/router/queue.test.ts diff --git a/CLAUDE.md b/CLAUDE.md index 40ec0bd0..078883df 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -173,7 +173,7 @@ Optional: - `CREDENTIAL_MASTER_KEY` — 64-char hex (AES-256 key) to encrypt project credentials at rest. Without it, credentials are stored as plaintext; both modes coexist. - `GITHUB_WEBHOOK_SECRET` — opt-in HMAC verification; store as the `webhook_secret` role on the GitHub SCM integration. - `SENTRY_DSN`, `SENTRY_ENVIRONMENT`, `SENTRY_RELEASE`, `SENTRY_TRACES_SAMPLE_RATE` — observability. -- `PM_CREATE_COALESCE_WINDOW_MS` — window (ms) the router waits after a PM `pm:status-changed` create trigger before enqueuing, so a follow-up `update` (same `${projectId}:${workItemId}`) can supersede it. Defaults to `2000`; `0` disables. Fixes JIRA's double-fire when an issue is created in a non-default workflow column (JIRA emits `issue_created` at the initial status, then `issue_updated` transitioning to the target). +- `PM_COALESCE_WINDOW_MS` — settle window (ms) for BullMQ delayed-job coalescing on `pm:status-changed` events. Any dispatch for the same `${projectId}:${workItemId}` within the window supersedes the prior pending dispatch, across agent types. Ack comment is deferred to job fire time to avoid orphaned comments on supersede. Defaults to `10000` (10 s); `0` disables. Fixes JIRA's double-fire when an issue is created in a non-default workflow column. 
The legacy name `PM_CREATE_COALESCE_WINDOW_MS` is still accepted as a fallback. **Project credentials (GitHub tokens, Trello/JIRA/Linear keys, LLM API keys) live in the `project_credentials` table.** The DB is the **sole source of truth** — there is no env var fallback for project-scoped secrets. diff --git a/src/pm/coalesce-config.ts b/src/pm/coalesce-config.ts new file mode 100644 index 00000000..c7ad6dde --- /dev/null +++ b/src/pm/coalesce-config.ts @@ -0,0 +1,16 @@ +/** + * Coalesce window configuration for PM status-change webhook dispatches. + * + * Reads `PM_COALESCE_WINDOW_MS` (or the legacy `PM_CREATE_COALESCE_WINDOW_MS` + * for backward compatibility). Default: 10 000 ms (10 s). + * + * Setting the env var to `0` disables coalescing entirely — all PM events are + * dispatched immediately without any delay-based deduplication. + */ +export function getCoalesceWindowMs(): number { + const raw = process.env.PM_COALESCE_WINDOW_MS ?? process.env.PM_CREATE_COALESCE_WINDOW_MS; + if (raw === undefined) return 10_000; + const n = Number.parseInt(raw, 10); + if (!Number.isFinite(n) || n < 0) return 10_000; + return n; +} diff --git a/src/pm/create-coalesce-window.ts b/src/pm/create-coalesce-window.ts deleted file mode 100644 index c5ab08b0..00000000 --- a/src/pm/create-coalesce-window.ts +++ /dev/null @@ -1,95 +0,0 @@ -/** - * Short-window coalescing for PM create→update webhook sequences. - * - * Problem: JIRA emits two webhooks when a user creates an issue in a non-default - * workflow column — `jira:issue_created` (with the workflow's initial status) - * followed ~hundreds of ms later by `jira:issue_updated` (transitioning to the - * target column). Without coalescing, both webhooks fire different agents on - * the same work item. - * - * This module lets a create trigger register a pending entry keyed by - * `${projectId}:${workItemId}`. An incoming update trigger for the same key - * clears the entry, superseding the create. If the window elapses with no - * update, the create proceeds normally. - * - * In-memory state is sufficient — a router restart during the ~2s window - * means the pending create is lost, but the update webhook (which arrives - * independently) will still fire. - */ - -type PendingEntry = { - timer: ReturnType; - resolve: (outcome: 'proceed' | 'superseded') => void; -}; - -const pending = new Map(); - -/** - * Register a pending create for the given key. Returns a promise that resolves - * after `ttlMs` with `'proceed'` if still pending, or earlier with - * `'superseded'` if `clearPendingCreate(key)` is called or another - * `registerPendingCreate(key, …)` supersedes it. - * - * `ttlMs === 0` resolves immediately to `'proceed'` (coalescing disabled). - */ -export function registerPendingCreate( - key: string, - ttlMs: number, -): Promise<'proceed' | 'superseded'> { - if (ttlMs <= 0) { - return Promise.resolve('proceed'); - } - - // Supersede any existing entry for the same key. - const existing = pending.get(key); - if (existing) { - clearTimeout(existing.timer); - existing.resolve('superseded'); - pending.delete(key); - } - - return new Promise((resolve) => { - const timer = setTimeout(() => { - const entry = pending.get(key); - if (entry && entry.resolve === resolve) { - pending.delete(key); - } - resolve('proceed'); - }, ttlMs); - pending.set(key, { timer, resolve }); - }); -} - -/** - * Clear a pending create, causing its registration promise to resolve with - * `'superseded'`. No-op if no entry exists for the key. 
- */ -export function clearPendingCreate(key: string): void { - const entry = pending.get(key); - if (!entry) return; - clearTimeout(entry.timer); - pending.delete(key); - entry.resolve('superseded'); -} - -/** - * Test-only: drop all pending entries without resolving their promises. - * Used by unit tests between cases to ensure isolation. - */ -export function __resetCoalesceWindowForTests(): void { - for (const entry of pending.values()) { - clearTimeout(entry.timer); - } - pending.clear(); -} - -/** - * Read the configured window duration in milliseconds. `0` disables coalescing. - */ -export function getCoalesceWindowMs(): number { - const raw = process.env.PM_CREATE_COALESCE_WINDOW_MS; - if (raw === undefined) return 2000; - const n = Number.parseInt(raw, 10); - if (!Number.isFinite(n) || n < 0) return 2000; - return n; -} diff --git a/src/router/queue.ts b/src/router/queue.ts index 3d1ea39a..0742e55e 100644 --- a/src/router/queue.ts +++ b/src/router/queue.ts @@ -21,6 +21,10 @@ export interface TrelloJob { receivedAt: string; ackCommentId?: string; triggerResult?: TriggerResult; + /** When true, the worker must post the ack comment before processing (deferred ack). */ + pendingAck?: boolean; + /** Pre-generated ack message text for deferred ack posting. */ + ackMessage?: string; } export interface GitHubJob { @@ -45,6 +49,10 @@ export interface JiraJob { receivedAt: string; ackCommentId?: string; triggerResult?: TriggerResult; + /** When true, the worker must post the ack comment before processing (deferred ack). */ + pendingAck?: boolean; + /** Pre-generated ack message text for deferred ack posting. */ + ackMessage?: string; } export interface SentryJob { @@ -68,6 +76,10 @@ export interface LinearJob { receivedAt: string; ackCommentId?: string; triggerResult?: TriggerResult; + /** When true, the worker must post the ack comment before processing (deferred ack). */ + pendingAck?: boolean; + /** Pre-generated ack message text for deferred ack posting. */ + ackMessage?: string; } export type CascadeJob = TrelloJob | GitHubJob | JiraJob | SentryJob | LinearJob; @@ -110,6 +122,47 @@ export async function addJob(job: CascadeJob): Promise { return result.id ?? jobId; } +export interface ScheduleCoalescedJobResult { + jobId: string; + superseded: boolean; +} + +/** + * Schedule a PM job as a BullMQ delayed job keyed by `coalesceKey`. + * + * If a delayed/waiting job with the same key already exists it is removed + * before the new job is added, superseding the previous dispatch. Active + * (already running) jobs are left untouched; `superseded` is `false` in that + * case. + * + * This replaces the in-memory `create-coalesce-window.ts` mechanism with a + * durable, per-key deduplication that coalesces across any agent types for + * the same `${projectId}:${workItemId}` within the settle window. + */ +export async function scheduleCoalescedJob( + job: CascadeJob, + coalesceKey: string, + delayMs: number, +): Promise { + const jobId = `coalesce:${coalesceKey}`; + let superseded = false; + + // Remove any existing delayed/waiting job with the same key so the new + // job supersedes it. Active jobs are left alone — they are already running. 
+ const existing = await jobQueue.getJob(jobId); + if (existing) { + const state = await existing.getState(); + if (state === 'delayed' || state === 'waiting') { + await existing.remove(); + superseded = true; + } + } + + await jobQueue.add(job.type, job, { jobId, delay: delayMs }); + logger.info('Coalesced job scheduled', { jobId, coalesceKey, delayMs, superseded }); + return { jobId, superseded }; +} + // Get queue stats export async function getQueueStats() { const [waiting, active, completed, failed] = await Promise.all([ diff --git a/src/router/webhook-processor.ts b/src/router/webhook-processor.ts index d2c9d486..9094ca2d 100644 --- a/src/router/webhook-processor.ts +++ b/src/router/webhook-processor.ts @@ -10,11 +10,7 @@ * from `pm/webhook-handler.ts` but for the router (enqueue-only) path. */ -import { - clearPendingCreate, - getCoalesceWindowMs, - registerPendingCreate, -} from '../pm/create-coalesce-window.js'; +import { getCoalesceWindowMs } from '../pm/coalesce-config.js'; import { captureException } from '../sentry.js'; import type { TriggerRegistry } from '../triggers/registry.js'; import { logger } from '../utils/logging.js'; @@ -26,7 +22,7 @@ import { } from './agent-type-lock.js'; import { classifyLockState } from './lock-state-classifier.js'; import type { RouterPlatformAdapter } from './platform-adapter.js'; -import { addJob } from './queue.js'; +import { addJob, scheduleCoalescedJob } from './queue.js'; import { isWorkItemLocked, markWorkItemEnqueued } from './work-item-lock.js'; export interface ProcessRouterWebhookResult { @@ -142,30 +138,74 @@ export async function processRouterWebhook( projectId: project.id, }); - // Step 7b: Coalesce PM create→update sequences. JIRA emits two webhooks when - // a user creates an issue in a non-default workflow column (initial status, - // then transition); without this, both fire different agents on the same - // work item. A 'create' trigger waits the coalesce window; an 'update' - // trigger arriving for the same key within the window supersedes it. - if (result.coalesceKey) { - if (result.coalesceRole === 'update') { - clearPendingCreate(result.coalesceKey); - } else if (result.coalesceRole === 'create') { - const windowMs = getCoalesceWindowMs(); - const outcome = await registerPendingCreate(result.coalesceKey, windowMs); - if (outcome === 'superseded') { - logger.info(`${adapter.type} create trigger superseded by follow-up update`, { - agentType: result.agentType, + // Step 7b: BullMQ delayed-job coalescing for PM status-change sequences. + // + // Any dispatch for the same coalesceKey (${projectId}:${workItemId}) within + // the settle window supersedes the prior pending dispatch — regardless of + // agent type or whether the event is a create vs. update. The ack comment + // is deferred to job fire time (pendingAck=true) so no orphaned ack comment + // is left behind when a job is superseded. + if (result.coalesceKey && result.agentType) { + const windowMs = getCoalesceWindowMs(); + if (windowMs > 0) { + // Build the job without ack info (ack will be posted at fire time). + const job = adapter.buildJob(event, payload, project, result, undefined); + + // Attach the deferred-ack marker and a pre-generated message so the + // worker does not need to re-derive the context. + if (job.type === 'trello' || job.type === 'jira' || job.type === 'linear') { + job.pendingAck = true; + // Use workItemTitle as context if available; generateAckMessage + // already has its own LLM-backed fallback path via the adapter. 
+ job.ackMessage = result.workItemTitle ?? undefined; + } + + // Schedule as a delayed BullMQ job; supersedes any prior pending job + // with the same key so only the latest event fires. + try { + const { superseded } = await scheduleCoalescedJob(job, result.coalesceKey, windowMs); + if (superseded) { + logger.info(`${adapter.type} coalesced dispatch superseded prior pending job`, { + agentType: result.agentType, + workItemId: result.workItemId, + projectId: project.id, + coalesceKey: result.coalesceKey, + }); + } else { + logger.info(`${adapter.type} coalesced dispatch scheduled`, { + agentType: result.agentType, + workItemId: result.workItemId, + projectId: project.id, + coalesceKey: result.coalesceKey, + delayMs: windowMs, + }); + } + } catch (err) { + result.onBlocked?.(); + logger.error(`Failed to schedule coalesced ${adapter.type} job`, { + error: String(err), + coalesceKey: result.coalesceKey, workItemId: result.workItemId, - projectId: project.id, }); - result.onBlocked?.(); return { shouldProcess: true, projectId: project.id, - decisionReason: 'Create trigger superseded by follow-up update (coalesce window)', + decisionReason: 'Failed to schedule coalesced job to Redis', }; } + + // Mark locks exactly as the non-coalesced path does. + if (result.workItemId) { + markWorkItemEnqueued(project.id, result.workItemId, result.agentType); + } + markRecentlyDispatched(project.id, result.agentType, result.workItemId); + markAgentTypeEnqueued(project.id, result.agentType); + + return { + shouldProcess: true, + projectId: project.id, + decisionReason: `Coalesced dispatch scheduled: ${result.agentType} agent for work item ${result.workItemId ?? '(unknown)'}`, + }; } } diff --git a/src/triggers/jira/status-changed.ts b/src/triggers/jira/status-changed.ts index efaafc79..0bbd72aa 100644 --- a/src/triggers/jira/status-changed.ts +++ b/src/triggers/jira/status-changed.ts @@ -160,7 +160,6 @@ export class JiraStatusChangedTrigger implements TriggerHandler { workItemUrl, workItemTitle, coalesceKey: `${ctx.project.id}:${issueKey}`, - coalesceRole: isCreate ? 'create' : 'update', }; } } diff --git a/src/triggers/linear/status-changed.ts b/src/triggers/linear/status-changed.ts index 47e7d8d5..26af52f5 100644 --- a/src/triggers/linear/status-changed.ts +++ b/src/triggers/linear/status-changed.ts @@ -152,7 +152,6 @@ export class LinearStatusChangedTrigger implements TriggerHandler { workItemUrl, workItemTitle, coalesceKey: `${ctx.project.id}:${workItemId}`, - coalesceRole: isCreate ? 'create' : 'update', }; } } diff --git a/src/types/index.ts b/src/types/index.ts index 2f4333d5..6a22fcc7 100644 --- a/src/types/index.ts +++ b/src/types/index.ts @@ -113,17 +113,17 @@ export interface TriggerResult { * Allows the trigger handler to undo side-effects like dedup marking. */ onBlocked?: () => void; /** - * Coalesce key for handling PM provider create→update webhook sequences. + * Coalesce key for PM status-change webhook deduplication. * - * Set on `pm:status-changed` triggers where the event kind matters. When - * `coalesceRole === 'create'`, the router defers dispatch by the - * `PM_CREATE_COALESCE_WINDOW_MS` window; an incoming `'update'` event - * sharing the same key within the window supersedes the create. + * Set on `pm:status-changed` triggers. When present, the router schedules + * the job as a BullMQ delayed job keyed by this value. 
Any subsequent + * event sharing the same key within the `PM_COALESCE_WINDOW_MS` window + * supersedes the prior pending dispatch — regardless of agent type or + * whether the event is a create vs. update. * * Typical key: `${projectId}:${workItemId}`. */ coalesceKey?: string; - coalesceRole?: 'create' | 'update'; } export interface TriggerHandler { diff --git a/src/worker-entry.ts b/src/worker-entry.ts index 8a0480f9..9525229d 100644 --- a/src/worker-entry.ts +++ b/src/worker-entry.ts @@ -22,6 +22,7 @@ import { registerBuiltInEngines } from './backends/bootstrap.js'; import { loadEnvConfigSafe } from './config/env.js'; import { loadConfig } from './config/provider.js'; import { getDb } from './db/client.js'; +import { dispatchPMAck } from './router/pm-ack-dispatch.js'; import { captureException, flush, setTag } from './sentry.js'; import { createTriggerRegistry, @@ -47,6 +48,10 @@ export interface TrelloJobData { receivedAt: string; ackCommentId?: string; triggerResult?: TriggerResult; + /** When true, the worker must post the ack comment before processing (deferred ack). */ + pendingAck?: boolean; + /** Pre-generated ack message text for deferred ack posting. */ + ackMessage?: string; } export interface GitHubJobData { @@ -71,6 +76,10 @@ export interface JiraJobData { receivedAt: string; ackCommentId?: string; triggerResult?: TriggerResult; + /** When true, the worker must post the ack comment before processing (deferred ack). */ + pendingAck?: boolean; + /** Pre-generated ack message text for deferred ack posting. */ + ackMessage?: string; } export interface SentryJobData { @@ -95,6 +104,10 @@ export interface LinearJobData { receivedAt: string; ackCommentId?: string; triggerResult?: TriggerResult; + /** When true, the worker must post the ack comment before processing (deferred ack). */ + pendingAck?: boolean; + /** Pre-generated ack message text for deferred ack posting. */ + ackMessage?: string; } export interface ManualRunJobData { @@ -181,27 +194,47 @@ export async function processDashboardJob(jobId: string, jobData: DashboardJobDa } } +// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: webhook dispatch pipeline with deferred ack for coalesced PM jobs export async function dispatchJob( jobId: string, jobData: JobData, triggerRegistry: TriggerRegistry, ): Promise { switch (jobData.type) { - case 'trello': + case 'trello': { logger.info('[Worker] Processing Trello job', { jobId, workItemId: jobData.workItemId, actionType: jobData.actionType, ackCommentId: jobData.ackCommentId, + pendingAck: jobData.pendingAck, hasTriggerResult: !!jobData.triggerResult, }); + // Deferred ack: post the ack comment that was skipped at schedule time. + let trelloAckCommentId = jobData.ackCommentId; + if (jobData.pendingAck) { + const ackResult = await dispatchPMAck({ + projectId: jobData.projectId, + workItemId: jobData.workItemId, + pmType: 'trello', + message: jobData.ackMessage ?? '✍️ On it', + agentType: jobData.triggerResult?.agentType ?? 
undefined, + }).catch((err) => { + logger.warn('[Worker] Deferred Trello ack failed (non-fatal)', { error: String(err) }); + return undefined; + }); + if (ackResult?.commentId != null) { + trelloAckCommentId = String(ackResult.commentId); + } + } await processTrelloWebhook( jobData.payload, triggerRegistry, - jobData.ackCommentId, + trelloAckCommentId, jobData.triggerResult, ); break; + } case 'github': logger.info('[Worker] Processing GitHub job', { jobId, @@ -219,21 +252,40 @@ export async function dispatchJob( jobData.triggerResult, ); break; - case 'jira': + case 'jira': { logger.info('[Worker] Processing JIRA job', { jobId, issueKey: jobData.issueKey, webhookEvent: jobData.webhookEvent, ackCommentId: jobData.ackCommentId, + pendingAck: jobData.pendingAck, hasTriggerResult: !!jobData.triggerResult, }); + // Deferred ack: post the ack comment that was skipped at schedule time. + let jiraAckCommentId = jobData.ackCommentId; + if (jobData.pendingAck) { + const ackResult = await dispatchPMAck({ + projectId: jobData.projectId, + workItemId: jobData.issueKey, + pmType: 'jira', + message: jobData.ackMessage ?? '✍️ On it', + agentType: jobData.triggerResult?.agentType ?? undefined, + }).catch((err) => { + logger.warn('[Worker] Deferred JIRA ack failed (non-fatal)', { error: String(err) }); + return undefined; + }); + if (ackResult?.commentId != null) { + jiraAckCommentId = String(ackResult.commentId); + } + } await processJiraWebhook( jobData.payload, triggerRegistry, - jobData.ackCommentId, + jiraAckCommentId, jobData.triggerResult, ); break; + } case 'sentry': logger.info('[Worker] Processing Sentry job', { jobId, @@ -248,22 +300,41 @@ export async function dispatchJob( jobData.triggerResult, ); break; - case 'linear': + case 'linear': { logger.info('[Worker] Processing Linear job', { jobId, projectId: jobData.projectId, workItemId: jobData.workItemId, eventType: jobData.eventType, ackCommentId: jobData.ackCommentId, + pendingAck: jobData.pendingAck, hasTriggerResult: !!jobData.triggerResult, }); + // Deferred ack: post the ack comment that was skipped at schedule time. + let linearAckCommentId = jobData.ackCommentId; + if (jobData.pendingAck && jobData.workItemId) { + const ackResult = await dispatchPMAck({ + projectId: jobData.projectId, + workItemId: jobData.workItemId, + pmType: 'linear', + message: jobData.ackMessage ?? '✍️ On it', + agentType: jobData.triggerResult?.agentType ?? undefined, + }).catch((err) => { + logger.warn('[Worker] Deferred Linear ack failed (non-fatal)', { error: String(err) }); + return undefined; + }); + if (ackResult?.commentId != null) { + linearAckCommentId = String(ackResult.commentId); + } + } await processLinearWebhook( jobData.payload, triggerRegistry, - jobData.ackCommentId, + linearAckCommentId, jobData.triggerResult, ); break; + } case 'manual-run': case 'retry-run': case 'debug-analysis': diff --git a/tests/integration/coalesce-bullmq.test.ts b/tests/integration/coalesce-bullmq.test.ts new file mode 100644 index 00000000..2a176e2d --- /dev/null +++ b/tests/integration/coalesce-bullmq.test.ts @@ -0,0 +1,145 @@ +/** + * Integration test for BullMQ delayed-job coalescing (spec — PM coalesce). + * + * Tests that `scheduleCoalescedJob` correctly supersedes prior pending + * delayed jobs in a real BullMQ Queue backed by a real Redis instance. + * + * These tests require a running Redis server. They use a dedicated test + * queue name to avoid interfering with the production cascade-jobs queue. 
+ */ + +import { Queue } from 'bullmq'; +import { afterAll, afterEach, beforeAll, describe, expect, it } from 'vitest'; +import { parseRedisUrl } from '../../src/utils/redis.js'; + +// --------------------------------------------------------------------------- +// Test queue — isolated from the production 'cascade-jobs' queue. +// --------------------------------------------------------------------------- + +const TEST_QUEUE_NAME = 'cascade-test-coalesce'; +const connection = parseRedisUrl(process.env.REDIS_URL ?? 'redis://localhost:6379'); +let testQueue: Queue; + +beforeAll(async () => { + testQueue = new Queue(TEST_QUEUE_NAME, { connection }); + // Drain any stale jobs from a previous test run. + await testQueue.drain(); + await testQueue.clean(0, 100, 'delayed'); + await testQueue.clean(0, 100, 'wait'); + await testQueue.clean(0, 100, 'completed'); + await testQueue.clean(0, 100, 'failed'); +}); + +afterEach(async () => { + // Clean up between test cases. + await testQueue.drain(); + await testQueue.clean(0, 100, 'delayed'); + await testQueue.clean(0, 100, 'wait'); +}); + +afterAll(async () => { + await testQueue.close(); +}); + +// --------------------------------------------------------------------------- +// Local version of scheduleCoalescedJob that targets the test queue. +// --------------------------------------------------------------------------- + +async function scheduleOnTestQueue( + jobData: Record, + coalesceKey: string, + delayMs: number, +): Promise<{ jobId: string; superseded: boolean }> { + const jobId = `coalesce:${coalesceKey}`; + let superseded = false; + + const existing = await testQueue.getJob(jobId); + if (existing) { + const state = await existing.getState(); + if (state === 'delayed' || state === 'waiting') { + await existing.remove(); + superseded = true; + } + } + + await testQueue.add('test', jobData, { jobId, delay: delayMs }); + return { jobId, superseded }; +} + +// --------------------------------------------------------------------------- +// Tests +// --------------------------------------------------------------------------- + +describe('scheduleCoalescedJob — real BullMQ delayed-job supersede', () => { + it('schedules a new delayed job when none exists', async () => { + const { jobId, superseded } = await scheduleOnTestQueue( + { type: 'jira', issueKey: 'PROJ-1' }, + 'test-project:PROJ-1', + 60_000, // 1-minute delay so the job doesn't fire during the test + ); + + expect(jobId).toBe('coalesce:test-project:PROJ-1'); + expect(superseded).toBe(false); + + const job = await testQueue.getJob(jobId); + expect(job).not.toBeNull(); + const state = await job?.getState(); + expect(state).toBe('delayed'); + }); + + it('supersedes a prior delayed job with the same coalesceKey', async () => { + // First dispatch (create event). + const first = await scheduleOnTestQueue( + { type: 'jira', issueKey: 'PROJ-2', agentType: 'implementation' }, + 'test-project:PROJ-2', + 60_000, + ); + expect(first.superseded).toBe(false); + + // Second dispatch (update event — same key, should supersede first). + const second = await scheduleOnTestQueue( + { type: 'jira', issueKey: 'PROJ-2', agentType: 'planning' }, + 'test-project:PROJ-2', + 60_000, + ); + expect(second.superseded).toBe(true); + expect(second.jobId).toBe('coalesce:test-project:PROJ-2'); + + // Only one delayed job should exist; its data should be the latest. 
+ const job = await testQueue.getJob('coalesce:test-project:PROJ-2'); + expect(job).not.toBeNull(); + expect((job?.data as { agentType?: string }).agentType).toBe('planning'); + }); + + it('different coalesceKeys do not interfere with each other', async () => { + const resultA = await scheduleOnTestQueue( + { type: 'jira', issueKey: 'PROJ-3' }, + 'project-a:PROJ-3', + 60_000, + ); + const resultB = await scheduleOnTestQueue( + { type: 'jira', issueKey: 'PROJ-4' }, + 'project-b:PROJ-4', + 60_000, + ); + + expect(resultA.superseded).toBe(false); + expect(resultB.superseded).toBe(false); + + // Both jobs should exist independently. + const jobA = await testQueue.getJob('coalesce:project-a:PROJ-3'); + const jobB = await testQueue.getJob('coalesce:project-b:PROJ-4'); + expect(jobA).not.toBeNull(); + expect(jobB).not.toBeNull(); + }); + + it('triple supersede: last writer wins', async () => { + await scheduleOnTestQueue({ agentType: 'splitting' }, 'proj:TRIPLE', 60_000); + await scheduleOnTestQueue({ agentType: 'planning' }, 'proj:TRIPLE', 60_000); + const third = await scheduleOnTestQueue({ agentType: 'implementation' }, 'proj:TRIPLE', 60_000); + + expect(third.superseded).toBe(true); + const job = await testQueue.getJob('coalesce:proj:TRIPLE'); + expect((job?.data as { agentType?: string }).agentType).toBe('implementation'); + }); +}); diff --git a/tests/unit/pm/create-coalesce-window.test.ts b/tests/unit/pm/create-coalesce-window.test.ts deleted file mode 100644 index ce5c098e..00000000 --- a/tests/unit/pm/create-coalesce-window.test.ts +++ /dev/null @@ -1,77 +0,0 @@ -import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; - -import { - __resetCoalesceWindowForTests, - clearPendingCreate, - registerPendingCreate, -} from '../../../src/pm/create-coalesce-window.js'; - -describe('create-coalesce-window', () => { - beforeEach(() => { - vi.useFakeTimers(); - __resetCoalesceWindowForTests(); - }); - - afterEach(() => { - vi.useRealTimers(); - }); - - it('resolves to "proceed" after the window elapses with no follow-up', async () => { - const promise = registerPendingCreate('proj-1:ITEM-1', 2000); - await vi.advanceTimersByTimeAsync(2000); - await expect(promise).resolves.toBe('proceed'); - }); - - it('resolves to "superseded" when clearPendingCreate is called within the window', async () => { - const promise = registerPendingCreate('proj-1:ITEM-1', 2000); - await vi.advanceTimersByTimeAsync(500); - clearPendingCreate('proj-1:ITEM-1'); - await expect(promise).resolves.toBe('superseded'); - }); - - it('honors the window duration — does not resolve early', async () => { - const promise = registerPendingCreate('proj-1:ITEM-1', 2000); - const outcome: Array<'proceed' | 'superseded'> = []; - promise.then((v) => outcome.push(v)); - - await vi.advanceTimersByTimeAsync(1999); - expect(outcome).toEqual([]); - - await vi.advanceTimersByTimeAsync(1); - await promise; - expect(outcome).toEqual(['proceed']); - }); - - it('keys are isolated — clearing one key does not affect another', async () => { - const p1 = registerPendingCreate('proj-1:ITEM-1', 2000); - const p2 = registerPendingCreate('proj-1:ITEM-2', 2000); - - clearPendingCreate('proj-1:ITEM-1'); - expect(await p1).toBe('superseded'); - - await vi.advanceTimersByTimeAsync(2000); - expect(await p2).toBe('proceed'); - }); - - it('registering a second create for the same key supersedes the first', async () => { - const first = registerPendingCreate('proj-1:ITEM-1', 2000); - const second = registerPendingCreate('proj-1:ITEM-1', 2000); - - 
expect(await first).toBe('superseded'); - - await vi.advanceTimersByTimeAsync(2000); - expect(await second).toBe('proceed'); - }); - - it('ttlMs of 0 resolves immediately to "proceed" without registering state', async () => { - const promise = registerPendingCreate('proj-1:ITEM-1', 0); - // No timer advance needed; microtask flush. - await expect(promise).resolves.toBe('proceed'); - // A subsequent clear is a no-op (no pending entry). - expect(() => clearPendingCreate('proj-1:ITEM-1')).not.toThrow(); - }); - - it('clearPendingCreate on unknown key is a no-op', () => { - expect(() => clearPendingCreate('never-registered')).not.toThrow(); - }); -}); diff --git a/tests/unit/router/queue.test.ts b/tests/unit/router/queue.test.ts new file mode 100644 index 00000000..96b99912 --- /dev/null +++ b/tests/unit/router/queue.test.ts @@ -0,0 +1,124 @@ +import { beforeEach, describe, expect, it, vi } from 'vitest'; + +// --------------------------------------------------------------------------- +// Mock BullMQ + redis utils so the module can be imported without a real Redis. +// vi.hoisted() runs before vi.mock() factories so mock instances are available +// inside factory closures. +// --------------------------------------------------------------------------- + +const { mockJobInstance, mockQueueInstance } = vi.hoisted(() => { + const mockJobInstance = { + getState: vi.fn(), + remove: vi.fn(), + }; + const mockQueueInstance = { + on: vi.fn(), + add: vi.fn().mockResolvedValue({ id: 'test-job-id' }), + getJob: vi.fn().mockResolvedValue(null), + getWaitingCount: vi.fn().mockResolvedValue(0), + getActiveCount: vi.fn().mockResolvedValue(0), + getCompletedCount: vi.fn().mockResolvedValue(0), + getFailedCount: vi.fn().mockResolvedValue(0), + }; + return { mockJobInstance, mockQueueInstance }; +}); + +vi.mock('bullmq', () => ({ + Queue: vi.fn().mockImplementation(() => mockQueueInstance), +})); + +vi.mock('../../../src/utils/redis.js', () => ({ + parseRedisUrl: vi.fn().mockReturnValue({}), +})); + +vi.mock('../../../src/router/config.js', () => ({ + routerConfig: { redisUrl: 'redis://localhost:6379' }, +})); + +vi.mock('../../../src/utils/logging.js', () => ({ + logger: { + info: vi.fn(), + warn: vi.fn(), + error: vi.fn(), + debug: vi.fn(), + }, +})); + +vi.mock('../../../src/sentry.js', () => ({ + captureException: vi.fn(), +})); + +import type { CascadeJob } from '../../../src/router/queue.js'; +import { scheduleCoalescedJob } from '../../../src/router/queue.js'; + +const sampleJob: CascadeJob = { + type: 'jira', + source: 'jira', + payload: {}, + projectId: 'proj-1', + issueKey: 'PROJ-42', + webhookEvent: 'jira:issue_created', + receivedAt: new Date().toISOString(), +}; + +describe('scheduleCoalescedJob', () => { + beforeEach(() => { + mockQueueInstance.getJob.mockResolvedValue(null); + mockQueueInstance.add.mockResolvedValue({ id: 'coalesce:proj-1:PROJ-42' }); + mockJobInstance.getState.mockReset(); + mockJobInstance.remove.mockReset(); + }); + + it('schedules a new delayed job when no existing job exists', async () => { + mockQueueInstance.getJob.mockResolvedValue(null); + + const result = await scheduleCoalescedJob(sampleJob, 'proj-1:PROJ-42', 10_000); + + expect(result.jobId).toBe('coalesce:proj-1:PROJ-42'); + expect(result.superseded).toBe(false); + expect(mockQueueInstance.add).toHaveBeenCalledWith( + 'jira', + sampleJob, + expect.objectContaining({ jobId: 'coalesce:proj-1:PROJ-42', delay: 10_000 }), + ); + }); + + it('removes existing delayed job and returns superseded=true', async () => { + 
mockJobInstance.getState.mockResolvedValue('delayed'); + mockJobInstance.remove.mockResolvedValue(undefined); + mockQueueInstance.getJob.mockResolvedValue(mockJobInstance); + + const result = await scheduleCoalescedJob(sampleJob, 'proj-1:PROJ-42', 10_000); + + expect(result.superseded).toBe(true); + expect(mockJobInstance.remove).toHaveBeenCalledOnce(); + expect(mockQueueInstance.add).toHaveBeenCalledWith( + 'jira', + sampleJob, + expect.objectContaining({ jobId: 'coalesce:proj-1:PROJ-42', delay: 10_000 }), + ); + }); + + it('does not remove an active (running) job and returns superseded=false', async () => { + mockJobInstance.getState.mockResolvedValue('active'); + mockJobInstance.remove.mockResolvedValue(undefined); + mockQueueInstance.getJob.mockResolvedValue(mockJobInstance); + + const result = await scheduleCoalescedJob(sampleJob, 'proj-1:PROJ-42', 10_000); + + expect(result.superseded).toBe(false); + expect(mockJobInstance.remove).not.toHaveBeenCalled(); + // Still adds the new job even if an active job exists with same ID + expect(mockQueueInstance.add).toHaveBeenCalled(); + }); + + it('uses the coalesceKey to derive the BullMQ job ID', async () => { + const result = await scheduleCoalescedJob(sampleJob, 'my-project:ISSUE-99', 5_000); + expect(result.jobId).toBe('coalesce:my-project:ISSUE-99'); + expect(mockQueueInstance.add).toHaveBeenCalledWith( + expect.any(String), + expect.anything(), + expect.objectContaining({ jobId: 'coalesce:my-project:ISSUE-99' }), + ); + }); +}); diff --git a/tests/unit/router/webhook-processor.test.ts b/tests/unit/router/webhook-processor.test.ts index 3c7e4701..1a21624b 100644 --- a/tests/unit/router/webhook-processor.test.ts +++ b/tests/unit/router/webhook-processor.test.ts @@ -1,4 +1,4 @@ -import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; +import { beforeEach, describe, expect, it, vi } from 'vitest'; vi.mock('../../../src/utils/logging.js', () => ({ logger: { @@ -10,6 +10,10 @@ vi.mock('../../../src/utils/logging.js', () => ({ })); vi.mock('../../../src/router/queue.js', () => ({ addJob: vi.fn(), + scheduleCoalescedJob: vi.fn().mockResolvedValue({ jobId: 'coalesce:key', superseded: false }), +})); +vi.mock('../../../src/pm/coalesce-config.js', () => ({ + getCoalesceWindowMs: vi.fn().mockReturnValue(10_000), })); vi.mock('../../../src/router/work-item-lock.js', () => ({ isWorkItemLocked: vi.fn().mockResolvedValue({ locked: false }), @@ -31,13 +35,18 @@ vi.mock('../../../src/sentry.js', () => ({ captureException: vi.fn(), })); +import { getCoalesceWindowMs } from '../../../src/pm/coalesce-config.js'; import { isDuplicateAction, markActionProcessed } from '../../../src/router/action-dedup.js'; -import { checkAgentTypeConcurrency } from '../../../src/router/agent-type-lock.js'; +import { + checkAgentTypeConcurrency, + markAgentTypeEnqueued, + markRecentlyDispatched, +} from '../../../src/router/agent-type-lock.js'; import type { RouterProjectConfig } from '../../../src/router/config.js'; import { classifyLockState } from '../../../src/router/lock-state-classifier.js'; import type { RouterPlatformAdapter } from '../../../src/router/platform-adapter.js'; import type { CascadeJob } from '../../../src/router/queue.js'; -import { addJob } from '../../../src/router/queue.js'; +import { addJob, scheduleCoalescedJob } from '../../../src/router/queue.js'; import { processRouterWebhook } from '../../../src/router/webhook-processor.js'; import { isWorkItemLocked, markWorkItemEnqueued } from '../../../src/router/work-item-lock.js'; import { 
captureException } from '../../../src/sentry.js'; @@ -555,77 +564,180 @@ describe('processRouterWebhook', () => { expect(markWorkItemEnqueued).not.toHaveBeenCalled(); }); - describe('create→update coalesce', () => { - // Use real timers + short window so the create resolves quickly when not superseded. - const origWindow = process.env.PM_CREATE_COALESCE_WINDOW_MS; - + describe('BullMQ delayed-job coalescing', () => { beforeEach(() => { - process.env.PM_CREATE_COALESCE_WINDOW_MS = '50'; - }); - afterEach(() => { - if (origWindow === undefined) delete process.env.PM_CREATE_COALESCE_WINDOW_MS; - else process.env.PM_CREATE_COALESCE_WINDOW_MS = origWindow; + vi.mocked(scheduleCoalescedJob).mockResolvedValue({ + jobId: 'coalesce:p1:PROJ-1', + superseded: false, + }); + vi.mocked(getCoalesceWindowMs).mockReturnValue(10_000); }); - it('supersedes a create when an update with the same coalesceKey arrives within the window', async () => { - vi.mocked(addJob).mockResolvedValue('job-x'); - const createAdapter = makeMockAdapter({ + it('schedules a coalesced delayed job when coalesceKey is present', async () => { + const adapter = makeMockAdapter({ type: 'jira', dispatchWithCredentials: vi.fn().mockResolvedValue({ agentType: 'implementation', agentInput: { workItemId: 'PROJ-1' }, workItemId: 'PROJ-1', coalesceKey: 'p1:PROJ-1', - coalesceRole: 'create', }), }); - const updateAdapter = makeMockAdapter({ + + const result = await processRouterWebhook(adapter, {}, mockTriggerRegistry); + + expect(result.shouldProcess).toBe(true); + expect(result.decisionReason).toMatch(/Coalesced dispatch scheduled/); + expect(scheduleCoalescedJob).toHaveBeenCalledOnce(); + // Immediate addJob must NOT be called for coalesced path + expect(addJob).not.toHaveBeenCalled(); + }); + + it('does not post ack immediately for coalesced jobs (deferred to fire time)', async () => { + const adapter = makeMockAdapter({ type: 'jira', dispatchWithCredentials: vi.fn().mockResolvedValue({ - agentType: 'planning', + agentType: 'implementation', agentInput: { workItemId: 'PROJ-1' }, workItemId: 'PROJ-1', coalesceKey: 'p1:PROJ-1', - coalesceRole: 'update', }), }); - // Fire create (will wait 50ms) and update (resolves immediately, supersedes create) - const createPromise = processRouterWebhook(createAdapter, {}, mockTriggerRegistry); - // Let microtasks settle so the create registers before we dispatch the update. - await Promise.resolve(); - const updateResult = await processRouterWebhook(updateAdapter, {}, mockTriggerRegistry); - const createResult = await createPromise; + await processRouterWebhook(adapter, {}, mockTriggerRegistry); + + // postAck must NOT be called at schedule time for coalesced jobs + expect(adapter.postAck).not.toHaveBeenCalled(); + }); + + it('marks work-item lock when coalesced job is scheduled', async () => { + const adapter = makeMockAdapter({ + type: 'jira', + dispatchWithCredentials: vi.fn().mockResolvedValue({ + agentType: 'implementation', + agentInput: { workItemId: 'PROJ-1' }, + workItemId: 'PROJ-1', + coalesceKey: 'p1:PROJ-1', + }), + }); - expect(createResult.decisionReason).toBe( - 'Create trigger superseded by follow-up update (coalesce window)', - ); - expect(updateResult.shouldProcess).toBe(true); + await processRouterWebhook(adapter, {}, mockTriggerRegistry); - // Only one job should have been queued (for the update), not two. 
- expect(addJob).toHaveBeenCalledTimes(1); - expect(createAdapter.postAck).not.toHaveBeenCalled(); - expect(updateAdapter.postAck).toHaveBeenCalled(); + expect(markWorkItemEnqueued).toHaveBeenCalledWith('p1', 'PROJ-1', 'implementation'); + expect(markRecentlyDispatched).toHaveBeenCalled(); + expect(markAgentTypeEnqueued).toHaveBeenCalled(); }); - it('proceeds with a create when no update arrives within the window', async () => { - vi.mocked(addJob).mockResolvedValue('job-solo'); + it('logs supersede when prior delayed job is replaced (UA-21 regression)', async () => { + vi.mocked(scheduleCoalescedJob).mockResolvedValue({ + jobId: 'coalesce:p1:PROJ-1', + superseded: true, + }); + const { logger } = await import('../../../src/utils/logging.js'); const adapter = makeMockAdapter({ type: 'jira', dispatchWithCredentials: vi.fn().mockResolvedValue({ agentType: 'planning', + agentInput: { workItemId: 'PROJ-1' }, + workItemId: 'PROJ-1', + coalesceKey: 'p1:PROJ-1', + }), + }); + + await processRouterWebhook(adapter, {}, mockTriggerRegistry); + + const infoCall = vi + .mocked(logger.info) + .mock.calls.find((c) => String(c[0]).includes('superseded prior pending job')); + expect(infoCall).toBeDefined(); + }); + + it('falls back to normal dispatch when PM_COALESCE_WINDOW_MS=0 (disable)', async () => { + vi.mocked(getCoalesceWindowMs).mockReturnValue(0); + vi.mocked(addJob).mockResolvedValue('job-immediate'); + const adapter = makeMockAdapter({ + type: 'jira', + dispatchWithCredentials: vi.fn().mockResolvedValue({ + agentType: 'implementation', agentInput: { workItemId: 'PROJ-2' }, workItemId: 'PROJ-2', coalesceKey: 'p1:PROJ-2', - coalesceRole: 'create', }), }); const result = await processRouterWebhook(adapter, {}, mockTriggerRegistry); - expect(result.shouldProcess).toBe(true); - expect(addJob).toHaveBeenCalledTimes(1); - expect(adapter.postAck).toHaveBeenCalled(); + // Window=0 → normal path: scheduleCoalescedJob not called, addJob called + expect(scheduleCoalescedJob).not.toHaveBeenCalled(); + expect(addJob).toHaveBeenCalled(); + expect(result.decisionReason).toMatch(/Job queued/); + }); + + it('coalesce isolation: different coalesceKeys do not interfere', async () => { + vi.mocked(addJob).mockResolvedValue('job-y'); + const adapterA = makeMockAdapter({ + type: 'jira', + dispatchWithCredentials: vi.fn().mockResolvedValue({ + agentType: 'implementation', + agentInput: { workItemId: 'PROJ-10' }, + workItemId: 'PROJ-10', + coalesceKey: 'p1:PROJ-10', + }), + }); + const adapterB = makeMockAdapter({ + type: 'jira', + dispatchWithCredentials: vi.fn().mockResolvedValue({ + agentType: 'implementation', + agentInput: { workItemId: 'PROJ-20' }, + workItemId: 'PROJ-20', + coalesceKey: 'p1:PROJ-20', + }), + }); + + await processRouterWebhook(adapterA, {}, mockTriggerRegistry); + await processRouterWebhook(adapterB, {}, mockTriggerRegistry); + + // scheduleCoalescedJob called once per distinct key + expect(scheduleCoalescedJob).toHaveBeenCalledTimes(2); + expect(vi.mocked(scheduleCoalescedJob).mock.calls[0][1]).toBe('p1:PROJ-10'); + expect(vi.mocked(scheduleCoalescedJob).mock.calls[1][1]).toBe('p1:PROJ-20'); + }); + + it('returns error reason when scheduleCoalescedJob throws', async () => { + vi.mocked(scheduleCoalescedJob).mockRejectedValue(new Error('Redis down')); + const onBlocked = vi.fn(); + const adapter = makeMockAdapter({ + type: 'jira', + dispatchWithCredentials: vi.fn().mockResolvedValue({ + agentType: 'implementation', + agentInput: { workItemId: 'PROJ-3' }, + workItemId: 'PROJ-3', + coalesceKey: 
'p1:PROJ-3', + onBlocked, + }), + }); + + const result = await processRouterWebhook(adapter, {}, mockTriggerRegistry); + + expect(result.decisionReason).toBe('Failed to schedule coalesced job to Redis'); + expect(onBlocked).toHaveBeenCalledOnce(); + expect(addJob).not.toHaveBeenCalled(); + }); + + it('skips coalesce path when no agentType (no-agent triggers)', async () => { + const adapter = makeMockAdapter({ + type: 'jira', + dispatchWithCredentials: vi.fn().mockResolvedValue({ + agentType: null, + agentInput: {}, + coalesceKey: 'p1:PROJ-99', + }), + }); + + const result = await processRouterWebhook(adapter, {}, mockTriggerRegistry); + + expect(scheduleCoalescedJob).not.toHaveBeenCalled(); + expect(result.decisionReason).toBe('Trigger completed without agent (PM operation)'); }); }); }); diff --git a/tests/unit/triggers/jira-status-changed.test.ts b/tests/unit/triggers/jira-status-changed.test.ts index 7fca6c4a..e46a00f1 100644 --- a/tests/unit/triggers/jira-status-changed.test.ts +++ b/tests/unit/triggers/jira-status-changed.test.ts @@ -338,17 +338,17 @@ describe('JiraStatusChangedTrigger', () => { }); describe('coalesce metadata', () => { - it('tags move results with coalesceRole: "update" and a project-scoped key', async () => { + it('tags move results with a project-scoped coalesceKey (no coalesceRole)', async () => { const ctx = buildCtx({ statusChangeItems: [{ field: 'status', fromString: 'Backlog', toString: 'Splitting' }], }); const result = await trigger.handle(ctx); expect(result?.coalesceKey).toBe('test-project:PROJ-42'); - expect(result?.coalesceRole).toBe('update'); + expect(result).not.toHaveProperty('coalesceRole'); }); - it('tags create results with coalesceRole: "create" and a project-scoped key', async () => { + it('tags create results with a project-scoped coalesceKey (no coalesceRole)', async () => { mockTriggerConfig(true, { onCreate: true, onMove: true }); const ctx = buildCtx({ webhookEvent: 'jira:issue_created', @@ -357,7 +357,7 @@ describe('JiraStatusChangedTrigger', () => { const result = await trigger.handle(ctx); expect(result?.coalesceKey).toBe('test-project:PROJ-42'); - expect(result?.coalesceRole).toBe('create'); + expect(result).not.toHaveProperty('coalesceRole'); }); }); }); diff --git a/tests/unit/triggers/linear-status-changed.test.ts b/tests/unit/triggers/linear-status-changed.test.ts index 6393b5ed..f1833300 100644 --- a/tests/unit/triggers/linear-status-changed.test.ts +++ b/tests/unit/triggers/linear-status-changed.test.ts @@ -352,19 +352,19 @@ describe('LinearStatusChangedTrigger', () => { }); describe('coalesce metadata', () => { - it('tags move results with coalesceRole: "update" and a project-scoped key', async () => { + it('tags move results with a project-scoped coalesceKey (no coalesceRole)', async () => { const result = await trigger.handle(buildCtx({ newStateId: 'state-todo' })); expect(result?.coalesceKey).toBe('proj-linear:TEAM-123'); - expect(result?.coalesceRole).toBe('update'); + expect(result).not.toHaveProperty('coalesceRole'); }); - it('tags create results with coalesceRole: "create" and a project-scoped key', async () => { + it('tags create results with a project-scoped coalesceKey (no coalesceRole)', async () => { mockTriggerConfig(true, { onCreate: true, onMove: true }); const result = await trigger.handle( buildCtx({ action: 'create', newStateId: 'state-todo', noUpdatedFrom: true }), ); expect(result?.coalesceKey).toBe('proj-linear:TEAM-123'); - expect(result?.coalesceRole).toBe('create'); + 
expect(result).not.toHaveProperty('coalesceRole'); }); }); }); From 73e7e91b439c3d466ac67837b6140e0a2ac3033f Mon Sep 17 00:00:00 2001 From: Cascade Bot Date: Wed, 29 Apr 2026 12:06:29 +0000 Subject: [PATCH 16/22] ci: add Redis service to integration-tests job The new tests/integration/coalesce-bullmq.test.ts requires a live Redis instance to verify BullMQ delayed-job supersede semantics, but the CI integration-tests job only provisioned Postgres. Add a redis:7-alpine service on the standard 6379 port and pass REDIS_URL through to the test step so the BullMQ Queue can connect. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/ci.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d648a495..a636b151 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -92,6 +92,16 @@ jobs: --health-timeout 5s --health-retries 10 + redis: + image: redis:7-alpine + ports: + - 6379:6379 + options: >- + --health-cmd "redis-cli ping" + --health-interval 2s + --health-timeout 5s + --health-retries 10 + steps: - uses: actions/checkout@v4 @@ -111,6 +121,7 @@ jobs: run: npm run test:integration env: TEST_DATABASE_URL: postgresql://cascade_test:cascade_test@localhost:5433/cascade_test + REDIS_URL: redis://localhost:6379 docker-build-check: name: Validate Docker builds From 8f9e187cdb4b51ac4d078106843761c37cdc9e1d Mon Sep 17 00:00:00 2001 From: Cascade Bot Date: Wed, 29 Apr 2026 12:30:10 +0000 Subject: [PATCH 17/22] fix(router): address review feedback on PM coalesce deferred ack MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - worker-entry: call generateAckMessage() at job fire time instead of using workItemTitle as the literal comment text. Adds postDeferredAck() shared helper that extracts context from the stored payload (same path as the non-coalesced adapter.postAck), generates a proper role-aware ack message via LLM/fallback, then calls dispatchPMAck. Eliminates the 3x copy-paste deferred ack blocks (nitpick fix). - queue.ts: document the TOCTOU race in scheduleCoalescedJob (getJob → getState → remove → add is not atomic; two concurrent handlers can lose the second event's data). Practical impact is low but documented for future maintainers. - webhook-processor.ts + types: update ackMessage field docstrings to clarify it stores the workItemTitle as a context hint for generateAckMessage at fire time, not a pre-generated comment text. Co-Authored-By: Claude Sonnet 4.6 --- src/router/queue.ts | 16 +++- src/router/webhook-processor.ts | 9 ++- src/worker-entry.ts | 127 +++++++++++++++++++++----------- 3 files changed, 103 insertions(+), 49 deletions(-) diff --git a/src/router/queue.ts b/src/router/queue.ts index 0742e55e..3aade005 100644 --- a/src/router/queue.ts +++ b/src/router/queue.ts @@ -23,7 +23,7 @@ export interface TrelloJob { triggerResult?: TriggerResult; /** When true, the worker must post the ack comment before processing (deferred ack). */ pendingAck?: boolean; - /** Pre-generated ack message text for deferred ack posting. */ + /** workItemTitle stored as a context hint for generateAckMessage at fire time. NOT the literal comment text. */ ackMessage?: string; } @@ -51,7 +51,7 @@ export interface JiraJob { triggerResult?: TriggerResult; /** When true, the worker must post the ack comment before processing (deferred ack). */ pendingAck?: boolean; - /** Pre-generated ack message text for deferred ack posting. 
*/ + /** workItemTitle stored as a context hint for generateAckMessage at fire time. NOT the literal comment text. */ ackMessage?: string; } @@ -78,7 +78,7 @@ export interface LinearJob { triggerResult?: TriggerResult; /** When true, the worker must post the ack comment before processing (deferred ack). */ pendingAck?: boolean; - /** Pre-generated ack message text for deferred ack posting. */ + /** workItemTitle stored as a context hint for generateAckMessage at fire time. NOT the literal comment text. */ ackMessage?: string; } @@ -149,6 +149,16 @@ export async function scheduleCoalescedJob( // Remove any existing delayed/waiting job with the same key so the new // job supersedes it. Active jobs are left alone — they are already running. + // + // TOCTOU NOTE: The getJob → getState → remove → add sequence is not atomic. + // Two concurrent webhook handlers for the same coalesceKey can both read the + // existing delayed job, both attempt remove() (the second no-ops silently), + // and then both call add() — but BullMQ silently ignores a duplicate jobId + // for a non-completed job, so the second event's data is lost. In practice + // this race is rare: the coalesce window exists for events tens-to-hundreds + // of milliseconds apart, not truly simultaneous arrivals. A Lua-script + // atomic compare-and-replace would close this, but the operational impact is + // low enough that a documented best-effort approach is acceptable here. const existing = await jobQueue.getJob(jobId); if (existing) { const state = await existing.getState(); diff --git a/src/router/webhook-processor.ts b/src/router/webhook-processor.ts index 9094ca2d..f00c4bcf 100644 --- a/src/router/webhook-processor.ts +++ b/src/router/webhook-processor.ts @@ -151,12 +151,13 @@ export async function processRouterWebhook( // Build the job without ack info (ack will be posted at fire time). const job = adapter.buildJob(event, payload, project, result, undefined); - // Attach the deferred-ack marker and a pre-generated message so the - // worker does not need to re-derive the context. + // Attach the deferred-ack marker. Store workItemTitle as a context + // hint (not a literal comment) — the worker calls generateAckMessage() + // at fire time to produce a proper role-aware ack message. Storing + // the title lets generateAckMessage fall back gracefully when the + // full payload context extractor returns nothing. if (job.type === 'trello' || job.type === 'jira' || job.type === 'linear') { job.pendingAck = true; - // Use workItemTitle as context if available; generateAckMessage - // already has its own LLM-backed fallback path via the adapter. job.ackMessage = result.workItemTitle ?? undefined; } diff --git a/src/worker-entry.ts b/src/worker-entry.ts index 9525229d..5d53f9f4 100644 --- a/src/worker-entry.ts +++ b/src/worker-entry.ts @@ -22,6 +22,12 @@ import { registerBuiltInEngines } from './backends/bootstrap.js'; import { loadEnvConfigSafe } from './config/env.js'; import { loadConfig } from './config/provider.js'; import { getDb } from './db/client.js'; +import { + extractJiraContext, + extractLinearContext, + extractTrelloContext, + generateAckMessage, +} from './router/ackMessageGenerator.js'; import { dispatchPMAck } from './router/pm-ack-dispatch.js'; import { captureException, flush, setTag } from './sentry.js'; import { @@ -50,7 +56,7 @@ export interface TrelloJobData { triggerResult?: TriggerResult; /** When true, the worker must post the ack comment before processing (deferred ack). 
*/ pendingAck?: boolean; - /** Pre-generated ack message text for deferred ack posting. */ + /** workItemTitle stored as a context hint for generateAckMessage at fire time. NOT the literal comment text. */ ackMessage?: string; } @@ -78,7 +84,7 @@ export interface JiraJobData { triggerResult?: TriggerResult; /** When true, the worker must post the ack comment before processing (deferred ack). */ pendingAck?: boolean; - /** Pre-generated ack message text for deferred ack posting. */ + /** workItemTitle stored as a context hint for generateAckMessage at fire time. NOT the literal comment text. */ ackMessage?: string; } @@ -106,7 +112,7 @@ export interface LinearJobData { triggerResult?: TriggerResult; /** When true, the worker must post the ack comment before processing (deferred ack). */ pendingAck?: boolean; - /** Pre-generated ack message text for deferred ack posting. */ + /** workItemTitle stored as a context hint for generateAckMessage at fire time. NOT the literal comment text. */ ackMessage?: string; } @@ -194,6 +200,55 @@ export async function processDashboardJob(jobId: string, jobData: DashboardJobDa } } +/** + * Post the deferred acknowledgment comment for a coalesced PM job. + * + * Called at job fire time when `pendingAck=true`. Extracts a context snippet + * from the stored webhook payload, calls `generateAckMessage()` to produce a + * proper role-aware message (same path as the non-coalesced `postAck`), then + * posts it via `dispatchPMAck`. Returns the new comment ID string, or + * `undefined` if the ack could not be posted (non-fatal). + * + * The stored `ackMessage` field contains the `workItemTitle` as a context hint + * fallback when payload extraction returns nothing. + */ +async function postDeferredAck( + projectId: string, + workItemId: string, + pmType: 'trello' | 'jira' | 'linear', + payload: unknown, + agentType: string | undefined, + contextHint: string | undefined, +): Promise { + // Extract context from the raw payload (same source as the non-coalesced postAck path). + let contextSnippet = + pmType === 'jira' + ? extractJiraContext(payload) + : pmType === 'linear' + ? extractLinearContext(payload) + : extractTrelloContext(payload); + + // Fall back to the stored workItemTitle hint when the extractor yields nothing. + if (!contextSnippet && contextHint) { + contextSnippet = `Issue: ${contextHint}`; + } + + const message = await generateAckMessage(agentType ?? '', contextSnippet, projectId); + + const ackResult = await dispatchPMAck({ + projectId, + workItemId, + pmType, + message, + agentType, + }).catch((err) => { + logger.warn(`[Worker] Deferred ${pmType} ack failed (non-fatal)`, { error: String(err) }); + return undefined; + }); + + return ackResult?.commentId != null ? String(ackResult.commentId) : undefined; +} + // biome-ignore lint/complexity/noExcessiveCognitiveComplexity: webhook dispatch pipeline with deferred ack for coalesced PM jobs export async function dispatchJob( jobId: string, @@ -213,19 +268,15 @@ export async function dispatchJob( // Deferred ack: post the ack comment that was skipped at schedule time. let trelloAckCommentId = jobData.ackCommentId; if (jobData.pendingAck) { - const ackResult = await dispatchPMAck({ - projectId: jobData.projectId, - workItemId: jobData.workItemId, - pmType: 'trello', - message: jobData.ackMessage ?? '✍️ On it', - agentType: jobData.triggerResult?.agentType ?? 
undefined, - }).catch((err) => { - logger.warn('[Worker] Deferred Trello ack failed (non-fatal)', { error: String(err) }); - return undefined; - }); - if (ackResult?.commentId != null) { - trelloAckCommentId = String(ackResult.commentId); - } + trelloAckCommentId = + (await postDeferredAck( + jobData.projectId, + jobData.workItemId, + 'trello', + jobData.payload, + jobData.triggerResult?.agentType ?? undefined, + jobData.ackMessage, + )) ?? trelloAckCommentId; } await processTrelloWebhook( jobData.payload, @@ -264,19 +315,15 @@ export async function dispatchJob( // Deferred ack: post the ack comment that was skipped at schedule time. let jiraAckCommentId = jobData.ackCommentId; if (jobData.pendingAck) { - const ackResult = await dispatchPMAck({ - projectId: jobData.projectId, - workItemId: jobData.issueKey, - pmType: 'jira', - message: jobData.ackMessage ?? '✍️ On it', - agentType: jobData.triggerResult?.agentType ?? undefined, - }).catch((err) => { - logger.warn('[Worker] Deferred JIRA ack failed (non-fatal)', { error: String(err) }); - return undefined; - }); - if (ackResult?.commentId != null) { - jiraAckCommentId = String(ackResult.commentId); - } + jiraAckCommentId = + (await postDeferredAck( + jobData.projectId, + jobData.issueKey, + 'jira', + jobData.payload, + jobData.triggerResult?.agentType ?? undefined, + jobData.ackMessage, + )) ?? jiraAckCommentId; } await processJiraWebhook( jobData.payload, @@ -313,19 +360,15 @@ export async function dispatchJob( // Deferred ack: post the ack comment that was skipped at schedule time. let linearAckCommentId = jobData.ackCommentId; if (jobData.pendingAck && jobData.workItemId) { - const ackResult = await dispatchPMAck({ - projectId: jobData.projectId, - workItemId: jobData.workItemId, - pmType: 'linear', - message: jobData.ackMessage ?? '✍️ On it', - agentType: jobData.triggerResult?.agentType ?? undefined, - }).catch((err) => { - logger.warn('[Worker] Deferred Linear ack failed (non-fatal)', { error: String(err) }); - return undefined; - }); - if (ackResult?.commentId != null) { - linearAckCommentId = String(ackResult.commentId); - } + linearAckCommentId = + (await postDeferredAck( + jobData.projectId, + jobData.workItemId, + 'linear', + jobData.payload, + jobData.triggerResult?.agentType ?? undefined, + jobData.ackMessage, + )) ?? linearAckCommentId; } await processLinearWebhook( jobData.payload, From 67042748297b918460a39aa7a3ea35a45d2f1e23 Mon Sep 17 00:00:00 2001 From: Cascade Bot Date: Wed, 29 Apr 2026 12:30:47 +0000 Subject: [PATCH 18/22] chore(worker): remove now-unused biome complexity suppression MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The noExcessiveCognitiveComplexity suppression on dispatchJob is no longer needed — extracting postDeferredAck() reduced the function's complexity below the threshold. Co-Authored-By: Claude Sonnet 4.6 --- src/worker-entry.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/src/worker-entry.ts b/src/worker-entry.ts index 5d53f9f4..3533388f 100644 --- a/src/worker-entry.ts +++ b/src/worker-entry.ts @@ -249,7 +249,6 @@ async function postDeferredAck( return ackResult?.commentId != null ? 
String(ackResult.commentId) : undefined; } -// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: webhook dispatch pipeline with deferred ack for coalesced PM jobs export async function dispatchJob( jobId: string, jobData: JobData, From 22cde8f6982f5f515673aae6c0c5ac937f820b61 Mon Sep 17 00:00:00 2001 From: Cascade Bot Date: Wed, 29 Apr 2026 12:52:43 +0000 Subject: [PATCH 19/22] fix(router): address lock-leak and active-job-status issues in coalesced dispatch Two lock-lifecycle bugs in BullMQ delayed-job coalescing: 1. Lock leak on supersede: when scheduleCoalescedJob removes a delayed job (returns superseded=true), the superseded job's in-memory locks are never released because BullMQ's remove() does not fire worker.on('failed'). Fix: return supersededJobData from scheduleCoalescedJob so the caller can call clearWorkItemEnqueued / clearAgentTypeEnqueued / clearRecentlyDispatched for the old job. 2. Misleading status when active job exists: when getState() returns 'active', the previous code called add() (BullMQ silently ignores it for duplicate active IDs) and returned superseded=false, causing the caller to mark locks for a non-existent job. Fix: return activeExists=true and skip add() entirely; caller skips lock marking and logs accurately. Co-Authored-By: Claude Sonnet 4.6 --- src/router/queue.ts | 35 +++++++++- src/router/webhook-processor.ts | 48 ++++++++++++- tests/unit/router/queue.test.ts | 25 ++++--- tests/unit/router/webhook-processor.test.ts | 77 ++++++++++++++++++++- 4 files changed, 170 insertions(+), 15 deletions(-) diff --git a/src/router/queue.ts b/src/router/queue.ts index 3aade005..0a0a016d 100644 --- a/src/router/queue.ts +++ b/src/router/queue.ts @@ -125,6 +125,21 @@ export async function addJob(job: CascadeJob): Promise { export interface ScheduleCoalescedJobResult { jobId: string; superseded: boolean; + /** + * Data from the superseded delayed/waiting job. Present when + * `superseded === true`. Used by the caller to release the orphaned + * in-memory locks that were marked for the previous dispatch — those locks + * are never released via `worker.on('failed')` because BullMQ's `remove()` + * does not fire that event. + */ + supersededJobData?: CascadeJob; + /** + * True when a job with the same coalesce ID is already active (running). + * BullMQ silently ignores `add()` for a duplicate active jobId, so we skip + * the `add()` call entirely and return this flag instead. The caller must + * NOT mark new in-memory locks — no new job was created. + */ + activeExists?: boolean; } /** @@ -132,8 +147,8 @@ export interface ScheduleCoalescedJobResult { * * If a delayed/waiting job with the same key already exists it is removed * before the new job is added, superseding the previous dispatch. Active - * (already running) jobs are left untouched; `superseded` is `false` in that - * case. + * (already running) jobs are left untouched and `activeExists` is returned + * as `true` so the caller can skip lock marking. * * This replaces the in-memory `create-coalesce-window.ts` mechanism with a * durable, per-key deduplication that coalesces across any agent types for @@ -146,6 +161,7 @@ export async function scheduleCoalescedJob( ): Promise { const jobId = `coalesce:${coalesceKey}`; let superseded = false; + let supersededJobData: CascadeJob | undefined; // Remove any existing delayed/waiting job with the same key so the new // job supersedes it. Active jobs are left alone — they are already running. 
@@ -163,14 +179,27 @@ export async function scheduleCoalescedJob( if (existing) { const state = await existing.getState(); if (state === 'delayed' || state === 'waiting') { + // Capture job data before removal so the caller can release orphaned locks. + supersededJobData = existing.data; await existing.remove(); superseded = true; + } else if (state === 'active') { + // An active (running) job already holds this ID. BullMQ would + // silently ignore add() for a duplicate active jobId — no new job + // would be created, but the caller wouldn't know and would mark + // locks incorrectly. Return activeExists=true so the caller can + // log accurately and skip marking new in-memory locks. + logger.info('Coalesced job skipped — active job with same ID already running', { + jobId, + coalesceKey, + }); + return { jobId, superseded: false, activeExists: true }; } } await jobQueue.add(job.type, job, { jobId, delay: delayMs }); logger.info('Coalesced job scheduled', { jobId, coalesceKey, delayMs, superseded }); - return { jobId, superseded }; + return { jobId, superseded, supersededJobData }; } // Get queue stats diff --git a/src/router/webhook-processor.ts b/src/router/webhook-processor.ts index f00c4bcf..5c96c980 100644 --- a/src/router/webhook-processor.ts +++ b/src/router/webhook-processor.ts @@ -17,13 +17,15 @@ import { logger } from '../utils/logging.js'; import { isDuplicateAction, markActionProcessed } from './action-dedup.js'; import { checkAgentTypeConcurrency, + clearAgentTypeEnqueued, + clearRecentlyDispatched, markAgentTypeEnqueued, markRecentlyDispatched, } from './agent-type-lock.js'; import { classifyLockState } from './lock-state-classifier.js'; import type { RouterPlatformAdapter } from './platform-adapter.js'; import { addJob, scheduleCoalescedJob } from './queue.js'; -import { isWorkItemLocked, markWorkItemEnqueued } from './work-item-lock.js'; +import { clearWorkItemEnqueued, isWorkItemLocked, markWorkItemEnqueued } from './work-item-lock.js'; export interface ProcessRouterWebhookResult { /** Whether the event was of a processable type for this platform. */ @@ -164,7 +166,29 @@ export async function processRouterWebhook( // Schedule as a delayed BullMQ job; supersedes any prior pending job // with the same key so only the latest event fires. try { - const { superseded } = await scheduleCoalescedJob(job, result.coalesceKey, windowMs); + const { superseded, supersededJobData, activeExists } = await scheduleCoalescedJob( + job, + result.coalesceKey, + windowMs, + ); + + // When an active job is already running for this coalesceKey, BullMQ + // would silently ignore any new add(). No new job was created, so skip + // lock marking and return an accurate decision reason. + if (activeExists) { + logger.info(`${adapter.type} coalesced dispatch skipped — active job already running`, { + agentType: result.agentType, + workItemId: result.workItemId, + projectId: project.id, + coalesceKey: result.coalesceKey, + }); + return { + shouldProcess: true, + projectId: project.id, + decisionReason: `Coalesced dispatch skipped: active job already running for work item ${result.workItemId ?? '(unknown)'}`, + }; + } + if (superseded) { logger.info(`${adapter.type} coalesced dispatch superseded prior pending job`, { agentType: result.agentType, @@ -172,6 +196,22 @@ export async function processRouterWebhook( projectId: project.id, coalesceKey: result.coalesceKey, }); + // Release in-memory locks for the superseded job to prevent phantom + // lock entries from accumulating. 
existing.remove() removes the + // delayed BullMQ entry but does NOT fire worker.on('failed'), so + // releaseLocksForFailedJob is never called for the superseded job. + // Manually undo the lock marks from the previous webhook invocation. + if (supersededJobData && supersededJobData.type !== 'github') { + const oldAgentType = supersededJobData.triggerResult?.agentType; + const oldWorkItemId = supersededJobData.triggerResult?.workItemId; + if (oldAgentType) { + if (oldWorkItemId) { + clearWorkItemEnqueued(supersededJobData.projectId, oldWorkItemId, oldAgentType); + } + clearAgentTypeEnqueued(supersededJobData.projectId, oldAgentType); + clearRecentlyDispatched(supersededJobData.projectId, oldAgentType, oldWorkItemId); + } + } } else { logger.info(`${adapter.type} coalesced dispatch scheduled`, { agentType: result.agentType, @@ -195,7 +235,9 @@ export async function processRouterWebhook( }; } - // Mark locks exactly as the non-coalesced path does. + // Mark locks for the newly-scheduled job exactly as the non-coalesced + // path does. (The activeExists early-return above ensures we only reach + // this point when a real new job was added to the queue.) if (result.workItemId) { markWorkItemEnqueued(project.id, result.workItemId, result.agentType); } diff --git a/tests/unit/router/queue.test.ts b/tests/unit/router/queue.test.ts index 96b99912..3a3f0efb 100644 --- a/tests/unit/router/queue.test.ts +++ b/tests/unit/router/queue.test.ts @@ -83,15 +83,22 @@ describe('scheduleCoalescedJob', () => { ); }); - it('removes existing delayed job and returns superseded=true', async () => { - mockJobInstance.getState.mockResolvedValue('delayed'); - mockJobInstance.remove.mockResolvedValue(undefined); - mockQueueInstance.getJob.mockResolvedValue(mockJobInstance); + it('removes existing delayed job and returns superseded=true with supersededJobData', async () => { + const existingData: CascadeJob = { + ...sampleJob, + projectId: 'proj-old', + triggerResult: { agentType: 'planning', workItemId: 'PROJ-42', agentInput: {} }, + }; + const mockJobWithData = { ...mockJobInstance, data: existingData }; + mockJobWithData.getState = vi.fn().mockResolvedValue('delayed'); + mockJobWithData.remove = vi.fn().mockResolvedValue(undefined); + mockQueueInstance.getJob.mockResolvedValue(mockJobWithData); const result = await scheduleCoalescedJob(sampleJob, 'proj-1:PROJ-42', 10_000); expect(result.superseded).toBe(true); - expect(mockJobInstance.remove).toHaveBeenCalledOnce(); + expect(result.supersededJobData).toEqual(existingData); + expect(mockJobWithData.remove).toHaveBeenCalledOnce(); expect(mockQueueInstance.add).toHaveBeenCalledWith( 'jira', sampleJob, @@ -99,7 +106,7 @@ describe('scheduleCoalescedJob', () => { ); }); - it('does not remove an active (running) job and returns superseded=false', async () => { + it('returns activeExists=true and skips add() when an active job has the same ID', async () => { mockJobInstance.getState.mockResolvedValue('active'); mockJobInstance.remove.mockResolvedValue(undefined); mockQueueInstance.getJob.mockResolvedValue(mockJobInstance); @@ -107,9 +114,11 @@ describe('scheduleCoalescedJob', () => { const result = await scheduleCoalescedJob(sampleJob, 'proj-1:PROJ-42', 10_000); expect(result.superseded).toBe(false); + expect(result.activeExists).toBe(true); expect(mockJobInstance.remove).not.toHaveBeenCalled(); - // Still adds the new job even if an active job exists with same ID - expect(mockQueueInstance.add).toHaveBeenCalled(); + // Must NOT add a new job — BullMQ would silently ignore 
it for active IDs + // and the caller would incorrectly mark locks for a non-existent job. + expect(mockQueueInstance.add).not.toHaveBeenCalled(); }); it('uses the coalesceKey to derive the BullMQ job ID', async () => { diff --git a/tests/unit/router/webhook-processor.test.ts b/tests/unit/router/webhook-processor.test.ts index 1a21624b..8541cd4a 100644 --- a/tests/unit/router/webhook-processor.test.ts +++ b/tests/unit/router/webhook-processor.test.ts @@ -18,11 +18,14 @@ vi.mock('../../../src/pm/coalesce-config.js', () => ({ vi.mock('../../../src/router/work-item-lock.js', () => ({ isWorkItemLocked: vi.fn().mockResolvedValue({ locked: false }), markWorkItemEnqueued: vi.fn(), + clearWorkItemEnqueued: vi.fn(), })); vi.mock('../../../src/router/agent-type-lock.js', () => ({ checkAgentTypeConcurrency: vi.fn().mockResolvedValue({ maxConcurrency: null, blocked: false }), markAgentTypeEnqueued: vi.fn(), markRecentlyDispatched: vi.fn(), + clearAgentTypeEnqueued: vi.fn(), + clearRecentlyDispatched: vi.fn(), })); vi.mock('../../../src/router/action-dedup.js', () => ({ isDuplicateAction: vi.fn().mockReturnValue(false), @@ -39,6 +42,8 @@ import { getCoalesceWindowMs } from '../../../src/pm/coalesce-config.js'; import { isDuplicateAction, markActionProcessed } from '../../../src/router/action-dedup.js'; import { checkAgentTypeConcurrency, + clearAgentTypeEnqueued, + clearRecentlyDispatched, markAgentTypeEnqueued, markRecentlyDispatched, } from '../../../src/router/agent-type-lock.js'; @@ -48,7 +53,11 @@ import type { RouterPlatformAdapter } from '../../../src/router/platform-adapter import type { CascadeJob } from '../../../src/router/queue.js'; import { addJob, scheduleCoalescedJob } from '../../../src/router/queue.js'; import { processRouterWebhook } from '../../../src/router/webhook-processor.js'; -import { isWorkItemLocked, markWorkItemEnqueued } from '../../../src/router/work-item-lock.js'; +import { + clearWorkItemEnqueued, + isWorkItemLocked, + markWorkItemEnqueued, +} from '../../../src/router/work-item-lock.js'; import { captureException } from '../../../src/sentry.js'; import type { TriggerRegistry } from '../../../src/triggers/registry.js'; @@ -652,6 +661,72 @@ describe('processRouterWebhook', () => { expect(infoCall).toBeDefined(); }); + it('releases superseded job locks when supersededJobData is returned', async () => { + const supersededJobData: CascadeJob = { + type: 'jira', + source: 'jira', + payload: {}, + projectId: 'p1', + issueKey: 'PROJ-1', + webhookEvent: 'jira:issue_created', + receivedAt: new Date().toISOString(), + triggerResult: { + agentType: 'splitting', + workItemId: 'PROJ-1', + agentInput: {}, + }, + }; + vi.mocked(scheduleCoalescedJob).mockResolvedValue({ + jobId: 'coalesce:p1:PROJ-1', + superseded: true, + supersededJobData, + }); + const adapter = makeMockAdapter({ + type: 'jira', + dispatchWithCredentials: vi.fn().mockResolvedValue({ + agentType: 'planning', + agentInput: { workItemId: 'PROJ-1' }, + workItemId: 'PROJ-1', + coalesceKey: 'p1:PROJ-1', + }), + }); + + await processRouterWebhook(adapter, {}, mockTriggerRegistry); + + // Must clear the superseded job's locks to prevent phantom entries + expect(clearWorkItemEnqueued).toHaveBeenCalledWith('p1', 'PROJ-1', 'splitting'); + expect(clearAgentTypeEnqueued).toHaveBeenCalledWith('p1', 'splitting'); + expect(clearRecentlyDispatched).toHaveBeenCalledWith('p1', 'splitting', 'PROJ-1'); + // Must still mark locks for the new job + expect(markWorkItemEnqueued).toHaveBeenCalled(); + 
expect(markAgentTypeEnqueued).toHaveBeenCalled(); + }); + + it('skips lock marking when activeExists=true (no new job was created)', async () => { + vi.mocked(scheduleCoalescedJob).mockResolvedValue({ + jobId: 'coalesce:p1:PROJ-1', + superseded: false, + activeExists: true, + }); + const adapter = makeMockAdapter({ + type: 'jira', + dispatchWithCredentials: vi.fn().mockResolvedValue({ + agentType: 'implementation', + agentInput: { workItemId: 'PROJ-1' }, + workItemId: 'PROJ-1', + coalesceKey: 'p1:PROJ-1', + }), + }); + + const result = await processRouterWebhook(adapter, {}, mockTriggerRegistry); + + // No new job → must not mark any in-memory locks + expect(markWorkItemEnqueued).not.toHaveBeenCalled(); + expect(markAgentTypeEnqueued).not.toHaveBeenCalled(); + expect(markRecentlyDispatched).not.toHaveBeenCalled(); + expect(result.decisionReason).toMatch(/active job already running/); + }); + it('falls back to normal dispatch when PM_COALESCE_WINDOW_MS=0 (disable)', async () => { vi.mocked(getCoalesceWindowMs).mockReturnValue(0); vi.mocked(addJob).mockResolvedValue('job-immediate'); From 182b472d7287cf3fb546b2f6756d9289792c1109 Mon Sep 17 00:00:00 2001 From: Cascade Bot Date: Wed, 29 Apr 2026 14:09:01 +0000 Subject: [PATCH 20/22] test(coverage): cover deferred-ack worker path + coalesce-config MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds 8 unit tests for `getCoalesceWindowMs()` (every branch — default, PM_COALESCE_WINDOW_MS, legacy fallback, precedence, 0/disable, non-numeric, negative, empty) and 9 unit tests for the deferred-ack flow in `dispatchJob` / `postDeferredAck` (Trello/JIRA/Linear pendingAck paths, payload-extractor → contextHint fallback, missing agentType, non-fatal `dispatchPMAck` rejection, undefined-result fallback to the pre-existing ackCommentId, Linear-without-workItemId skip, plus the non-pendingAck Linear routing branch that was wholly uncovered). Closes the codecov/patch gap on PR #1226 — the new BullMQ deferred-ack codepath added in this PR was at 15% (worker-entry.ts) / 0% (coalesce-config.ts). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/unit/pm/coalesce-config.test.ts | 65 +++++ tests/unit/worker-entry.test.ts | 326 ++++++++++++++++++++++++++ 2 files changed, 391 insertions(+) create mode 100644 tests/unit/pm/coalesce-config.test.ts diff --git a/tests/unit/pm/coalesce-config.test.ts b/tests/unit/pm/coalesce-config.test.ts new file mode 100644 index 00000000..ff48c7ce --- /dev/null +++ b/tests/unit/pm/coalesce-config.test.ts @@ -0,0 +1,65 @@ +import { afterEach, beforeEach, describe, expect, it } from 'vitest'; +import { getCoalesceWindowMs } from '../../../src/pm/coalesce-config.js'; + +describe('getCoalesceWindowMs', () => { + const originalCoalesce = process.env.PM_COALESCE_WINDOW_MS; + const originalLegacy = process.env.PM_CREATE_COALESCE_WINDOW_MS; + + beforeEach(() => { + delete process.env.PM_COALESCE_WINDOW_MS; + delete process.env.PM_CREATE_COALESCE_WINDOW_MS; + }); + + afterEach(() => { + if (originalCoalesce === undefined) { + delete process.env.PM_COALESCE_WINDOW_MS; + } else { + process.env.PM_COALESCE_WINDOW_MS = originalCoalesce; + } + if (originalLegacy === undefined) { + delete process.env.PM_CREATE_COALESCE_WINDOW_MS; + } else { + process.env.PM_CREATE_COALESCE_WINDOW_MS = originalLegacy; + } + }); + + it('returns the default 10_000 ms when neither env var is set', () => { + expect(getCoalesceWindowMs()).toBe(10_000); + }); + + it('reads PM_COALESCE_WINDOW_MS when set', () => { + process.env.PM_COALESCE_WINDOW_MS = '2500'; + expect(getCoalesceWindowMs()).toBe(2500); + }); + + it('falls back to legacy PM_CREATE_COALESCE_WINDOW_MS when PM_COALESCE_WINDOW_MS is unset', () => { + process.env.PM_CREATE_COALESCE_WINDOW_MS = '7777'; + expect(getCoalesceWindowMs()).toBe(7777); + }); + + it('prefers PM_COALESCE_WINDOW_MS over the legacy fallback when both are set', () => { + process.env.PM_COALESCE_WINDOW_MS = '1000'; + process.env.PM_CREATE_COALESCE_WINDOW_MS = '9999'; + expect(getCoalesceWindowMs()).toBe(1000); + }); + + it('treats 0 as a valid value to disable coalescing', () => { + process.env.PM_COALESCE_WINDOW_MS = '0'; + expect(getCoalesceWindowMs()).toBe(0); + }); + + it('returns the default when value is non-numeric', () => { + process.env.PM_COALESCE_WINDOW_MS = 'not-a-number'; + expect(getCoalesceWindowMs()).toBe(10_000); + }); + + it('returns the default when value is negative', () => { + process.env.PM_COALESCE_WINDOW_MS = '-100'; + expect(getCoalesceWindowMs()).toBe(10_000); + }); + + it('returns the default when value is empty string', () => { + process.env.PM_COALESCE_WINDOW_MS = ''; + expect(getCoalesceWindowMs()).toBe(10_000); + }); +}); diff --git a/tests/unit/worker-entry.test.ts b/tests/unit/worker-entry.test.ts index 4b3686cd..9402fe35 100644 --- a/tests/unit/worker-entry.test.ts +++ b/tests/unit/worker-entry.test.ts @@ -40,6 +40,21 @@ vi.mock('../../src/triggers/sentry/webhook-handler.js', () => ({ processSentryWebhook: vi.fn().mockResolvedValue(undefined), })); +vi.mock('../../src/triggers/linear/webhook-handler.js', () => ({ + processLinearWebhook: vi.fn().mockResolvedValue(undefined), +})); + +vi.mock('../../src/router/pm-ack-dispatch.js', () => ({ + dispatchPMAck: vi.fn(), +})); + +vi.mock('../../src/router/ackMessageGenerator.js', () => ({ + extractTrelloContext: vi.fn().mockReturnValue(''), + extractJiraContext: vi.fn().mockReturnValue(''), + extractLinearContext: vi.fn().mockReturnValue(''), + generateAckMessage: vi.fn().mockResolvedValue('🔨 Generated ack message'), +})); + vi.mock('../../src/utils/index.js', () => ({ logger: { 
info: vi.fn(), @@ -83,8 +98,16 @@ vi.mock('../../src/agents/prompts/index.js', () => ({ import { loadProjectConfigById } from '../../src/config/provider.js'; import { getRunById } from '../../src/db/repositories/runsRepository.js'; +import { + extractJiraContext, + extractLinearContext, + extractTrelloContext, + generateAckMessage, +} from '../../src/router/ackMessageGenerator.js'; +import { dispatchPMAck } from '../../src/router/pm-ack-dispatch.js'; import { captureException, flush } from '../../src/sentry.js'; import { processGitHubWebhook, processJiraWebhook } from '../../src/triggers/index.js'; +import { processLinearWebhook } from '../../src/triggers/linear/webhook-handler.js'; import { processSentryWebhook } from '../../src/triggers/sentry/webhook-handler.js'; import { triggerDebugAnalysis } from '../../src/triggers/shared/debug-runner.js'; import { triggerManualRun, triggerRetryRun } from '../../src/triggers/shared/manual-runner.js'; @@ -94,6 +117,7 @@ import { dispatchJob, type GitHubJobData, type JiraJobData, + type LinearJobData, type ManualRunJobData, main, processDashboardJob, @@ -284,6 +308,35 @@ describe('dispatchJob routing', () => { ); }); + it('routes linear job to processLinearWebhook with payload, registry, ackCommentId, triggerResult', async () => { + const mockRegistry = {}; + const jobPayload = { type: 'Issue', data: { id: 'lin-1' } }; + const triggerResult = { matched: true, agentType: 'implementation' } as never; + + const jobData: LinearJobData = { + type: 'linear', + source: 'linear', + payload: jobPayload, + projectId: 'proj-1', + workItemId: 'lin-1', + eventType: 'create/Issue', + receivedAt: '2024-01-01T00:00:00Z', + ackCommentId: 'lin-comment-789', + triggerResult, + }; + + await dispatchJob('job-linear-1', jobData, mockRegistry as never); + + expect(processLinearWebhook).toHaveBeenCalledWith( + jobPayload, + mockRegistry, + 'lin-comment-789', + triggerResult, + ); + // Without pendingAck, the deferred-ack path is NOT taken + expect(dispatchPMAck).not.toHaveBeenCalled(); + }); + it('handles unknown job type by calling captureException with worker_unknown_job tag', async () => { const exitSpy = vi.spyOn(process, 'exit').mockImplementation((code?) 
=> { throw new Error(`process.exit(${code})`); @@ -307,6 +360,279 @@ describe('dispatchJob routing', () => { }); }); +// ── deferred ack tests (postDeferredAck via dispatchJob) ────────────────────── + +describe('dispatchJob - deferred ack (pendingAck=true)', () => { + beforeEach(() => { + vi.mocked(dispatchPMAck).mockReset(); + vi.mocked(extractJiraContext).mockReset().mockReturnValue(''); + vi.mocked(extractLinearContext).mockReset().mockReturnValue(''); + vi.mocked(extractTrelloContext).mockReset().mockReturnValue(''); + vi.mocked(generateAckMessage).mockReset().mockResolvedValue('🔨 Generated ack'); + }); + + it('trello pendingAck: extracts context, generates ack message, posts via dispatchPMAck, passes new commentId to processTrelloWebhook', async () => { + vi.mocked(extractTrelloContext).mockReturnValueOnce('Card: do the thing'); + vi.mocked(generateAckMessage).mockResolvedValueOnce('🔨 Working on the thing'); + vi.mocked(dispatchPMAck).mockResolvedValueOnce({ + commentId: 'trello-deferred-1', + message: '🔨 Working on the thing', + }); + + const jobData: TrelloJobData = { + type: 'trello', + source: 'trello', + payload: { action: { data: { card: { name: 'do the thing' } } } }, + projectId: 'proj-1', + workItemId: 'card-1', + actionType: 'updateCard', + receivedAt: '2024-01-01T00:00:00Z', + pendingAck: true, + ackMessage: 'do the thing', + triggerResult: { agentType: 'implementation' } as never, + }; + + await dispatchJob('job-trello-deferred', jobData, {} as never); + + expect(extractTrelloContext).toHaveBeenCalledWith(jobData.payload); + expect(generateAckMessage).toHaveBeenCalledWith( + 'implementation', + 'Card: do the thing', + 'proj-1', + ); + expect(dispatchPMAck).toHaveBeenCalledWith({ + projectId: 'proj-1', + workItemId: 'card-1', + pmType: 'trello', + message: '🔨 Working on the thing', + agentType: 'implementation', + }); + expect(processTrelloWebhook).toHaveBeenCalledWith( + jobData.payload, + expect.anything(), + 'trello-deferred-1', + jobData.triggerResult, + ); + }); + + it('jira pendingAck: extracts context, generates ack, posts via dispatchPMAck, passes new commentId to processJiraWebhook', async () => { + vi.mocked(extractJiraContext).mockReturnValueOnce('Issue: PROJ-1 — Fix bug'); + vi.mocked(generateAckMessage).mockResolvedValueOnce('🔨 On the bug fix'); + vi.mocked(dispatchPMAck).mockResolvedValueOnce({ + commentId: 'jira-deferred-1', + message: '🔨 On the bug fix', + }); + + const jobData: JiraJobData = { + type: 'jira', + source: 'jira', + payload: { issue: { key: 'PROJ-1', fields: { summary: 'Fix bug' } } }, + projectId: 'proj-1', + issueKey: 'PROJ-1', + webhookEvent: 'jira:issue_updated', + receivedAt: '2024-01-01T00:00:00Z', + pendingAck: true, + ackMessage: 'Fix bug', + triggerResult: { agentType: 'implementation' } as never, + }; + + await dispatchJob('job-jira-deferred', jobData, {} as never); + + expect(extractJiraContext).toHaveBeenCalledWith(jobData.payload); + expect(generateAckMessage).toHaveBeenCalledWith( + 'implementation', + 'Issue: PROJ-1 — Fix bug', + 'proj-1', + ); + expect(dispatchPMAck).toHaveBeenCalledWith({ + projectId: 'proj-1', + workItemId: 'PROJ-1', + pmType: 'jira', + message: '🔨 On the bug fix', + agentType: 'implementation', + }); + expect(processJiraWebhook).toHaveBeenCalledWith( + jobData.payload, + expect.anything(), + 'jira-deferred-1', + jobData.triggerResult, + ); + }); + + it('linear pendingAck: extracts context, generates ack, posts via dispatchPMAck, passes new commentId to processLinearWebhook', async () => { + 
vi.mocked(extractLinearContext).mockReturnValueOnce('Issue: TEAM-12 — Add feature'); + vi.mocked(generateAckMessage).mockResolvedValueOnce('🔨 Building the feature'); + vi.mocked(dispatchPMAck).mockResolvedValueOnce({ + commentId: 'linear-deferred-1', + message: '🔨 Building the feature', + }); + + const jobData: LinearJobData = { + type: 'linear', + source: 'linear', + payload: { type: 'Issue', data: { id: 'lin-1', title: 'Add feature' } }, + projectId: 'proj-1', + workItemId: 'lin-1', + eventType: 'update/Issue', + receivedAt: '2024-01-01T00:00:00Z', + pendingAck: true, + ackMessage: 'Add feature', + triggerResult: { agentType: 'implementation' } as never, + }; + + await dispatchJob('job-linear-deferred', jobData, {} as never); + + expect(extractLinearContext).toHaveBeenCalledWith(jobData.payload); + expect(generateAckMessage).toHaveBeenCalledWith( + 'implementation', + 'Issue: TEAM-12 — Add feature', + 'proj-1', + ); + expect(dispatchPMAck).toHaveBeenCalledWith({ + projectId: 'proj-1', + workItemId: 'lin-1', + pmType: 'linear', + message: '🔨 Building the feature', + agentType: 'implementation', + }); + expect(processLinearWebhook).toHaveBeenCalledWith( + jobData.payload, + expect.anything(), + 'linear-deferred-1', + jobData.triggerResult, + ); + }); + + it('falls back to ackMessage hint when payload extractor returns empty', async () => { + vi.mocked(extractJiraContext).mockReturnValueOnce(''); + vi.mocked(generateAckMessage).mockResolvedValueOnce('🔨 generic'); + vi.mocked(dispatchPMAck).mockResolvedValueOnce({ + commentId: 'jira-fallback-1', + message: '🔨 generic', + }); + + const jobData: JiraJobData = { + type: 'jira', + source: 'jira', + payload: { issue: {} }, + projectId: 'proj-1', + issueKey: 'PROJ-1', + webhookEvent: 'jira:issue_updated', + receivedAt: '2024-01-01T00:00:00Z', + pendingAck: true, + ackMessage: 'Fallback Title', + triggerResult: { agentType: 'implementation' } as never, + }; + + await dispatchJob('job-jira-fallback', jobData, {} as never); + + expect(generateAckMessage).toHaveBeenCalledWith( + 'implementation', + 'Issue: Fallback Title', + 'proj-1', + ); + }); + + it('passes empty agentType when triggerResult.agentType is missing', async () => { + vi.mocked(extractTrelloContext).mockReturnValueOnce('Card: x'); + vi.mocked(generateAckMessage).mockResolvedValueOnce('msg'); + vi.mocked(dispatchPMAck).mockResolvedValueOnce({ commentId: 't', message: 'msg' }); + + const jobData: TrelloJobData = { + type: 'trello', + source: 'trello', + payload: {}, + projectId: 'proj-1', + workItemId: 'card-1', + actionType: 'updateCard', + receivedAt: '2024-01-01T00:00:00Z', + pendingAck: true, + // no triggerResult and no ackMessage + }; + + await dispatchJob('job-no-agent', jobData, {} as never); + + expect(generateAckMessage).toHaveBeenCalledWith('', 'Card: x', 'proj-1'); + expect(dispatchPMAck).toHaveBeenCalledWith(expect.objectContaining({ agentType: undefined })); + }); + + it('preserves original ackCommentId when dispatchPMAck rejects (non-fatal)', async () => { + vi.mocked(extractJiraContext).mockReturnValueOnce('Issue: x'); + vi.mocked(dispatchPMAck).mockRejectedValueOnce(new Error('PM API down')); + + const jobData: JiraJobData = { + type: 'jira', + source: 'jira', + payload: {}, + projectId: 'proj-1', + issueKey: 'PROJ-2', + webhookEvent: 'jira:issue_updated', + receivedAt: '2024-01-01T00:00:00Z', + ackCommentId: 'pre-existing', + pendingAck: true, + triggerResult: { agentType: 'implementation' } as never, + }; + + // Must not throw — deferred ack failure is non-fatal + await 
expect(dispatchJob('job-ack-fail', jobData, {} as never)).resolves.toBeUndefined(); + + // Falls back to the pre-existing ackCommentId from the job data + expect(processJiraWebhook).toHaveBeenCalledWith( + jobData.payload, + expect.anything(), + 'pre-existing', + jobData.triggerResult, + ); + }); + + it('preserves original ackCommentId when dispatchPMAck returns undefined (no comment posted)', async () => { + vi.mocked(extractTrelloContext).mockReturnValueOnce('Card: x'); + vi.mocked(dispatchPMAck).mockResolvedValueOnce(undefined); + + const jobData: TrelloJobData = { + type: 'trello', + source: 'trello', + payload: {}, + projectId: 'proj-1', + workItemId: 'card-1', + actionType: 'updateCard', + receivedAt: '2024-01-01T00:00:00Z', + ackCommentId: 'fallback-comment', + pendingAck: true, + triggerResult: { agentType: 'implementation' } as never, + }; + + await dispatchJob('job-undef-ack', jobData, {} as never); + + expect(processTrelloWebhook).toHaveBeenCalledWith( + jobData.payload, + expect.anything(), + 'fallback-comment', + jobData.triggerResult, + ); + }); + + it('linear pendingAck without workItemId: skips deferred ack entirely', async () => { + const jobData: LinearJobData = { + type: 'linear', + source: 'linear', + payload: {}, + projectId: 'proj-1', + // workItemId is missing + eventType: 'create/Comment', + receivedAt: '2024-01-01T00:00:00Z', + pendingAck: true, + triggerResult: { agentType: 'implementation' } as never, + }; + + await dispatchJob('job-linear-no-id', jobData, {} as never); + + // Without workItemId, the deferred-ack branch is skipped + expect(dispatchPMAck).not.toHaveBeenCalled(); + expect(processLinearWebhook).toHaveBeenCalled(); + }); +}); + // ── processDashboardJob tests ───────────────────────────────────────────────── describe('processDashboardJob - manual-run', () => { From a08b5374be98cabc31dab3f3b5b97d1a42ef5429 Mon Sep 17 00:00:00 2001 From: Zbigniew Sobiecki Date: Wed, 29 Apr 2026 15:18:20 +0000 Subject: [PATCH 21/22] fix(router): capture coalesce-schedule failures to Sentry + rename ackMessage hint field MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses both items from the (failed-to-submit) review run c63c7835: 1. **Should-fix: Sentry capture gap in coalesce-schedule catch block.** The `scheduleCoalescedJob` failure path in `webhook-processor.ts` logged via `logger.error` but did not call `captureException`. Other dispatch- failure paths flow through BullMQ retry → `worker.on('failed')` → `releaseLocksForFailedJob` → Sentry (per spec 015 plan 1), but this catch handles a Redis-side failure BEFORE the job is enqueued, so it bypasses that pipeline entirely. Redis hiccups during coalesced scheduling were silently escaping observability. Now captures under stable tag `coalesce_schedule_failure` with `{projectId, workItemId, agentType, coalesceKey, adapterType}` extras. 2. **Nit: rename `ackMessage` → `ackContextHint` on the new job interfaces.** The PR added an `ackMessage?: string` field on TrelloJob/JiraJob/LinearJob (and corresponding `*JobData` interfaces in worker-entry.ts) to store the `workItemTitle` as a context hint for `generateAckMessage()` at deferred- ack fire time — explicitly NOT the literal comment text, per the JSDoc. The name read like the literal text and required reading the JSDoc to avoid misuse. Renamed to `ackContextHint` for self-documentation. The pre-existing `GitHubJob.ackMessage` field is left unchanged because that one IS the literal comment text. 
Renamed 3 type fields (Trello/JIRA/Linear job interfaces in queue.ts + matching shapes in worker-entry.ts), 1 set site in webhook-processor.ts, 3 read sites in worker-entry.ts's deferred-ack helper invocations, plus the JSDoc reference and 4 test mocks (test names included). Full unit suite green (473 files / 8700 tests). Typecheck + lint clean. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/router/queue.ts | 27 ++++++++++++++++++------ src/router/webhook-processor.ts | 18 +++++++++++++++- src/worker-entry.ts | 37 +++++++++++++++++++++++---------- tests/unit/worker-entry.test.ts | 12 +++++------ 4 files changed, 70 insertions(+), 24 deletions(-) diff --git a/src/router/queue.ts b/src/router/queue.ts index 0a0a016d..7bf8b5c3 100644 --- a/src/router/queue.ts +++ b/src/router/queue.ts @@ -23,8 +23,13 @@ export interface TrelloJob { triggerResult?: TriggerResult; /** When true, the worker must post the ack comment before processing (deferred ack). */ pendingAck?: boolean; - /** workItemTitle stored as a context hint for generateAckMessage at fire time. NOT the literal comment text. */ - ackMessage?: string; + /** + * Work-item title stored as a context hint, passed to `generateAckMessage` + * at deferred-ack fire time. NOT the literal comment text — the worker + * generates the actual ack message via the role-aware LLM path. Renamed + * from `ackMessage` (which read like the literal text) for clarity. + */ + ackContextHint?: string; } export interface GitHubJob { @@ -51,8 +56,13 @@ export interface JiraJob { triggerResult?: TriggerResult; /** When true, the worker must post the ack comment before processing (deferred ack). */ pendingAck?: boolean; - /** workItemTitle stored as a context hint for generateAckMessage at fire time. NOT the literal comment text. */ - ackMessage?: string; + /** + * Work-item title stored as a context hint, passed to `generateAckMessage` + * at deferred-ack fire time. NOT the literal comment text — the worker + * generates the actual ack message via the role-aware LLM path. Renamed + * from `ackMessage` (which read like the literal text) for clarity. + */ + ackContextHint?: string; } export interface SentryJob { @@ -78,8 +88,13 @@ export interface LinearJob { triggerResult?: TriggerResult; /** When true, the worker must post the ack comment before processing (deferred ack). */ pendingAck?: boolean; - /** workItemTitle stored as a context hint for generateAckMessage at fire time. NOT the literal comment text. */ - ackMessage?: string; + /** + * Work-item title stored as a context hint, passed to `generateAckMessage` + * at deferred-ack fire time. NOT the literal comment text — the worker + * generates the actual ack message via the role-aware LLM path. Renamed + * from `ackMessage` (which read like the literal text) for clarity. + */ + ackContextHint?: string; } export type CascadeJob = TrelloJob | GitHubJob | JiraJob | SentryJob | LinearJob; diff --git a/src/router/webhook-processor.ts b/src/router/webhook-processor.ts index 5c96c980..80baa2e1 100644 --- a/src/router/webhook-processor.ts +++ b/src/router/webhook-processor.ts @@ -160,7 +160,7 @@ export async function processRouterWebhook( // full payload context extractor returns nothing. if (job.type === 'trello' || job.type === 'jira' || job.type === 'linear') { job.pendingAck = true; - job.ackMessage = result.workItemTitle ?? undefined; + job.ackContextHint = result.workItemTitle ?? 
undefined; } // Schedule as a delayed BullMQ job; supersedes any prior pending job @@ -223,6 +223,22 @@ export async function processRouterWebhook( } } catch (err) { result.onBlocked?.(); + // Other dispatch-failure paths flow through BullMQ retry → + // `worker.on('failed')` → `releaseLocksForFailedJob` → Sentry + // (per spec 015 plan 1). This catch handles a Redis-side failure + // BEFORE the job is enqueued, so it bypasses that pipeline. Capture + // to Sentry directly under a stable tag so coalesce-scheduling + // failures don't silently escape observability. + captureException(err instanceof Error ? err : new Error(String(err)), { + tags: { source: 'coalesce_schedule_failure' }, + extra: { + projectId: project.id, + workItemId: result.workItemId, + agentType: result.agentType, + coalesceKey: result.coalesceKey, + adapterType: adapter.type, + }, + }); logger.error(`Failed to schedule coalesced ${adapter.type} job`, { error: String(err), coalesceKey: result.coalesceKey, diff --git a/src/worker-entry.ts b/src/worker-entry.ts index 3533388f..3cfbe47f 100644 --- a/src/worker-entry.ts +++ b/src/worker-entry.ts @@ -56,8 +56,13 @@ export interface TrelloJobData { triggerResult?: TriggerResult; /** When true, the worker must post the ack comment before processing (deferred ack). */ pendingAck?: boolean; - /** workItemTitle stored as a context hint for generateAckMessage at fire time. NOT the literal comment text. */ - ackMessage?: string; + /** + * Work-item title stored as a context hint, passed to `generateAckMessage` + * at deferred-ack fire time. NOT the literal comment text — the worker + * generates the actual ack message via the role-aware LLM path. Renamed + * from `ackMessage` (which read like the literal text) for clarity. + */ + ackContextHint?: string; } export interface GitHubJobData { @@ -84,8 +89,13 @@ export interface JiraJobData { triggerResult?: TriggerResult; /** When true, the worker must post the ack comment before processing (deferred ack). */ pendingAck?: boolean; - /** workItemTitle stored as a context hint for generateAckMessage at fire time. NOT the literal comment text. */ - ackMessage?: string; + /** + * Work-item title stored as a context hint, passed to `generateAckMessage` + * at deferred-ack fire time. NOT the literal comment text — the worker + * generates the actual ack message via the role-aware LLM path. Renamed + * from `ackMessage` (which read like the literal text) for clarity. + */ + ackContextHint?: string; } export interface SentryJobData { @@ -112,8 +122,13 @@ export interface LinearJobData { triggerResult?: TriggerResult; /** When true, the worker must post the ack comment before processing (deferred ack). */ pendingAck?: boolean; - /** workItemTitle stored as a context hint for generateAckMessage at fire time. NOT the literal comment text. */ - ackMessage?: string; + /** + * Work-item title stored as a context hint, passed to `generateAckMessage` + * at deferred-ack fire time. NOT the literal comment text — the worker + * generates the actual ack message via the role-aware LLM path. Renamed + * from `ackMessage` (which read like the literal text) for clarity. + */ + ackContextHint?: string; } export interface ManualRunJobData { @@ -209,8 +224,8 @@ export async function processDashboardJob(jobId: string, jobData: DashboardJobDa * posts it via `dispatchPMAck`. Returns the new comment ID string, or * `undefined` if the ack could not be posted (non-fatal). 
* - * The stored `ackMessage` field contains the `workItemTitle` as a context hint - * fallback when payload extraction returns nothing. + * The stored `ackContextHint` field contains the `workItemTitle` as a fallback + * for `generateAckMessage` when payload extraction returns nothing. */ async function postDeferredAck( projectId: string, @@ -274,7 +289,7 @@ export async function dispatchJob( 'trello', jobData.payload, jobData.triggerResult?.agentType ?? undefined, - jobData.ackMessage, + jobData.ackContextHint, )) ?? trelloAckCommentId; } await processTrelloWebhook( @@ -321,7 +336,7 @@ export async function dispatchJob( 'jira', jobData.payload, jobData.triggerResult?.agentType ?? undefined, - jobData.ackMessage, + jobData.ackContextHint, )) ?? jiraAckCommentId; } await processJiraWebhook( @@ -366,7 +381,7 @@ export async function dispatchJob( 'linear', jobData.payload, jobData.triggerResult?.agentType ?? undefined, - jobData.ackMessage, + jobData.ackContextHint, )) ?? linearAckCommentId; } await processLinearWebhook( diff --git a/tests/unit/worker-entry.test.ts b/tests/unit/worker-entry.test.ts index 9402fe35..3ce0cb4c 100644 --- a/tests/unit/worker-entry.test.ts +++ b/tests/unit/worker-entry.test.ts @@ -388,7 +388,7 @@ describe('dispatchJob - deferred ack (pendingAck=true)', () => { actionType: 'updateCard', receivedAt: '2024-01-01T00:00:00Z', pendingAck: true, - ackMessage: 'do the thing', + ackContextHint: 'do the thing', triggerResult: { agentType: 'implementation' } as never, }; @@ -432,7 +432,7 @@ describe('dispatchJob - deferred ack (pendingAck=true)', () => { webhookEvent: 'jira:issue_updated', receivedAt: '2024-01-01T00:00:00Z', pendingAck: true, - ackMessage: 'Fix bug', + ackContextHint: 'Fix bug', triggerResult: { agentType: 'implementation' } as never, }; @@ -476,7 +476,7 @@ describe('dispatchJob - deferred ack (pendingAck=true)', () => { eventType: 'update/Issue', receivedAt: '2024-01-01T00:00:00Z', pendingAck: true, - ackMessage: 'Add feature', + ackContextHint: 'Add feature', triggerResult: { agentType: 'implementation' } as never, }; @@ -503,7 +503,7 @@ describe('dispatchJob - deferred ack (pendingAck=true)', () => { ); }); - it('falls back to ackMessage hint when payload extractor returns empty', async () => { + it('falls back to ackContextHint when payload extractor returns empty', async () => { vi.mocked(extractJiraContext).mockReturnValueOnce(''); vi.mocked(generateAckMessage).mockResolvedValueOnce('🔨 generic'); vi.mocked(dispatchPMAck).mockResolvedValueOnce({ @@ -520,7 +520,7 @@ describe('dispatchJob - deferred ack (pendingAck=true)', () => { webhookEvent: 'jira:issue_updated', receivedAt: '2024-01-01T00:00:00Z', pendingAck: true, - ackMessage: 'Fallback Title', + ackContextHint: 'Fallback Title', triggerResult: { agentType: 'implementation' } as never, }; @@ -547,7 +547,7 @@ describe('dispatchJob - deferred ack (pendingAck=true)', () => { actionType: 'updateCard', receivedAt: '2024-01-01T00:00:00Z', pendingAck: true, - // no triggerResult and no ackMessage + // no triggerResult and no ackContextHint }; await dispatchJob('job-no-agent', jobData, {} as never); From dfd52714d598bd1b68c45c067e674481607fdd89 Mon Sep 17 00:00:00 2001 From: Zbigniew Sobiecki Date: Wed, 29 Apr 2026 15:49:01 +0000 Subject: [PATCH 22/22] fix(router): sanitize Docker-invalid chars in jobId when building worker container name MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Live regression on prod: PR #1226 introduced coalesced-job IDs shaped 
`coalesce:${projectId}:${workItemId}` (e.g. `coalesce:ucho:MNG-413`). `spawnWorker()` blindly concatenated this into the Docker container name (`cascade-worker-coalesce:ucho:MNG-413`), which Docker rejected with HTTP 400 "bad parameter — Invalid container name (...), only [a-zA-Z0-9][a-zA-Z0-9_.-] are allowed". Every coalesced job that fired post-deploy failed at `docker createContainer`, the worker never started, the run was lost. Two fixes considered: - Change the BullMQ jobId format upstream in `queue.ts` to use a Docker-safe separator. Risk: jobIds appear in logs / dedup keys / metrics; switching the canonical form widens the blast radius. - Sanitize at the spawn site only. Lower risk: the original jobId stays intact for logs and dedup; only the Docker-name derivation gets normalized. Chose the second. Replace any character not in `[a-zA-Z0-9_.-]` with `_`. The leading char is `cascade-worker-...` so the Docker-required leading `[a-zA-Z0-9]` is already satisfied. Tests: - New regression pin: jobId `coalesce:ucho:MNG-413` → container name `cascade-worker-coalesce_ucho_MNG-413` (the live failing case). - New positive pin: ordinary jobIds (alphanumeric + dashes/dots/underscores) pass through unchanged. Hotfix scope. Going straight to dev with a tight diff. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/router/container-manager.ts | 8 ++++- tests/unit/router/container-manager.test.ts | 37 +++++++++++++++++++++ 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/src/router/container-manager.ts b/src/router/container-manager.ts index ee75e168..79737e36 100644 --- a/src/router/container-manager.ts +++ b/src/router/container-manager.ts @@ -480,7 +480,13 @@ async function createAndMonitorContainer( */ export async function spawnWorker(job: Job): Promise { const jobId = job.id ?? `unknown-${Date.now()}`; - const containerName = `cascade-worker-${jobId}`; + // Docker container names accept only `[a-zA-Z0-9][a-zA-Z0-9_.-]`. PR #1226 + // introduced coalesced-job IDs shaped `coalesce:${projectId}:${workItemId}` + // where the colons crashed `createContainer` with HTTP 400 — every coalesced + // job that fired post-deploy failed to spawn. Sanitize disallowed chars to + // underscores; the original `jobId` stays intact in logs and dedup keys. + const containerSafeJobId = jobId.replace(/[^a-zA-Z0-9_.-]/g, '_'); + const containerName = `cascade-worker-${containerSafeJobId}`; // Resolve projectId once — used for both credential env and work-item lock tracking const projectId = await extractProjectIdFromJob(job.data); diff --git a/tests/unit/router/container-manager.test.ts b/tests/unit/router/container-manager.test.ts index 79840f64..01a03e6a 100644 --- a/tests/unit/router/container-manager.test.ts +++ b/tests/unit/router/container-manager.test.ts @@ -291,6 +291,43 @@ describe('spawnWorker', () => { resolveWait(); }); + it('sanitizes Docker-invalid characters in jobId when building the container name', async () => { + // Live regression: PR #1226 introduced jobIds shaped `coalesce:${projectId}:${workItemId}` + // (e.g. `coalesce:ucho:MNG-413`). Docker container names allow only + // `[a-zA-Z0-9][a-zA-Z0-9_.-]`, so the colons crashed `createContainer` + // with HTTP 400 "bad parameter — Invalid container name". Every + // coalesced job that fired post-deploy failed to spawn its worker. + // Sanitization replaces the rejected chars (colons here, plus any future + // shape-shift) with underscores. 
+ const { resolveWait } = setupMockContainer(); + + await spawnWorker(makeJob({ id: 'coalesce:ucho:MNG-413' }) as never); + + expect(mockDockerCreateContainer).toHaveBeenCalledWith( + expect.objectContaining({ + name: 'cascade-worker-coalesce_ucho_MNG-413', + }), + ); + + resolveWait(); + }); + + it('passes through Docker-safe jobIds unchanged in the container name', async () => { + // Regression pin: ordinary jobIds (BullMQ default UUIDs, plain strings, + // hyphens, dots, underscores) must not be mangled by the sanitizer. + const { resolveWait } = setupMockContainer(); + + await spawnWorker(makeJob({ id: 'github-1234567890abcdef' }) as never); + + expect(mockDockerCreateContainer).toHaveBeenCalledWith( + expect.objectContaining({ + name: 'cascade-worker-github-1234567890abcdef', + }), + ); + + resolveWait(); + }); + it('cleans up worker after container exits', async () => { const { resolveWait } = setupMockContainer();