diff --git a/.ai/active/SPRINT_PACKET.md b/.ai/active/SPRINT_PACKET.md index e8e8691..b7bcd90 100644 --- a/.ai/active/SPRINT_PACKET.md +++ b/.ai/active/SPRINT_PACKET.md @@ -2,7 +2,7 @@ ## Sprint Title -Sprint 5F: Artifact Chunk Compile Integration V0 +Sprint 5H: Semantic Artifact Chunk Retrieval Primitive ## Sprint Type @@ -10,15 +10,15 @@ feature ## Sprint Reason -Milestone 5 now has deterministic workspace boundaries, explicit artifact records, local text-artifact ingestion, and lexical chunk retrieval. The next safe step is to make those retrieved chunks available to the existing context compiler so document-aware responses can build on durable artifact data instead of isolated read APIs. +Milestone 5 now has deterministic artifact chunk ingestion, lexical retrieval, compile-path lexical artifact inclusion, and durable artifact-chunk embedding storage. The next safe step is a direct semantic retrieval primitive over those stored chunk embeddings, while still deferring compile-path semantic use, hybrid artifact retrieval, connectors, and UI. ## Sprint Intent -Extend the existing context-compile path so it can optionally retrieve and include relevant artifact chunks using the shipped lexical artifact-chunk retrieval seam, without yet adding embeddings, semantic retrieval, Gmail/Calendar connectors, or UI. +Add the first read-side semantic retrieval primitive over stored `task_artifact_chunk_embeddings`, with explicit embedding-config selection and deterministic result ordering, without yet wiring semantic artifact retrieval into the compile path or combining it with lexical artifact retrieval. ## Git Instructions -- Branch Name: `codex/sprint-5f-artifact-chunk-compile-integration-v0` +- Branch Name: `codex/sprint-5h-semantic-artifact-chunk-retrieval` - Base Branch: `main` - PR Strategy: one sprint branch, one PR, no stacked PRs unless Control Tower explicitly opens a follow-up sprint - Merge Policy: squash merge only after reviewer `PASS` and explicit Control Tower merge approval @@ -29,100 +29,104 @@ Extend the existing context-compile path so it can optionally retrieve and inclu - Sprint 5C shipped explicit task-artifact registration. - Sprint 5D shipped deterministic local artifact ingestion into durable chunk rows. - Sprint 5E shipped deterministic lexical retrieval over those chunk rows. -- The next narrow Milestone 5 seam is compile-path integration of those persisted chunk results only, so document-aware context can land without jumping into semantic retrieval, connector work, or richer parsing. +- Sprint 5F shipped compile-path lexical artifact chunk inclusion. +- Sprint 5G shipped durable artifact-chunk embedding persistence tied to existing embedding configs. +- The next narrow Milestone 5 seam is semantic artifact retrieval over those stored vectors only, so later compile adoption and hybrid artifact retrieval can build on an explicit retrieval primitive instead of hidden assumptions. ## In Scope - Define typed contracts for: - - optional artifact-retrieval input on compile requests - - artifact-chunk result items inside the compiled context pack - - artifact-retrieval summary metadata inside compile responses - - artifact-retrieval trace payloads -- Extend the compile path so it can: - - accept an explicit artifact retrieval request scoped to one visible task or one visible artifact - - reuse the existing lexical artifact-chunk retrieval seam - - include retrieved artifact chunks in a separate context-pack section - - record artifact chunk include/exclude decisions in `trace_events` - - preserve deterministic output for the same stored data and inputs -- Ensure compile behavior: - - leaves current continuity, memory, entity, and other context sections intact - - does not merge artifact chunks with memory/entity sections - - excludes non-ingested artifacts - - scopes strictly by user ownership - - uses deterministic ordering and explicit per-section limits + - semantic artifact retrieval requests + - semantic artifact retrieval result items + - retrieval summary metadata +- Implement a narrow semantic retrieval seam that: + - accepts an explicit `embedding_config_id` + - accepts a caller-supplied query vector + - searches only durable `task_artifact_chunk_embeddings` + - joins to visible `task_artifact_chunks` and visible `task_artifacts` + - scopes retrieval by the current user plus one explicit task or one explicit artifact + - validates query-vector dimension against the chosen embedding config + - computes similarity using the stored vectors already persisted in the repo + - returns deterministic ordered chunk results with explicit score metadata + - excludes artifacts that are not yet ingested +- Implement the minimal API or service paths needed for: + - semantic retrieval for one task + - semantic retrieval for one artifact when the caller wants a narrower scope - Add unit and integration tests for: - - compile request validation for artifact retrieval input - - deterministic artifact-chunk section ordering + - dimension validation + - deterministic retrieval ordering and tie-breaking + - scoped retrieval by task and by artifact + - empty-result behavior - exclusion of non-ingested artifacts - - trace logging for included and excluded artifact chunks - - per-user isolation through the compile path - - response-shape stability for the new artifact-chunk section + - per-user isolation + - stable response shape ## Out of Scope -- No embeddings for artifact chunks. -- No semantic retrieval or reranking for artifact chunks. -- No compile-path merge between artifact chunks and memory/entity sections. -- No PDF, DOCX, OCR, or rich document parsing beyond the already-shipped text ingestion seam. +- No compile-path semantic artifact retrieval yet. +- No hybrid lexical plus semantic artifact retrieval. +- No reranking layer beyond direct similarity ordering. +- No model or external API calls to generate query embeddings. +- No richer document parsing beyond the already-shipped local text ingestion seam. - No Gmail or Calendar connector scope. - No runner-style orchestration. - No UI work. ## Required Deliverables -- Stable compile-request and compile-response contract updates for artifact chunk retrieval input and output. -- Compile-path integration with the existing lexical artifact-chunk retrieval seam. -- Trace coverage for artifact retrieval decisions inside compile runs. -- Unit and integration coverage for compile-path artifact behavior, ordering, exclusion rules, and isolation. +- Stable semantic artifact retrieval request and response contracts. +- Minimal deterministic semantic retrieval path over existing `task_artifact_chunk_embeddings`. +- Unit and integration coverage for ordering, validation, scoping, exclusion rules, and isolation. - Updated `BUILD_REPORT.md` with exact verification results and explicit deferred scope. ## Acceptance Criteria -- `POST /v0/context/compile` can optionally accept artifact retrieval input and return a separate artifact-chunk section in the context pack. -- Compile-path artifact retrieval uses only durable `task_artifact_chunks` rows already persisted in the repo. -- Non-ingested artifacts are excluded from compile-path artifact results. -- Artifact include/exclude decisions are persisted in `trace_events`. -- Result ordering is deterministic within the artifact-chunk section. +- A client can submit a query vector plus `embedding_config_id` and retrieve relevant visible artifact chunks for one task. +- A client can submit a query vector plus `embedding_config_id` and retrieve relevant visible artifact chunks for one artifact. +- Retrieval uses only durable `task_artifact_chunk_embeddings`, `task_artifact_chunks`, and artifact records already persisted in the repo. +- Retrieval rejects missing configs, dimension mismatches, and cross-user access deterministically. +- Non-ingested artifacts are excluded from semantic retrieval results. +- Result ordering is deterministic and documented. - `./.venv/bin/python -m pytest tests/unit` passes. - `./.venv/bin/python -m pytest tests/integration` passes. -- No embeddings, semantic retrieval, connector, runner, UI, or broader side-effect scope enters the sprint. +- No compile integration changes, hybrid retrieval, connector, runner, UI, or broader side-effect scope enters the sprint. ## Implementation Constraints -- Keep compile integration narrow and boring. -- Reuse the existing artifact retrieval seam; do not read raw files during compile. -- Keep artifact chunks in a separate response section from memory/entity context. -- Do not introduce semantic retrieval, embeddings, or ranking in this sprint. -- Keep scope explicit: one task or one artifact retrieval scope per compile request. +- Keep semantic retrieval narrow and boring. +- Reuse existing embedding configs and durable artifact chunk embeddings; do not introduce a second embedding store. +- Use explicit caller-selected config and query vector input; do not auto-pick configs. +- Keep scope explicit: one task or one artifact per request. +- Do not merge semantic artifact retrieval into the main compiler in the same sprint. ## Suggested Work Breakdown -1. Define compile contract updates for optional artifact retrieval input and output. -2. Integrate the existing lexical artifact-chunk retrieval seam into the compile path. -3. Add artifact result summaries and trace-event payloads. -4. Preserve current context sections while adding a separate artifact-chunk section. +1. Define semantic artifact retrieval request and response contracts. +2. Implement deterministic similarity search over existing artifact chunk embeddings. +3. Add explicit task-scoped and artifact-scoped semantic retrieval paths. +4. Enforce config validation, non-ingested exclusion, and current-user isolation. 5. Add unit and integration tests. 6. Update `BUILD_REPORT.md` with executed verification. ## Build Report Requirements `BUILD_REPORT.md` must include: -- the exact compile contract changes introduced -- the artifact retrieval matching and ordering rule used +- the exact semantic artifact retrieval contracts introduced +- the similarity metric and ordering rule used - exact commands run - unit and integration test results -- one example compile request and response showing the artifact-chunk section -- one example of artifact-retrieval trace events inside one compile run +- one example task-scoped semantic retrieval response +- one example artifact-scoped semantic retrieval response - what remains intentionally deferred to later milestones ## Review Focus `REVIEW_REPORT.md` should verify: -- the sprint stayed limited to compile-path artifact chunk integration -- artifact retrieval reuses durable chunk rows and the existing lexical retrieval seam -- ordering, exclusion rules, trace visibility, and isolation are test-backed -- no hidden embeddings, semantic retrieval, connector, runner, UI, or broader side-effect scope entered the sprint +- the sprint stayed limited to the semantic artifact chunk retrieval primitive +- retrieval is explicit-config, durable-source-only, and validation-backed +- ordering, exclusion rules, and isolation are test-backed +- no hidden compile integration changes, hybrid retrieval, connector, runner, UI, or broader side-effect scope entered the sprint ## Exit Condition -This sprint is complete when the repo can optionally include retrieved artifact chunks inside `POST /v0/context/compile`, trace those inclusion decisions, and verify the full path with Postgres-backed tests, while still deferring semantic retrieval, embeddings, connector work, and UI. +This sprint is complete when the repo can retrieve relevant ingested artifact chunks through a deterministic semantic read path scoped to one task or one artifact, verify the full path with Postgres-backed tests, and still defer compile-path semantic use, hybrid artifact retrieval, connectors, and UI. diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index 0e9f72d..0d939f7 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -2,16 +2,16 @@ ## Current Implemented Slice -AliceBot now implements the accepted repo slice through Sprint 5G. The shipped backend includes: +AliceBot now implements the accepted repo slice through Sprint 5H. The shipped backend includes: - foundation continuity storage over `users`, `threads`, `sessions`, and append-only `events` - deterministic tracing and context compilation over durable continuity, memory, entity, and entity-edge records - governed memory admission, explicit-preference extraction, memory review labels, review queue reads, evaluation summary reads, explicit embedding config and memory-embedding storage, direct semantic retrieval, and deterministic hybrid compile-path memory merge - deterministic prompt assembly and one no-tools response path that persists assistant replies as immutable continuity events - user-scoped consents, policies, policy evaluation, tool registry, allowlist evaluation, tool routing, approval request persistence, approval resolution, approved-only proxy execution through the in-process `proxy.echo` handler, durable execution review, and execution-budget lifecycle plus enforcement -- durable `tasks`, `task_steps`, `task_workspaces`, `task_artifacts`, `task_artifact_chunks`, and `task_artifact_chunk_embeddings`, deterministic task-step sequencing, explicit task-step transitions, explicit manual continuation with lineage through `parent_step_id`, `source_approval_id`, and `source_execution_id`, explicit `tool_executions.task_step_id` linkage for execution synchronization, deterministic rooted local task-workspace provisioning, explicit rooted local artifact registration, deterministic local text-artifact ingestion into durable chunk rows, deterministic lexical artifact-chunk retrieval over durable chunk rows, optional compile-path artifact chunk inclusion as a separate context section, and explicit user-scoped artifact-chunk embedding persistence tied to existing embedding configs +- durable `tasks`, `task_steps`, `task_workspaces`, `task_artifacts`, `task_artifact_chunks`, and `task_artifact_chunk_embeddings`, deterministic task-step sequencing, explicit task-step transitions, explicit manual continuation with lineage through `parent_step_id`, `source_approval_id`, and `source_execution_id`, explicit `tool_executions.task_step_id` linkage for execution synchronization, deterministic rooted local task-workspace provisioning, explicit rooted local artifact registration, deterministic local text-artifact ingestion into durable chunk rows, deterministic lexical artifact-chunk retrieval over durable chunk rows, optional compile-path artifact chunk inclusion as a separate context section, explicit user-scoped artifact-chunk embedding persistence tied to existing embedding configs, and explicit task-scoped or artifact-scoped semantic artifact-chunk retrieval over those durable embeddings -The current multi-step boundary is narrow and explicit. Manual continuation is implemented and review-passed. Approval resolution and proxy execution now both use explicit task-step linkage rather than first-step inference. Task workspaces are now implemented only as deterministic rooted local boundaries, and task artifacts are now implemented only as explicit rooted local-file registrations, narrow deterministic text ingestion under those workspaces, lexical retrieval over persisted chunk rows, optional compile-path inclusion of retrieved artifact chunks in a separate response section, and explicit artifact-chunk embedding storage tied to existing embedding configs. Broader runner-style orchestration, automatic multi-step progression, artifact-chunk semantic retrieval, rich-document parsing, connectors, and new side-effect surfaces are still planned later and must not be described as live behavior. +The current multi-step boundary is narrow and explicit. Manual continuation is implemented and review-passed. Approval resolution and proxy execution now both use explicit task-step linkage rather than first-step inference. Task workspaces are now implemented only as deterministic rooted local boundaries, and task artifacts are now implemented only as explicit rooted local-file registrations, narrow deterministic text ingestion under those workspaces, lexical retrieval over persisted chunk rows, optional compile-path inclusion of retrieved artifact chunks in a separate response section, explicit artifact-chunk embedding storage tied to existing embedding configs, and direct semantic retrieval over those durable artifact-chunk embeddings for one visible task or one visible artifact at a time. Broader runner-style orchestration, automatic multi-step progression, compile-path semantic artifact use, hybrid artifact retrieval, richer document parsing, connectors, and new side-effect surfaces are still planned later and must not be described as live behavior. ## Implemented Now @@ -24,7 +24,7 @@ The current multi-step boundary is narrow and explicit. Manual continuation is i - memory and retrieval: `POST /v0/memories/admit`, `POST /v0/memories/extract-explicit-preferences`, `GET /v0/memories`, `GET /v0/memories/review-queue`, `GET /v0/memories/evaluation-summary`, `POST /v0/memories/semantic-retrieval`, `GET /v0/memories/{memory_id}`, `GET /v0/memories/{memory_id}/revisions`, `POST /v0/memories/{memory_id}/labels`, `GET /v0/memories/{memory_id}/labels` - embeddings and graph seams: `POST /v0/embedding-configs`, `GET /v0/embedding-configs`, `POST /v0/memory-embeddings`, `GET /v0/memories/{memory_id}/embeddings`, `GET /v0/memory-embeddings/{memory_embedding_id}`, `POST /v0/entities`, `GET /v0/entities`, `GET /v0/entities/{entity_id}`, `POST /v0/entity-edges`, `GET /v0/entities/{entity_id}/edges` - governance: `POST /v0/consents`, `GET /v0/consents`, `POST /v0/policies`, `GET /v0/policies`, `GET /v0/policies/{policy_id}`, `POST /v0/policies/evaluate`, `POST /v0/tools`, `GET /v0/tools`, `GET /v0/tools/{tool_id}`, `POST /v0/tools/allowlist/evaluate`, `POST /v0/tools/route`, `POST /v0/approvals/requests`, `GET /v0/approvals`, `GET /v0/approvals/{approval_id}`, `POST /v0/approvals/{approval_id}/approve`, `POST /v0/approvals/{approval_id}/reject`, `POST /v0/approvals/{approval_id}/execute` -- task and execution review: `GET /v0/tasks`, `GET /v0/tasks/{task_id}`, `POST /v0/tasks/{task_id}/workspace`, `GET /v0/task-workspaces`, `GET /v0/task-workspaces/{task_workspace_id}`, `POST /v0/task-workspaces/{task_workspace_id}/artifacts`, `GET /v0/task-artifacts`, `GET /v0/task-artifacts/{task_artifact_id}`, `POST /v0/task-artifacts/{task_artifact_id}/ingest`, `GET /v0/task-artifacts/{task_artifact_id}/chunks`, `POST /v0/tasks/{task_id}/artifact-chunks/retrieve`, `POST /v0/task-artifacts/{task_artifact_id}/chunks/retrieve`, `POST /v0/task-artifact-chunk-embeddings`, `GET /v0/task-artifacts/{task_artifact_id}/chunk-embeddings`, `GET /v0/task-artifact-chunks/{task_artifact_chunk_id}/embeddings`, `GET /v0/task-artifact-chunk-embeddings/{task_artifact_chunk_embedding_id}`, `GET /v0/tasks/{task_id}/steps`, `GET /v0/task-steps/{task_step_id}`, `POST /v0/tasks/{task_id}/steps`, `POST /v0/task-steps/{task_step_id}/transition`, `POST /v0/execution-budgets`, `GET /v0/execution-budgets`, `GET /v0/execution-budgets/{execution_budget_id}`, `POST /v0/execution-budgets/{execution_budget_id}/deactivate`, `POST /v0/execution-budgets/{execution_budget_id}/supersede`, `GET /v0/tool-executions`, `GET /v0/tool-executions/{execution_id}` +- task and execution review: `GET /v0/tasks`, `GET /v0/tasks/{task_id}`, `POST /v0/tasks/{task_id}/workspace`, `GET /v0/task-workspaces`, `GET /v0/task-workspaces/{task_workspace_id}`, `POST /v0/task-workspaces/{task_workspace_id}/artifacts`, `GET /v0/task-artifacts`, `GET /v0/task-artifacts/{task_artifact_id}`, `POST /v0/task-artifacts/{task_artifact_id}/ingest`, `GET /v0/task-artifacts/{task_artifact_id}/chunks`, `POST /v0/tasks/{task_id}/artifact-chunks/retrieve`, `POST /v0/task-artifacts/{task_artifact_id}/chunks/retrieve`, `POST /v0/tasks/{task_id}/artifact-chunks/semantic-retrieval`, `POST /v0/task-artifacts/{task_artifact_id}/chunks/semantic-retrieval`, `POST /v0/task-artifact-chunk-embeddings`, `GET /v0/task-artifacts/{task_artifact_id}/chunk-embeddings`, `GET /v0/task-artifact-chunks/{task_artifact_chunk_id}/embeddings`, `GET /v0/task-artifact-chunk-embeddings/{task_artifact_chunk_embedding_id}`, `GET /v0/tasks/{task_id}/steps`, `GET /v0/task-steps/{task_step_id}`, `POST /v0/tasks/{task_id}/steps`, `POST /v0/task-steps/{task_step_id}/transition`, `POST /v0/execution-budgets`, `GET /v0/execution-budgets`, `GET /v0/execution-budgets/{execution_budget_id}`, `POST /v0/execution-budgets/{execution_budget_id}/deactivate`, `POST /v0/execution-budgets/{execution_budget_id}/supersede`, `GET /v0/tool-executions`, `GET /v0/tool-executions/{execution_id}` - `apps/web` and `workers` remain starter shells only. ### Data Foundation @@ -58,11 +58,11 @@ The current multi-step boundary is narrow and explicit. Manual continuation is i ### Repo Boundaries In This Slice -- `apps/api`: implemented API, store, contracts, service logic, and migrations for continuity, tracing, memory, embeddings, entities, policies, tools, approvals, proxy execution, execution budgets, tasks, task steps, task workspaces, task artifacts, artifact-chunk embeddings, deterministic lexical artifact chunk retrieval, and narrow compile-path artifact chunk inclusion. +- `apps/api`: implemented API, store, contracts, service logic, and migrations for continuity, tracing, memory, embeddings, entities, policies, tools, approvals, proxy execution, execution budgets, tasks, task steps, task workspaces, task artifacts, artifact-chunk embeddings, deterministic lexical artifact chunk retrieval, deterministic semantic artifact chunk retrieval over durable embeddings, and narrow compile-path artifact chunk inclusion. - `apps/web`: minimal shell only; no shipped workflow UI. - `workers`: scaffold only; no background jobs or runner logic are implemented. - `infra`: local development bootstrap assets only. -- `tests`: unit and Postgres-backed integration coverage for the shipped seams above, including Sprint 4O task-step lineage/manual continuation, Sprint 4S step-linked execution synchronization, Sprint 5A task-workspace provisioning, Sprint 5C task-artifact registration, Sprint 5D local artifact ingestion plus chunk reads, Sprint 5E lexical artifact-chunk retrieval, Sprint 5F compile-path artifact chunk integration, and Sprint 5G artifact-chunk embedding persistence and reads. +- `tests`: unit and Postgres-backed integration coverage for the shipped seams above, including Sprint 4O task-step lineage/manual continuation, Sprint 4S step-linked execution synchronization, Sprint 5A task-workspace provisioning, Sprint 5C task-artifact registration, Sprint 5D local artifact ingestion plus chunk reads, Sprint 5E lexical artifact-chunk retrieval, Sprint 5F compile-path artifact chunk integration, Sprint 5G artifact-chunk embedding persistence and reads, and Sprint 5H semantic artifact-chunk retrieval. ## Core Flows Implemented Now @@ -76,6 +76,15 @@ The current multi-step boundary is narrow and explicit. Manual continuation is i 6. Persist a `context.compile` trace plus explicit inclusion and exclusion events, including artifact chunk include/exclude decisions. 7. Return one deterministic `context_pack` describing scope, limits, selected context, artifact chunk results, and trace metadata. +### Artifact Chunk Retrieval + +1. Register and ingest visible local artifacts into durable `task_artifacts` and `task_artifact_chunks`. +2. Persist explicit artifact-chunk embeddings in `task_artifact_chunk_embeddings`, keyed to an existing visible embedding config. +3. Support deterministic lexical artifact-chunk retrieval for one visible task or one visible artifact. +4. Support deterministic semantic artifact-chunk retrieval for one visible task or one visible artifact, using a caller-supplied query vector plus explicit `embedding_config_id`. +5. Exclude artifacts whose `ingestion_status` is not `ingested`. +6. Keep compile-path artifact retrieval separate and lexical-only for now; semantic artifact retrieval remains a direct read seam outside compile. + ### Governed Memory And Retrieval 1. Accept explicit memory candidates through `POST /v0/memories/admit`. @@ -227,7 +236,7 @@ The current multi-step boundary is narrow and explicit. Manual continuation is i ## Testing Coverage Implemented Now -- Unit and integration tests cover continuity, compiler, response generation, memory admission, review labels, review queue, embeddings, semantic retrieval, entities, policies, tools, approvals, proxy execution, execution budgets, and execution review. +- Unit and integration tests cover continuity, compiler, response generation, memory admission, review labels, review queue, embeddings, semantic retrieval, artifact semantic retrieval, entities, policies, tools, approvals, proxy execution, execution budgets, and execution review. - Sprint 4O, Sprint 4S, Sprint 5A, and Sprint 5C added explicit task lifecycle coverage: - migrations for `tasks`, `task_steps`, and task-step lineage - staged/backfilled migration coverage for `tool_executions.task_step_id` @@ -260,7 +269,7 @@ The current multi-step boundary is narrow and explicit. Manual continuation is i The following areas remain planned later and must not be described as implemented: - runner-style orchestration and automatic multi-step progression beyond the current explicit manual continuation seam -- artifact chunk ranking beyond the current lexical match ordering, plus embeddings and semantic retrieval for artifact chunks +- hybrid lexical plus semantic artifact retrieval, compile-path semantic artifact use, and reranking beyond the current direct lexical and direct semantic ordering seams - rich document parsing beyond the current narrow UTF-8 text and markdown ingestion boundary - read-only Gmail and Calendar connectors - broader tool proxying and real-world side effects beyond the current no-I/O `proxy.echo` handler diff --git a/BUILD_REPORT.md b/BUILD_REPORT.md index fdcf29b..afabc87 100644 --- a/BUILD_REPORT.md +++ b/BUILD_REPORT.md @@ -2,166 +2,212 @@ ## sprint objective -Implement Sprint 5G: Artifact Chunk Embedding Substrate by adding durable, user-scoped `task_artifact_chunk_embeddings` records tied to existing `embedding_configs`, with strict vector validation, deterministic reads, and no semantic retrieval, compile-path semantic use, connector, runner, or UI changes. +Implement Sprint 5H: Semantic Artifact Chunk Retrieval Primitive by adding a deterministic, explicit-config semantic retrieval path over durable `task_artifact_chunk_embeddings`, scoped to one task or one artifact, without changing compile behavior or introducing hybrid retrieval, connectors, runners, or UI work. ## completed work -- Added Alembic revision `20260314_0025_task_artifact_chunk_embeddings` for a new `task_artifact_chunk_embeddings` table. -- Added schema for `task_artifact_chunk_embeddings`: - - columns: `id`, `user_id`, `task_artifact_chunk_id`, `embedding_config_id`, `dimensions`, `vector`, `created_at`, `updated_at` - - uniqueness: - - `UNIQUE (id, user_id)` - - `UNIQUE (user_id, task_artifact_chunk_id, embedding_config_id)` - - foreign keys: - - `(task_artifact_chunk_id, user_id) -> task_artifact_chunks(id, user_id)` - - `(embedding_config_id, user_id) -> embedding_configs(id, user_id)` - - checks: - - `dimensions > 0` - - `vector` is a JSON array - - `vector` is non-empty - - `jsonb_array_length(vector) = dimensions` - - index: - - `task_artifact_chunk_embeddings_user_chunk_created_idx (user_id, task_artifact_chunk_id, created_at, id)` - - security/runtime: - - owner-only RLS - - `GRANT SELECT, INSERT, UPDATE ON task_artifact_chunk_embeddings TO alicebot_app` -- Added stable contracts: - - `TaskArtifactChunkEmbeddingUpsertInput` - - `TaskArtifactChunkEmbeddingRecord` - - `TaskArtifactChunkEmbeddingWriteResponse` - - `TaskArtifactChunkEmbeddingDetailResponse` - - `TaskArtifactChunkEmbeddingListScope` - - `TaskArtifactChunkEmbeddingListSummary` - - `TaskArtifactChunkEmbeddingListResponse` - - `TASK_ARTIFACT_CHUNK_EMBEDDING_LIST_ORDER = ["task_artifact_chunk_sequence_no_asc", "created_at_asc", "id_asc"]` -- Implemented artifact-chunk embedding service behavior: - - validates `task_artifact_chunk_id` against visible `task_artifact_chunks` - - validates `embedding_config_id` against visible `embedding_configs` - - reuses the existing versioned `embedding_configs` seam without a second config/version model - - validates every vector element as finite numeric input - - enforces `len(vector) == embedding_config.dimensions` - - upserts one embedding per `(task_artifact_chunk_id, embedding_config_id)` pair - - exposes deterministic reads by: - - artifact scope - - chunk scope - - embedding id +- Added semantic artifact retrieval contracts: + - `TaskScopedSemanticArtifactChunkRetrievalInput` + - `task_id` + - `embedding_config_id` + - `query_vector` + - `limit` + - `ArtifactScopedSemanticArtifactChunkRetrievalInput` + - `task_artifact_id` + - `embedding_config_id` + - `query_vector` + - `limit` + - `TaskArtifactChunkSemanticRetrievalItem` + - `id` + - `task_id` + - `task_artifact_id` + - `relative_path` + - `media_type` + - `sequence_no` + - `char_start` + - `char_end_exclusive` + - `text` + - `score` + - `TaskArtifactChunkSemanticRetrievalSummary` + - `embedding_config_id` + - `query_vector_dimensions` + - `limit` + - `returned_count` + - `searched_artifact_count` + - `similarity_metric` + - `order` + - `scope` + - `TaskArtifactChunkSemanticRetrievalResponse` + - `TASK_ARTIFACT_CHUNK_SEMANTIC_RETRIEVAL_ORDER = ["score_desc", "relative_path_asc", "sequence_no_asc", "id_asc"]` +- Implemented semantic artifact retrieval validation and service logic: + - validates that `embedding_config_id` resolves to a visible embedding config + - validates that every query-vector element is finite numeric input + - validates `len(query_vector) == embedding_config.dimensions` + - requires one explicit scope: + - task-scoped retrieval via visible `task_id` + - artifact-scoped retrieval via visible `task_artifact_id` + - excludes artifacts whose `ingestion_status` is not `ingested` + - preserves user isolation through the existing visible-row store lookups +- Added deterministic store queries over durable artifact embedding rows only: + - task scope joins: + - `task_artifact_chunk_embeddings` + - `task_artifact_chunks` + - `task_artifacts` + - artifact scope joins the same durable tables with a narrower artifact filter + - no compile-path semantic use was added + - no second embedding store was introduced - Added minimal API surface: - - `POST /v0/task-artifact-chunk-embeddings` - - `GET /v0/task-artifacts/{task_artifact_id}/chunk-embeddings` - - `GET /v0/task-artifact-chunks/{task_artifact_chunk_id}/embeddings` - - `GET /v0/task-artifact-chunk-embeddings/{task_artifact_chunk_embedding_id}` -- Added unit and integration coverage for: - - persistence - - deterministic ordering + - `POST /v0/tasks/{task_id}/artifact-chunks/semantic-retrieval` + - `POST /v0/task-artifacts/{task_artifact_id}/chunks/semantic-retrieval` +- Added tests for: - dimension validation - - invalid config and invalid chunk references - - cross-user isolation + - deterministic ordering and tie-breaking + - task-scoped retrieval + - artifact-scoped retrieval + - empty-result behavior + - exclusion of non-ingested artifacts + - per-user isolation - stable response shape - - migration presence, RLS, grants, and downgrade behavior -## embedding-config reuse rule and dimension-validation rule used +## similarity metric and ordering rule used -- Reuse rule: - - every artifact-chunk embedding must reference an existing visible `embedding_config` - - no new embedding versioning model was introduced -- Dimension-validation rule: - - vector normalization accepts only finite numeric values - - write requests fail unless `len(vector) == embedding_config.dimensions` - - database and service validation both enforce the dimensions rule +- Similarity metric: + - `cosine_similarity` + - computed in SQL as `1 - (embeddings.vector <=> query_vector)` via pgvector cosine distance +- Ordering rule: + - `score DESC` + - `relative_path ASC` + - `sequence_no ASC` + - `id ASC` +- Durable source restriction: + - retrieval reads only from persisted `task_artifact_chunk_embeddings`, `task_artifact_chunks`, and `task_artifacts` ## incomplete work -- None within Sprint 5G scope. +- None within Sprint 5H scope. ## files changed -- `apps/api/alembic/versions/20260314_0025_task_artifact_chunk_embeddings.py` - `apps/api/src/alicebot_api/contracts.py` -- `apps/api/src/alicebot_api/embedding.py` - `apps/api/src/alicebot_api/main.py` +- `apps/api/src/alicebot_api/semantic_retrieval.py` - `apps/api/src/alicebot_api/store.py` -- `tests/integration/test_migrations.py` -- `tests/integration/test_task_artifact_chunk_embeddings_api.py` -- `tests/unit/test_20260314_0025_task_artifact_chunk_embeddings.py` -- `tests/unit/test_task_artifact_chunk_embedding.py` -- `tests/unit/test_task_artifact_chunk_embedding_store.py` +- `tests/integration/test_semantic_artifact_chunk_retrieval_api.py` +- `tests/unit/test_artifacts_main.py` - `tests/unit/test_main.py` +- `tests/unit/test_semantic_retrieval.py` +- `tests/unit/test_task_artifact_chunk_embedding_store.py` - `BUILD_REPORT.md` -## example artifact-chunk embedding write response +## tests run + +- `./.venv/bin/python -m pytest tests/unit/test_semantic_retrieval.py tests/unit/test_task_artifact_chunk_embedding_store.py tests/unit/test_artifacts_main.py tests/unit/test_main.py` + - result: `65 passed in 0.55s` +- `./.venv/bin/python -m pytest tests/integration/test_semantic_artifact_chunk_retrieval_api.py` + - first sandboxed attempt failed because local Postgres access to `localhost:5432` was blocked by the sandbox +- `./.venv/bin/python -m pytest tests/integration/test_semantic_artifact_chunk_retrieval_api.py` + - result after allowing local Postgres access: `3 passed in 1.23s` +- `./.venv/bin/python -m pytest tests/unit` + - result: `377 passed in 0.59s` +- `./.venv/bin/python -m pytest tests/integration` + - result: `114 passed in 34.94s` + +## example task-scoped semantic retrieval response ```json { - "embedding": { - "id": "4d5d0a3b-6a8a-4bf4-bb7c-d1df3d6d84c8", - "task_artifact_id": "6dc8f07d-19f6-4667-b9f3-4573b9cf2b66", - "task_artifact_chunk_id": "fd3dc999-a4d3-4bb0-a287-4f4950dfd7e0", - "task_artifact_chunk_sequence_no": 2, - "embedding_config_id": "42dbab76-1e02-4b5f-a18b-f59c1b19d1d4", - "dimensions": 3, - "vector": [0.9, 0.8, 0.7], - "created_at": "2026-03-14T12:00:00+00:00", - "updated_at": "2026-03-14T12:10:00+00:00" - }, - "write_mode": "updated" + "items": [ + { + "id": "11111111-1111-1111-1111-111111111111", + "task_id": "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa", + "task_artifact_id": "22222222-2222-2222-2222-222222222222", + "relative_path": "docs/a.txt", + "media_type": "text/plain", + "sequence_no": 1, + "char_start": 0, + "char_end_exclusive": 9, + "text": "alpha doc", + "score": 1.0 + }, + { + "id": "33333333-3333-3333-3333-333333333333", + "task_id": "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa", + "task_artifact_id": "44444444-4444-4444-4444-444444444444", + "relative_path": "notes/b.md", + "media_type": "text/markdown", + "sequence_no": 1, + "char_start": 0, + "char_end_exclusive": 10, + "text": "alpha note", + "score": 1.0 + } + ], + "summary": { + "embedding_config_id": "bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb", + "query_vector_dimensions": 3, + "limit": 10, + "returned_count": 2, + "searched_artifact_count": 3, + "similarity_metric": "cosine_similarity", + "order": ["score_desc", "relative_path_asc", "sequence_no_asc", "id_asc"], + "scope": { + "kind": "task", + "task_id": "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa" + } + } } ``` -## example artifact-chunk embedding list response +## example artifact-scoped semantic retrieval response ```json { "items": [ { - "id": "4d5d0a3b-6a8a-4bf4-bb7c-d1df3d6d84c8", - "task_artifact_id": "6dc8f07d-19f6-4667-b9f3-4573b9cf2b66", - "task_artifact_chunk_id": "fd3dc999-a4d3-4bb0-a287-4f4950dfd7e0", - "task_artifact_chunk_sequence_no": 2, - "embedding_config_id": "42dbab76-1e02-4b5f-a18b-f59c1b19d1d4", - "dimensions": 3, - "vector": [0.9, 0.8, 0.7], - "created_at": "2026-03-14T12:00:00+00:00", - "updated_at": "2026-03-14T12:10:00+00:00" + "id": "33333333-3333-3333-3333-333333333333", + "task_id": "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa", + "task_artifact_id": "44444444-4444-4444-4444-444444444444", + "relative_path": "notes/b.md", + "media_type": "text/markdown", + "sequence_no": 1, + "char_start": 0, + "char_end_exclusive": 10, + "text": "alpha note", + "score": 1.0 } ], "summary": { - "total_count": 1, - "order": ["task_artifact_chunk_sequence_no_asc", "created_at_asc", "id_asc"], + "embedding_config_id": "bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb", + "query_vector_dimensions": 3, + "limit": 10, + "returned_count": 1, + "searched_artifact_count": 1, + "similarity_metric": "cosine_similarity", + "order": ["score_desc", "relative_path_asc", "sequence_no_asc", "id_asc"], "scope": { - "kind": "chunk", - "task_artifact_id": "6dc8f07d-19f6-4667-b9f3-4573b9cf2b66", - "task_artifact_chunk_id": "fd3dc999-a4d3-4bb0-a287-4f4950dfd7e0" + "kind": "artifact", + "task_id": "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa", + "task_artifact_id": "44444444-4444-4444-4444-444444444444" } } } ``` -## tests run - -- `./.venv/bin/python -m pytest tests/unit/test_20260314_0025_task_artifact_chunk_embeddings.py tests/unit/test_task_artifact_chunk_embedding.py tests/unit/test_task_artifact_chunk_embedding_store.py tests/unit/test_main.py` - - result: `48 passed in 0.41s` -- `./.venv/bin/python -m pytest tests/integration/test_task_artifact_chunk_embeddings_api.py tests/integration/test_migrations.py` - - first sandboxed attempt failed to reach local Postgres on `localhost:5432` due sandbox restrictions -- `./.venv/bin/python -m pytest tests/integration/test_migrations.py::test_migrations_upgrade_and_downgrade tests/integration/test_task_artifact_chunk_embeddings_api.py` - - result: `4 passed in 1.99s` -- `./.venv/bin/python -m pytest tests/unit` - - result: `370 passed in 0.59s` -- `./.venv/bin/python -m pytest tests/integration` - - result: `111 passed in 34.92s` - ## blockers/issues -- No code blockers remained. -- Integration verification required elevated access to the local Postgres instance because sandboxed localhost connections were blocked. +- No code blocker remained after implementation. +- Integration verification required access to the local Postgres instance because sandboxed localhost TCP connections were blocked. ## what remains intentionally deferred to later milestones -- semantic retrieval over artifact chunks +- compile-path semantic artifact retrieval - lexical plus semantic hybrid artifact retrieval -- compile-path semantic use of artifact embeddings -- embedding generation via model or external API calls -- connectors, runners, orchestration, and UI work +- reranking beyond direct similarity ordering +- query embedding generation through a model or external API +- connectors +- runner orchestration +- UI work ## recommended next step -Use the new durable `task_artifact_chunk_embeddings` substrate to add a separate, narrowly scoped semantic artifact retrieval sprint that reads these stored vectors without changing compile-path behavior in the same step. +Adopt this new semantic artifact retrieval primitive in a follow-up sprint that explicitly decides how compile should consume semantic artifact chunks, without combining that change with hybrid retrieval or reranking in the same step. diff --git a/REVIEW_REPORT.md b/REVIEW_REPORT.md index acff7a4..ff292f5 100644 --- a/REVIEW_REPORT.md +++ b/REVIEW_REPORT.md @@ -6,16 +6,30 @@ PASS ## criteria met -- The sprint remains technically narrow and limited to the artifact-chunk embedding substrate: migration, contracts, store/service logic, and minimal embedding read/write routes are present in [20260314_0025_task_artifact_chunk_embeddings.py](/Users/samirusani/Desktop/Codex/AliceBot/apps/api/alembic/versions/20260314_0025_task_artifact_chunk_embeddings.py), [embedding.py](/Users/samirusani/Desktop/Codex/AliceBot/apps/api/src/alicebot_api/embedding.py#L315), [main.py](/Users/samirusani/Desktop/Codex/AliceBot/apps/api/src/alicebot_api/main.py#L1968), and [store.py](/Users/samirusani/Desktop/Codex/AliceBot/apps/api/src/alicebot_api/store.py). -- Writes attach one validated vector to one visible `task_artifact_chunk` under one visible `embedding_config`, reject missing refs and dimension mismatches, and preserve user isolation through existing ownership seams. See [embedding.py](/Users/samirusani/Desktop/Codex/AliceBot/apps/api/src/alicebot_api/embedding.py#L323). -- Reads are deterministic and user-scoped. The migration enforces composite ownership-linked foreign keys and RLS, and list ordering is explicit in both contracts and queries. See [contracts.py](/Users/samirusani/Desktop/Codex/AliceBot/apps/api/src/alicebot_api/contracts.py), [store.py](/Users/samirusani/Desktop/Codex/AliceBot/apps/api/src/alicebot_api/store.py), and [20260314_0025_task_artifact_chunk_embeddings.py](/Users/samirusani/Desktop/Codex/AliceBot/apps/api/alembic/versions/20260314_0025_task_artifact_chunk_embeddings.py#L15). -- Coverage remains adequate for the sprint packet: persistence, ordering, invalid refs, dimension validation, isolation, route shape, and migration upgrade/downgrade are test-backed. -- Prior runtime verification for this review cycle remains valid: - - `./.venv/bin/python -m pytest tests/unit` -> `370 passed` - - `./.venv/bin/python -m pytest tests/integration` -> `111 passed` -- The follow-up addressed the remaining review findings: - - [ARCHITECTURE.md](/Users/samirusani/Desktop/Codex/AliceBot/ARCHITECTURE.md) now reflects Sprint 5G as implemented, includes the new embedding routes and table, and no longer describes artifact-chunk embeddings as deferred. - - [RULES.md](/Users/samirusani/Desktop/Codex/AliceBot/RULES.md#L6) now makes [.ai/active/SPRINT_PACKET.md](/Users/samirusani/Desktop/Codex/AliceBot/.ai/active/SPRINT_PACKET.md) an immutable control/input artifact during implementation unless Control Tower changes the sprint. +- Sprint scope stayed narrow. The code changes remain limited to semantic artifact-chunk retrieval contracts, service logic, store queries, routes, tests, and the build report. +- Typed contracts for task-scoped and artifact-scoped semantic retrieval were added in `apps/api/src/alicebot_api/contracts.py`. +- The retrieval seam requires an explicit `embedding_config_id`, accepts a caller-supplied `query_vector`, validates finite numeric input, and rejects dimension mismatches in `apps/api/src/alicebot_api/semantic_retrieval.py`. +- Retrieval reads only from durable `task_artifact_chunk_embeddings`, `task_artifact_chunks`, and `task_artifacts`, with explicit task or artifact scope and deterministic ordering in `apps/api/src/alicebot_api/store.py`. +- Non-ingested artifacts are excluded from result rows in SQL and from `searched_artifact_count` summaries. +- Minimal API surface was added for both scopes in `apps/api/src/alicebot_api/main.py`: + - `POST /v0/tasks/{task_id}/artifact-chunks/semantic-retrieval` + - `POST /v0/task-artifacts/{task_artifact_id}/chunks/semantic-retrieval` +- Required test coverage is present: + - unit coverage for stable response shape, validation, task scope, artifact scope, and non-ingested behavior in `tests/unit/test_semantic_retrieval.py` + - route coverage in `tests/unit/test_artifacts_main.py` and `tests/unit/test_main.py` + - store-query coverage in `tests/unit/test_task_artifact_chunk_embedding_store.py` + - Postgres-backed integration coverage for deterministic ordering, scoping, empty results, exclusion rules, and per-user isolation in `tests/integration/test_semantic_artifact_chunk_retrieval_api.py` +- Verification already performed during review: + - `./.venv/bin/python -m pytest tests/unit` -> `377 passed in 0.58s` + - `./.venv/bin/python -m pytest tests/integration/test_semantic_artifact_chunk_retrieval_api.py` -> `3 passed in 1.36s` + - `./.venv/bin/python -m pytest tests/integration` -> `114 passed in 36.27s` +- `BUILD_REPORT.md` includes the new contracts, ordering rule, commands run, examples, and deferred scope. +- `ARCHITECTURE.md` now matches Sprint 5H: + - implemented slice updated to Sprint 5H + - semantic artifact-chunk retrieval described as shipped behavior + - semantic artifact retrieval endpoints listed in the runtime inventory + - repo/testing summaries extended through Sprint 5H + - deferred-scope language narrowed to compile-path semantic use, hybrid retrieval, and reranking ## criteria missed @@ -23,26 +37,26 @@ PASS ## quality issues -- No blocking implementation or documentation quality issues remain. +- No blocking implementation, test, or documentation issues remain. ## regression risks -- Low. The only follow-up changes in this pass were documentation and rules updates, and they do not affect runtime behavior. +- Low runtime risk. The feature is additive, isolated, and backed by full unit and integration suite passes. +- Low review risk on the follow-up because the only additional change after the prior review was documentation in `ARCHITECTURE.md`. ## docs issues -- None blocking. -- Provenance note: [.ai/active/SPRINT_PACKET.md](/Users/samirusani/Desktop/Codex/AliceBot/.ai/active/SPRINT_PACKET.md) is still modified in the worktree relative to the repo base, but the current contents match the Sprint 5G assignment being reviewed, and [RULES.md](/Users/samirusani/Desktop/Codex/AliceBot/RULES.md#L6) now codifies immutability for future implementation turns. +- None remaining for Sprint 5H. +- Note: no tests were rerun for the docs-only follow-up, which is appropriate because the code under review did not change after the previously verified green test runs. ## should anything be added to RULES.md? -- No. The needed control-artifact rule has been added. +- No. ## should anything update ARCHITECTURE.md? -- No. The needed Sprint 5G updates are present. +- No. The previously identified architecture drift has been corrected. ## recommended next action -1. Treat the sprint as review-passed. -2. If desired, keep the new `SPRINT_PACKET.md` immutability rule as the standing process guard for future sprints. +1. Treat Sprint 5H as review-passed. diff --git a/apps/api/src/alicebot_api/contracts.py b/apps/api/src/alicebot_api/contracts.py index c624578..07fa214 100644 --- a/apps/api/src/alicebot_api/contracts.py +++ b/apps/api/src/alicebot_api/contracts.py @@ -149,6 +149,12 @@ "sequence_no_asc", "id_asc", ] +TASK_ARTIFACT_CHUNK_SEMANTIC_RETRIEVAL_ORDER = [ + "score_desc", + "relative_path_asc", + "sequence_no_asc", + "id_asc", +] TASK_STEP_LIST_ORDER = ["sequence_no_asc", "created_at_asc", "id_asc"] TOOL_EXECUTION_LIST_ORDER = ["executed_at_asc", "id_asc"] EXECUTION_BUDGET_LIST_ORDER = ["created_at_asc", "id_asc"] @@ -1736,6 +1742,38 @@ class ArtifactScopedArtifactChunkRetrievalInput: query: str +@dataclass(frozen=True, slots=True) +class TaskScopedSemanticArtifactChunkRetrievalInput: + task_id: UUID + embedding_config_id: UUID + query_vector: tuple[float, ...] + limit: int = DEFAULT_ARTIFACT_CHUNK_RETRIEVAL_LIMIT + + def as_payload(self) -> JsonObject: + return { + "task_id": str(self.task_id), + "embedding_config_id": str(self.embedding_config_id), + "query_vector": [float(value) for value in self.query_vector], + "limit": self.limit, + } + + +@dataclass(frozen=True, slots=True) +class ArtifactScopedSemanticArtifactChunkRetrievalInput: + task_artifact_id: UUID + embedding_config_id: UUID + query_vector: tuple[float, ...] + limit: int = DEFAULT_ARTIFACT_CHUNK_RETRIEVAL_LIMIT + + def as_payload(self) -> JsonObject: + return { + "task_artifact_id": str(self.task_artifact_id), + "embedding_config_id": str(self.embedding_config_id), + "query_vector": [float(value) for value in self.query_vector], + "limit": self.limit, + } + + class TaskArtifactRecord(TypedDict): id: str task_id: str @@ -1873,6 +1911,35 @@ class TaskArtifactChunkRetrievalResponse(TypedDict): summary: TaskArtifactChunkRetrievalSummary +class TaskArtifactChunkSemanticRetrievalItem(TypedDict): + id: str + task_id: str + task_artifact_id: str + relative_path: str + media_type: str + sequence_no: int + char_start: int + char_end_exclusive: int + text: str + score: float + + +class TaskArtifactChunkSemanticRetrievalSummary(TypedDict): + embedding_config_id: str + query_vector_dimensions: int + limit: int + returned_count: int + searched_artifact_count: int + similarity_metric: Literal["cosine_similarity"] + order: list[str] + scope: TaskArtifactChunkRetrievalScope + + +class TaskArtifactChunkSemanticRetrievalResponse(TypedDict): + items: list[TaskArtifactChunkSemanticRetrievalItem] + summary: TaskArtifactChunkSemanticRetrievalSummary + + class TaskStepTraceLink(TypedDict): trace_id: str trace_kind: str diff --git a/apps/api/src/alicebot_api/main.py b/apps/api/src/alicebot_api/main.py index 0106917..fb487e6 100644 --- a/apps/api/src/alicebot_api/main.py +++ b/apps/api/src/alicebot_api/main.py @@ -15,6 +15,7 @@ ApprovalApproveInput, ApprovalRejectInput, ApprovalRequestCreateInput, + ArtifactScopedSemanticArtifactChunkRetrievalInput, CompileContextArtifactScopedArtifactRetrievalInput, CompileContextTaskScopedArtifactRetrievalInput, ConsentStatus, @@ -58,6 +59,7 @@ ProxyExecutionRequestInput, TaskArtifactIngestInput, TaskArtifactRegisterInput, + TaskScopedSemanticArtifactChunkRetrievalInput, TaskScopedArtifactChunkRetrievalInput, TaskStepKind, TaskStepLineageInput, @@ -197,8 +199,11 @@ route_tool_invocation, ) from alicebot_api.semantic_retrieval import ( + SemanticArtifactChunkRetrievalValidationError, SemanticMemoryRetrievalValidationError, + retrieve_artifact_scoped_semantic_artifact_chunk_records, retrieve_semantic_memory_records, + retrieve_task_scoped_semantic_artifact_chunk_records, ) from alicebot_api.response_generation import ( ResponseFailure, @@ -380,6 +385,17 @@ class RetrieveSemanticMemoriesRequest(BaseModel): ) +class RetrieveSemanticArtifactChunksRequest(BaseModel): + user_id: UUID + embedding_config_id: UUID + query_vector: list[float] = Field(min_length=1, max_length=20000) + limit: int = Field( + default=DEFAULT_ARTIFACT_CHUNK_RETRIEVAL_LIMIT, + ge=1, + le=MAX_ARTIFACT_CHUNK_RETRIEVAL_LIMIT, + ) + + class UpsertConsentRequest(BaseModel): user_id: UUID consent_key: str = Field(min_length=1, max_length=200) @@ -1447,6 +1463,66 @@ def retrieve_task_artifact_chunks_for_artifact( ) +@app.post("/v0/tasks/{task_id}/artifact-chunks/semantic-retrieval") +def retrieve_semantic_task_artifact_chunks( + task_id: UUID, + request: RetrieveSemanticArtifactChunksRequest, +) -> JSONResponse: + settings = get_settings() + + try: + with user_connection(settings.database_url, request.user_id) as conn: + payload = retrieve_task_scoped_semantic_artifact_chunk_records( + ContinuityStore(conn), + user_id=request.user_id, + request=TaskScopedSemanticArtifactChunkRetrievalInput( + task_id=task_id, + embedding_config_id=request.embedding_config_id, + query_vector=tuple(request.query_vector), + limit=request.limit, + ), + ) + except TaskNotFoundError as exc: + return JSONResponse(status_code=404, content={"detail": str(exc)}) + except SemanticArtifactChunkRetrievalValidationError as exc: + return JSONResponse(status_code=400, content={"detail": str(exc)}) + + return JSONResponse( + status_code=200, + content=jsonable_encoder(payload), + ) + + +@app.post("/v0/task-artifacts/{task_artifact_id}/chunks/semantic-retrieval") +def retrieve_semantic_artifact_chunks_for_artifact( + task_artifact_id: UUID, + request: RetrieveSemanticArtifactChunksRequest, +) -> JSONResponse: + settings = get_settings() + + try: + with user_connection(settings.database_url, request.user_id) as conn: + payload = retrieve_artifact_scoped_semantic_artifact_chunk_records( + ContinuityStore(conn), + user_id=request.user_id, + request=ArtifactScopedSemanticArtifactChunkRetrievalInput( + task_artifact_id=task_artifact_id, + embedding_config_id=request.embedding_config_id, + query_vector=tuple(request.query_vector), + limit=request.limit, + ), + ) + except TaskArtifactNotFoundError as exc: + return JSONResponse(status_code=404, content={"detail": str(exc)}) + except SemanticArtifactChunkRetrievalValidationError as exc: + return JSONResponse(status_code=400, content={"detail": str(exc)}) + + return JSONResponse( + status_code=200, + content=jsonable_encoder(payload), + ) + + @app.post("/v0/tasks/{task_id}/steps") def create_next_task_step(task_id: UUID, request: CreateNextTaskStepRequest) -> JSONResponse: settings = get_settings() diff --git a/apps/api/src/alicebot_api/semantic_retrieval.py b/apps/api/src/alicebot_api/semantic_retrieval.py index 5384e3d..4fe066d 100644 --- a/apps/api/src/alicebot_api/semantic_retrieval.py +++ b/apps/api/src/alicebot_api/semantic_retrieval.py @@ -1,25 +1,56 @@ from __future__ import annotations import math +from pathlib import Path +from typing import cast from uuid import UUID +from alicebot_api.artifacts import TaskArtifactNotFoundError from alicebot_api.contracts import ( SEMANTIC_MEMORY_RETRIEVAL_ORDER, + TASK_ARTIFACT_CHUNK_SEMANTIC_RETRIEVAL_ORDER, + ArtifactScopedSemanticArtifactChunkRetrievalInput, SemanticMemoryRetrievalRequestInput, SemanticMemoryRetrievalResponse, SemanticMemoryRetrievalResultItem, SemanticMemoryRetrievalSummary, + TaskArtifactChunkRetrievalScope, + TaskArtifactChunkRetrievalScopeKind, + TaskArtifactChunkSemanticRetrievalItem, + TaskArtifactChunkSemanticRetrievalResponse, + TaskArtifactChunkSemanticRetrievalSummary, + TaskScopedSemanticArtifactChunkRetrievalInput, ) -from alicebot_api.store import ContinuityStore, SemanticMemoryRetrievalRow +from alicebot_api.store import ( + ContinuityStore, + SemanticMemoryRetrievalRow, + TaskArtifactChunkSemanticRetrievalRow, +) +from alicebot_api.tasks import TaskNotFoundError + +SUPPORTED_TEXT_ARTIFACT_EXTENSIONS = { + ".txt": "text/plain", + ".text": "text/plain", + ".md": "text/markdown", + ".markdown": "text/markdown", +} class SemanticMemoryRetrievalValidationError(ValueError): """Raised when semantic memory retrieval fails explicit validation.""" -def _validate_query_vector(query_vector: tuple[float, ...]) -> list[float]: +class SemanticArtifactChunkRetrievalValidationError(ValueError): + """Raised when semantic artifact chunk retrieval fails explicit validation.""" + + +def _validate_query_vector( + query_vector: tuple[float, ...], + *, + error_type: type[ValueError], +) -> list[float]: if not query_vector: - raise SemanticMemoryRetrievalValidationError( + raise error_type( "query_vector must include at least one numeric value" ) @@ -27,7 +58,7 @@ def _validate_query_vector(query_vector: tuple[float, ...]) -> list[float]: for value in query_vector: normalized_value = float(value) if not math.isfinite(normalized_value): - raise SemanticMemoryRetrievalValidationError( + raise error_type( "query_vector must contain only finite numeric values" ) normalized.append(normalized_value) @@ -35,26 +66,41 @@ def _validate_query_vector(query_vector: tuple[float, ...]) -> list[float]: return normalized -def validate_semantic_memory_retrieval_request( +def _validate_embedding_config_and_query_vector( store: ContinuityStore, *, - request: SemanticMemoryRetrievalRequestInput, + embedding_config_id: UUID, + query_vector: tuple[float, ...], + error_type: type[ValueError], ) -> tuple[dict[str, object], list[float]]: - config = store.get_embedding_config_optional(request.embedding_config_id) + config = store.get_embedding_config_optional(embedding_config_id) if config is None: - raise SemanticMemoryRetrievalValidationError( + raise error_type( "embedding_config_id must reference an existing embedding config owned by the user: " - f"{request.embedding_config_id}" + f"{embedding_config_id}" ) - query_vector = _validate_query_vector(request.query_vector) - if len(query_vector) != config["dimensions"]: - raise SemanticMemoryRetrievalValidationError( + normalized_query_vector = _validate_query_vector(query_vector, error_type=error_type) + if len(normalized_query_vector) != config["dimensions"]: + raise error_type( "query_vector length must match embedding config dimensions " - f"({config['dimensions']}): {len(query_vector)}" + f"({config['dimensions']}): {len(normalized_query_vector)}" ) - return config, query_vector + return config, normalized_query_vector + + +def validate_semantic_memory_retrieval_request( + store: ContinuityStore, + *, + request: SemanticMemoryRetrievalRequestInput, +) -> tuple[dict[str, object], list[float]]: + return _validate_embedding_config_and_query_vector( + store, + embedding_config_id=request.embedding_config_id, + query_vector=request.query_vector, + error_type=SemanticMemoryRetrievalValidationError, + ) def serialize_semantic_memory_result_item( @@ -76,6 +122,175 @@ def serialize_semantic_memory_result_item( } +def _infer_media_type(*, relative_path: str, media_type_hint: str | None) -> str: + if media_type_hint is not None: + return media_type_hint + return SUPPORTED_TEXT_ARTIFACT_EXTENSIONS.get(Path(relative_path).suffix.lower(), "unknown") + + +def _build_task_artifact_chunk_retrieval_scope( + *, + kind: str, + task_id: UUID, + task_artifact_id: UUID | None = None, +) -> TaskArtifactChunkRetrievalScope: + scope: TaskArtifactChunkRetrievalScope = { + "kind": cast(TaskArtifactChunkRetrievalScopeKind, kind), + "task_id": str(task_id), + } + if task_artifact_id is not None: + scope["task_artifact_id"] = str(task_artifact_id) + return scope + + +def _serialize_semantic_artifact_chunk_result_item( + row: TaskArtifactChunkSemanticRetrievalRow, +) -> TaskArtifactChunkSemanticRetrievalItem: + return { + "id": str(row["id"]), + "task_id": str(row["task_id"]), + "task_artifact_id": str(row["task_artifact_id"]), + "relative_path": row["relative_path"], + "media_type": _infer_media_type( + relative_path=row["relative_path"], + media_type_hint=row["media_type_hint"], + ), + "sequence_no": row["sequence_no"], + "char_start": row["char_start"], + "char_end_exclusive": row["char_end_exclusive"], + "text": row["text"], + "score": float(row["score"]), + } + + +def validate_semantic_artifact_chunk_retrieval_request( + store: ContinuityStore, + *, + embedding_config_id: UUID, + query_vector: tuple[float, ...], +) -> tuple[dict[str, object], list[float]]: + return _validate_embedding_config_and_query_vector( + store, + embedding_config_id=embedding_config_id, + query_vector=query_vector, + error_type=SemanticArtifactChunkRetrievalValidationError, + ) + + +def _count_ingested_artifacts(artifact_rows: list[dict[str, object]]) -> int: + return sum(1 for artifact_row in artifact_rows if artifact_row["ingestion_status"] == "ingested") + + +def _build_semantic_artifact_chunk_summary( + *, + embedding_config_id: UUID, + query_vector_dimensions: int, + limit: int, + searched_artifact_count: int, + scope: TaskArtifactChunkRetrievalScope, + items: list[TaskArtifactChunkSemanticRetrievalItem], +) -> TaskArtifactChunkSemanticRetrievalSummary: + return { + "embedding_config_id": str(embedding_config_id), + "query_vector_dimensions": query_vector_dimensions, + "limit": limit, + "returned_count": len(items), + "searched_artifact_count": searched_artifact_count, + "similarity_metric": "cosine_similarity", + "order": list(TASK_ARTIFACT_CHUNK_SEMANTIC_RETRIEVAL_ORDER), + "scope": scope, + } + + +def retrieve_task_scoped_semantic_artifact_chunk_records( + store: ContinuityStore, + *, + user_id: UUID, + request: TaskScopedSemanticArtifactChunkRetrievalInput, +) -> TaskArtifactChunkSemanticRetrievalResponse: + del user_id + + task = store.get_task_optional(request.task_id) + if task is None: + raise TaskNotFoundError(f"task {request.task_id} was not found") + + _config, query_vector = validate_semantic_artifact_chunk_retrieval_request( + store, + embedding_config_id=request.embedding_config_id, + query_vector=request.query_vector, + ) + items = [ + _serialize_semantic_artifact_chunk_result_item(row) + for row in store.retrieve_task_scoped_semantic_artifact_chunk_matches( + task_id=request.task_id, + embedding_config_id=request.embedding_config_id, + query_vector=query_vector, + limit=request.limit, + ) + ] + artifact_rows = store.list_task_artifacts_for_task(request.task_id) + scope = _build_task_artifact_chunk_retrieval_scope( + kind="task", + task_id=request.task_id, + ) + return { + "items": items, + "summary": _build_semantic_artifact_chunk_summary( + embedding_config_id=request.embedding_config_id, + query_vector_dimensions=len(query_vector), + limit=request.limit, + searched_artifact_count=_count_ingested_artifacts(artifact_rows), + scope=scope, + items=items, + ), + } + + +def retrieve_artifact_scoped_semantic_artifact_chunk_records( + store: ContinuityStore, + *, + user_id: UUID, + request: ArtifactScopedSemanticArtifactChunkRetrievalInput, +) -> TaskArtifactChunkSemanticRetrievalResponse: + del user_id + + artifact_row = store.get_task_artifact_optional(request.task_artifact_id) + if artifact_row is None: + raise TaskArtifactNotFoundError(f"task artifact {request.task_artifact_id} was not found") + + _config, query_vector = validate_semantic_artifact_chunk_retrieval_request( + store, + embedding_config_id=request.embedding_config_id, + query_vector=request.query_vector, + ) + items = [ + _serialize_semantic_artifact_chunk_result_item(row) + for row in store.retrieve_artifact_scoped_semantic_artifact_chunk_matches( + task_artifact_id=request.task_artifact_id, + embedding_config_id=request.embedding_config_id, + query_vector=query_vector, + limit=request.limit, + ) + ] + scope = _build_task_artifact_chunk_retrieval_scope( + kind="artifact", + task_id=artifact_row["task_id"], + task_artifact_id=artifact_row["id"], + ) + searched_artifact_count = 1 if artifact_row["ingestion_status"] == "ingested" else 0 + return { + "items": items, + "summary": _build_semantic_artifact_chunk_summary( + embedding_config_id=request.embedding_config_id, + query_vector_dimensions=len(query_vector), + limit=request.limit, + searched_artifact_count=searched_artifact_count, + scope=scope, + items=items, + ), + } + + def retrieve_semantic_memory_records( store: ContinuityStore, *, diff --git a/apps/api/src/alicebot_api/store.py b/apps/api/src/alicebot_api/store.py index 206d168..c2f0771 100644 --- a/apps/api/src/alicebot_api/store.py +++ b/apps/api/src/alicebot_api/store.py @@ -283,6 +283,23 @@ class TaskArtifactChunkEmbeddingRow(TypedDict): updated_at: datetime +class TaskArtifactChunkSemanticRetrievalRow(TypedDict): + id: UUID + user_id: UUID + task_id: UUID + task_artifact_id: UUID + relative_path: str + media_type_hint: str | None + sequence_no: int + char_start: int + char_end_exclusive: int + text: str + created_at: datetime + updated_at: datetime + embedding_config_id: UUID + score: float + + class TaskStepRow(TypedDict): id: UUID user_id: UUID @@ -806,6 +823,72 @@ class LabelCountRow(TypedDict): LIMIT %s """ +RETRIEVE_TASK_SCOPED_SEMANTIC_ARTIFACT_CHUNK_MATCHES_SQL = """ + SELECT + chunks.id, + chunks.user_id, + artifacts.task_id, + artifacts.id AS task_artifact_id, + artifacts.relative_path, + artifacts.media_type_hint, + chunks.sequence_no, + chunks.char_start, + chunks.char_end_exclusive, + chunks.text, + chunks.created_at, + chunks.updated_at, + embeddings.embedding_config_id, + 1 - ( + replace(embeddings.vector::text, ' ', '')::vector <=> %s::vector + ) AS score + FROM task_artifact_chunk_embeddings AS embeddings + JOIN task_artifact_chunks AS chunks + ON chunks.id = embeddings.task_artifact_chunk_id + AND chunks.user_id = embeddings.user_id + JOIN task_artifacts AS artifacts + ON artifacts.id = chunks.task_artifact_id + AND artifacts.user_id = chunks.user_id + WHERE embeddings.embedding_config_id = %s + AND embeddings.dimensions = %s + AND artifacts.task_id = %s + AND artifacts.ingestion_status = 'ingested' + ORDER BY score DESC, artifacts.relative_path ASC, chunks.sequence_no ASC, chunks.id ASC + LIMIT %s + """ + +RETRIEVE_ARTIFACT_SCOPED_SEMANTIC_ARTIFACT_CHUNK_MATCHES_SQL = """ + SELECT + chunks.id, + chunks.user_id, + artifacts.task_id, + artifacts.id AS task_artifact_id, + artifacts.relative_path, + artifacts.media_type_hint, + chunks.sequence_no, + chunks.char_start, + chunks.char_end_exclusive, + chunks.text, + chunks.created_at, + chunks.updated_at, + embeddings.embedding_config_id, + 1 - ( + replace(embeddings.vector::text, ' ', '')::vector <=> %s::vector + ) AS score + FROM task_artifact_chunk_embeddings AS embeddings + JOIN task_artifact_chunks AS chunks + ON chunks.id = embeddings.task_artifact_chunk_id + AND chunks.user_id = embeddings.user_id + JOIN task_artifacts AS artifacts + ON artifacts.id = chunks.task_artifact_id + AND artifacts.user_id = chunks.user_id + WHERE embeddings.embedding_config_id = %s + AND embeddings.dimensions = %s + AND artifacts.id = %s + AND artifacts.ingestion_status = 'ingested' + ORDER BY score DESC, artifacts.relative_path ASC, chunks.sequence_no ASC, chunks.id ASC + LIMIT %s + """ + INSERT_ENTITY_SQL = """ INSERT INTO entities (user_id, entity_type, name, source_memory_ids, created_at) VALUES (app.current_user_id(), %s, %s, %s, clock_timestamp()) @@ -2579,6 +2662,44 @@ def retrieve_semantic_memory_matches( ), ) + def retrieve_task_scoped_semantic_artifact_chunk_matches( + self, + *, + task_id: UUID, + embedding_config_id: UUID, + query_vector: list[float], + limit: int, + ) -> list[TaskArtifactChunkSemanticRetrievalRow]: + return self._fetch_all( + RETRIEVE_TASK_SCOPED_SEMANTIC_ARTIFACT_CHUNK_MATCHES_SQL, + ( + self._vector_literal(query_vector), + embedding_config_id, + len(query_vector), + task_id, + limit, + ), + ) + + def retrieve_artifact_scoped_semantic_artifact_chunk_matches( + self, + *, + task_artifact_id: UUID, + embedding_config_id: UUID, + query_vector: list[float], + limit: int, + ) -> list[TaskArtifactChunkSemanticRetrievalRow]: + return self._fetch_all( + RETRIEVE_ARTIFACT_SCOPED_SEMANTIC_ARTIFACT_CHUNK_MATCHES_SQL, + ( + self._vector_literal(query_vector), + embedding_config_id, + len(query_vector), + task_artifact_id, + limit, + ), + ) + def create_entity( self, *, diff --git a/tests/integration/test_semantic_artifact_chunk_retrieval_api.py b/tests/integration/test_semantic_artifact_chunk_retrieval_api.py new file mode 100644 index 0000000..7ee8cbd --- /dev/null +++ b/tests/integration/test_semantic_artifact_chunk_retrieval_api.py @@ -0,0 +1,569 @@ +from __future__ import annotations + +import json +from typing import Any +from urllib.parse import urlencode +from uuid import UUID, uuid4 + +import anyio +import pytest + +import apps.api.src.alicebot_api.main as main_module +from apps.api.src.alicebot_api.config import Settings +from alicebot_api.db import user_connection +from alicebot_api.store import ContinuityStore + + +def invoke_request( + method: str, + path: str, + *, + query_params: dict[str, str] | None = None, + payload: dict[str, Any] | None = None, +) -> tuple[int, dict[str, Any]]: + messages: list[dict[str, object]] = [] + encoded_body = b"" if payload is None else json.dumps(payload).encode() + request_received = False + + async def receive() -> dict[str, object]: + nonlocal request_received + if request_received: + return {"type": "http.disconnect"} + + request_received = True + return {"type": "http.request", "body": encoded_body, "more_body": False} + + async def send(message: dict[str, object]) -> None: + messages.append(message) + + query_string = urlencode(query_params or {}).encode() + scope = { + "type": "http", + "asgi": {"version": "3.0"}, + "http_version": "1.1", + "method": method, + "scheme": "http", + "path": path, + "raw_path": path.encode(), + "query_string": query_string, + "headers": [(b"content-type", b"application/json")], + "client": ("testclient", 50000), + "server": ("testserver", 80), + "root_path": "", + } + + anyio.run(main_module.app, scope, receive, send) + + start_message = next(message for message in messages if message["type"] == "http.response.start") + body = b"".join( + message.get("body", b"") + for message in messages + if message["type"] == "http.response.body" + ) + return start_message["status"], json.loads(body) + + +def seed_task_with_workspace(database_url: str, *, email: str) -> dict[str, UUID]: + user_id = uuid4() + + with user_connection(database_url, user_id) as conn: + store = ContinuityStore(conn) + store.create_user(user_id, email, email.split("@", 1)[0].title()) + thread = store.create_thread("Semantic artifact retrieval thread") + tool = store.create_tool( + tool_key="proxy.echo", + name="Proxy Echo", + description="Deterministic proxy handler.", + version="1.0.0", + metadata_version="tool_metadata_v0", + active=True, + tags=["proxy"], + action_hints=["tool.run"], + scope_hints=["workspace"], + domain_hints=[], + risk_hints=[], + metadata={"transport": "proxy"}, + ) + task = store.create_task( + thread_id=thread["id"], + tool_id=tool["id"], + status="approved", + request={ + "thread_id": str(thread["id"]), + "tool_id": str(tool["id"]), + "action": "tool.run", + "scope": "workspace", + "domain_hint": None, + "risk_hint": None, + "attributes": {}, + }, + tool={ + "id": str(tool["id"]), + "tool_key": "proxy.echo", + "name": "Proxy Echo", + "description": "Deterministic proxy handler.", + "version": "1.0.0", + "metadata_version": "tool_metadata_v0", + "active": True, + "tags": ["proxy"], + "action_hints": ["tool.run"], + "scope_hints": ["workspace"], + "domain_hints": [], + "risk_hints": [], + "metadata": {"transport": "proxy"}, + "created_at": tool["created_at"].isoformat(), + }, + latest_approval_id=None, + latest_execution_id=None, + ) + workspace = store.create_task_workspace( + task_id=task["id"], + status="active", + local_path=f"/tmp/task-workspaces/{user_id}/{task['id']}", + ) + + return { + "user_id": user_id, + "task_id": task["id"], + "task_workspace_id": workspace["id"], + } + + +def seed_embedding_config( + database_url: str, + *, + user_id: UUID, + provider: str, + model: str, + version: str, + dimensions: int, +) -> UUID: + with user_connection(database_url, user_id) as conn: + created = ContinuityStore(conn).create_embedding_config( + provider=provider, + model=model, + version=version, + dimensions=dimensions, + status="active", + metadata={"task": "semantic_artifact_chunk_retrieval"}, + ) + return created["id"] + + +def create_artifact_with_chunk_embeddings( + database_url: str, + *, + user_id: UUID, + task_id: UUID, + task_workspace_id: UUID, + embedding_config_id: UUID | None, + relative_path: str, + chunks: list[tuple[str, list[float] | None]], + ingestion_status: str = "ingested", + media_type_hint: str | None = "text/plain", +) -> dict[str, object]: + with user_connection(database_url, user_id) as conn: + store = ContinuityStore(conn) + artifact = store.create_task_artifact( + task_id=task_id, + task_workspace_id=task_workspace_id, + status="registered", + ingestion_status=ingestion_status, + relative_path=relative_path, + media_type_hint=media_type_hint, + ) + created_chunks: list[dict[str, object]] = [] + char_start = 0 + for sequence_no, (text, vector) in enumerate(chunks, start=1): + chunk = store.create_task_artifact_chunk( + task_artifact_id=artifact["id"], + sequence_no=sequence_no, + char_start=char_start, + char_end_exclusive=char_start + len(text), + text=text, + ) + char_start += len(text) + created_chunks.append(chunk) + if embedding_config_id is not None and vector is not None: + store.create_task_artifact_chunk_embedding( + task_artifact_chunk_id=chunk["id"], + embedding_config_id=embedding_config_id, + dimensions=len(vector), + vector=vector, + ) + + return { + "artifact_id": artifact["id"], + "chunk_ids": [chunk["id"] for chunk in created_chunks], + } + + +def test_semantic_artifact_chunk_retrieval_endpoints_return_deterministic_task_and_artifact_results( + migrated_database_urls, + monkeypatch, +) -> None: + owner = seed_task_with_workspace(migrated_database_urls["app"], email="owner@example.com") + config_id = seed_embedding_config( + migrated_database_urls["app"], + user_id=owner["user_id"], + provider="openai", + model="text-embedding-3-large", + version="2026-03-15", + dimensions=3, + ) + docs = create_artifact_with_chunk_embeddings( + migrated_database_urls["app"], + user_id=owner["user_id"], + task_id=owner["task_id"], + task_workspace_id=owner["task_workspace_id"], + embedding_config_id=config_id, + relative_path="docs/a.txt", + chunks=[("alpha doc", [1.0, 0.0, 0.0])], + media_type_hint="text/plain", + ) + notes = create_artifact_with_chunk_embeddings( + migrated_database_urls["app"], + user_id=owner["user_id"], + task_id=owner["task_id"], + task_workspace_id=owner["task_workspace_id"], + embedding_config_id=config_id, + relative_path="notes/b.md", + chunks=[("alpha note", [1.0, 0.0, 0.0])], + media_type_hint="text/markdown", + ) + weak = create_artifact_with_chunk_embeddings( + migrated_database_urls["app"], + user_id=owner["user_id"], + task_id=owner["task_id"], + task_workspace_id=owner["task_workspace_id"], + embedding_config_id=config_id, + relative_path="notes/c.txt", + chunks=[("beta weak", [0.0, 1.0, 0.0])], + media_type_hint="text/plain", + ) + pending = create_artifact_with_chunk_embeddings( + migrated_database_urls["app"], + user_id=owner["user_id"], + task_id=owner["task_id"], + task_workspace_id=owner["task_workspace_id"], + embedding_config_id=config_id, + relative_path="notes/pending.txt", + chunks=[("hidden pending", [1.0, 0.0, 0.0])], + ingestion_status="pending", + media_type_hint="text/plain", + ) + monkeypatch.setattr( + main_module, + "get_settings", + lambda: Settings(database_url=migrated_database_urls["app"]), + ) + + task_status, task_payload = invoke_request( + "POST", + f"/v0/tasks/{owner['task_id']}/artifact-chunks/semantic-retrieval", + payload={ + "user_id": str(owner["user_id"]), + "embedding_config_id": str(config_id), + "query_vector": [1.0, 0.0, 0.0], + "limit": 10, + }, + ) + artifact_status, artifact_payload = invoke_request( + "POST", + f"/v0/task-artifacts/{notes['artifact_id']}/chunks/semantic-retrieval", + payload={ + "user_id": str(owner["user_id"]), + "embedding_config_id": str(config_id), + "query_vector": [1.0, 0.0, 0.0], + "limit": 10, + }, + ) + + assert task_status == 200 + assert task_payload["summary"] == { + "embedding_config_id": str(config_id), + "query_vector_dimensions": 3, + "limit": 10, + "returned_count": 3, + "searched_artifact_count": 3, + "similarity_metric": "cosine_similarity", + "order": ["score_desc", "relative_path_asc", "sequence_no_asc", "id_asc"], + "scope": {"kind": "task", "task_id": str(owner["task_id"])}, + } + assert [item["id"] for item in task_payload["items"]] == [ + str(docs["chunk_ids"][0]), + str(notes["chunk_ids"][0]), + str(weak["chunk_ids"][0]), + ] + assert str(pending["chunk_ids"][0]) not in {item["id"] for item in task_payload["items"]} + assert task_payload["items"][0]["relative_path"] == "docs/a.txt" + assert task_payload["items"][1]["relative_path"] == "notes/b.md" + assert task_payload["items"][0]["score"] == pytest.approx(1.0) + assert task_payload["items"][1]["score"] == pytest.approx(1.0) + assert task_payload["items"][2]["score"] == pytest.approx(0.0) + assert set(task_payload["items"][0]) == { + "id", + "task_id", + "task_artifact_id", + "relative_path", + "media_type", + "sequence_no", + "char_start", + "char_end_exclusive", + "text", + "score", + } + + assert artifact_status == 200 + assert artifact_payload["summary"] == { + "embedding_config_id": str(config_id), + "query_vector_dimensions": 3, + "limit": 10, + "returned_count": 1, + "searched_artifact_count": 1, + "similarity_metric": "cosine_similarity", + "order": ["score_desc", "relative_path_asc", "sequence_no_asc", "id_asc"], + "scope": { + "kind": "artifact", + "task_id": str(owner["task_id"]), + "task_artifact_id": str(notes["artifact_id"]), + }, + } + assert artifact_payload["items"] == [ + { + "id": str(notes["chunk_ids"][0]), + "task_id": str(owner["task_id"]), + "task_artifact_id": str(notes["artifact_id"]), + "relative_path": "notes/b.md", + "media_type": "text/markdown", + "sequence_no": 1, + "char_start": 0, + "char_end_exclusive": len("alpha note"), + "text": "alpha note", + "score": artifact_payload["items"][0]["score"], + } + ] + assert artifact_payload["items"][0]["score"] == pytest.approx(1.0) + + +def test_semantic_artifact_chunk_retrieval_rejects_invalid_config_dimension_mismatch_and_cross_user_scope( + migrated_database_urls, + monkeypatch, +) -> None: + owner = seed_task_with_workspace(migrated_database_urls["app"], email="owner@example.com") + intruder = seed_task_with_workspace(migrated_database_urls["app"], email="intruder@example.com") + owner_config_id = seed_embedding_config( + migrated_database_urls["app"], + user_id=owner["user_id"], + provider="openai", + model="text-embedding-3-large", + version="2026-03-15", + dimensions=3, + ) + intruder_config_id = seed_embedding_config( + migrated_database_urls["app"], + user_id=intruder["user_id"], + provider="openai", + model="text-embedding-3-large", + version="2026-03-15", + dimensions=3, + ) + owner_artifact = create_artifact_with_chunk_embeddings( + migrated_database_urls["app"], + user_id=owner["user_id"], + task_id=owner["task_id"], + task_workspace_id=owner["task_workspace_id"], + embedding_config_id=owner_config_id, + relative_path="docs/spec.txt", + chunks=[("owner chunk", [1.0, 0.0, 0.0])], + ) + create_artifact_with_chunk_embeddings( + migrated_database_urls["app"], + user_id=intruder["user_id"], + task_id=intruder["task_id"], + task_workspace_id=intruder["task_workspace_id"], + embedding_config_id=intruder_config_id, + relative_path="docs/intruder.txt", + chunks=[("intruder chunk", [1.0, 0.0, 0.0])], + ) + monkeypatch.setattr( + main_module, + "get_settings", + lambda: Settings(database_url=migrated_database_urls["app"]), + ) + + missing_status, missing_payload = invoke_request( + "POST", + f"/v0/tasks/{owner['task_id']}/artifact-chunks/semantic-retrieval", + payload={ + "user_id": str(owner["user_id"]), + "embedding_config_id": str(uuid4()), + "query_vector": [1.0, 0.0, 0.0], + "limit": 5, + }, + ) + mismatch_status, mismatch_payload = invoke_request( + "POST", + f"/v0/tasks/{owner['task_id']}/artifact-chunks/semantic-retrieval", + payload={ + "user_id": str(owner["user_id"]), + "embedding_config_id": str(owner_config_id), + "query_vector": [1.0, 0.0], + "limit": 5, + }, + ) + cross_user_task_status, cross_user_task_payload = invoke_request( + "POST", + f"/v0/tasks/{owner['task_id']}/artifact-chunks/semantic-retrieval", + payload={ + "user_id": str(intruder["user_id"]), + "embedding_config_id": str(intruder_config_id), + "query_vector": [1.0, 0.0, 0.0], + "limit": 5, + }, + ) + cross_user_artifact_status, cross_user_artifact_payload = invoke_request( + "POST", + f"/v0/task-artifacts/{owner_artifact['artifact_id']}/chunks/semantic-retrieval", + payload={ + "user_id": str(intruder["user_id"]), + "embedding_config_id": str(intruder_config_id), + "query_vector": [1.0, 0.0, 0.0], + "limit": 5, + }, + ) + cross_user_config_status, cross_user_config_payload = invoke_request( + "POST", + f"/v0/tasks/{owner['task_id']}/artifact-chunks/semantic-retrieval", + payload={ + "user_id": str(owner["user_id"]), + "embedding_config_id": str(intruder_config_id), + "query_vector": [1.0, 0.0, 0.0], + "limit": 5, + }, + ) + + assert missing_status == 400 + assert missing_payload["detail"].startswith( + "embedding_config_id must reference an existing embedding config owned by the user" + ) + assert mismatch_status == 400 + assert mismatch_payload["detail"] == "query_vector length must match embedding config dimensions (3): 2" + assert cross_user_task_status == 404 + assert cross_user_task_payload == {"detail": f"task {owner['task_id']} was not found"} + assert cross_user_artifact_status == 404 + assert cross_user_artifact_payload == { + "detail": f"task artifact {owner_artifact['artifact_id']} was not found" + } + assert cross_user_config_status == 400 + assert cross_user_config_payload["detail"] == ( + "embedding_config_id must reference an existing embedding config owned by the user: " + f"{intruder_config_id}" + ) + + +def test_semantic_artifact_chunk_retrieval_supports_empty_results_and_per_user_isolation( + migrated_database_urls, + monkeypatch, +) -> None: + owner = seed_task_with_workspace(migrated_database_urls["app"], email="owner@example.com") + intruder = seed_task_with_workspace(migrated_database_urls["app"], email="intruder@example.com") + owner_config_id = seed_embedding_config( + migrated_database_urls["app"], + user_id=owner["user_id"], + provider="openai", + model="text-embedding-3-large", + version="2026-03-15", + dimensions=3, + ) + owner_empty_config_id = seed_embedding_config( + migrated_database_urls["app"], + user_id=owner["user_id"], + provider="openai", + model="text-embedding-3-small", + version="2026-03-15", + dimensions=3, + ) + intruder_config_id = seed_embedding_config( + migrated_database_urls["app"], + user_id=intruder["user_id"], + provider="openai", + model="text-embedding-3-large", + version="2026-03-15", + dimensions=3, + ) + owner_artifact = create_artifact_with_chunk_embeddings( + migrated_database_urls["app"], + user_id=owner["user_id"], + task_id=owner["task_id"], + task_workspace_id=owner["task_workspace_id"], + embedding_config_id=owner_config_id, + relative_path="docs/owner.txt", + chunks=[("owner semantic", [1.0, 0.0, 0.0])], + ) + intruder_artifact = create_artifact_with_chunk_embeddings( + migrated_database_urls["app"], + user_id=intruder["user_id"], + task_id=intruder["task_id"], + task_workspace_id=intruder["task_workspace_id"], + embedding_config_id=intruder_config_id, + relative_path="docs/intruder.txt", + chunks=[("intruder semantic", [1.0, 0.0, 0.0])], + ) + monkeypatch.setattr( + main_module, + "get_settings", + lambda: Settings(database_url=migrated_database_urls["app"]), + ) + + owner_status, owner_payload = invoke_request( + "POST", + f"/v0/tasks/{owner['task_id']}/artifact-chunks/semantic-retrieval", + payload={ + "user_id": str(owner["user_id"]), + "embedding_config_id": str(owner_config_id), + "query_vector": [1.0, 0.0, 0.0], + "limit": 5, + }, + ) + intruder_status, intruder_payload = invoke_request( + "POST", + f"/v0/tasks/{intruder['task_id']}/artifact-chunks/semantic-retrieval", + payload={ + "user_id": str(intruder["user_id"]), + "embedding_config_id": str(intruder_config_id), + "query_vector": [1.0, 0.0, 0.0], + "limit": 5, + }, + ) + empty_status, empty_payload = invoke_request( + "POST", + f"/v0/tasks/{owner['task_id']}/artifact-chunks/semantic-retrieval", + payload={ + "user_id": str(owner["user_id"]), + "embedding_config_id": str(owner_empty_config_id), + "query_vector": [1.0, 0.0, 0.0], + "limit": 5, + }, + ) + + assert owner_status == 200 + assert [item["id"] for item in owner_payload["items"]] == [str(owner_artifact["chunk_ids"][0])] + assert intruder_status == 200 + assert [item["id"] for item in intruder_payload["items"]] == [ + str(intruder_artifact["chunk_ids"][0]) + ] + assert empty_status == 200 + assert empty_payload == { + "items": [], + "summary": { + "embedding_config_id": str(owner_empty_config_id), + "query_vector_dimensions": 3, + "limit": 5, + "returned_count": 0, + "searched_artifact_count": 1, + "similarity_metric": "cosine_similarity", + "order": ["score_desc", "relative_path_asc", "sequence_no_asc", "id_asc"], + "scope": {"kind": "task", "task_id": str(owner["task_id"])}, + }, + } diff --git a/tests/unit/test_artifacts_main.py b/tests/unit/test_artifacts_main.py index a009b60..634d9b6 100644 --- a/tests/unit/test_artifacts_main.py +++ b/tests/unit/test_artifacts_main.py @@ -12,6 +12,7 @@ TaskArtifactNotFoundError, TaskArtifactValidationError, ) +from alicebot_api.semantic_retrieval import SemanticArtifactChunkRetrievalValidationError from alicebot_api.tasks import TaskNotFoundError from alicebot_api.workspaces import TaskWorkspaceNotFoundError @@ -229,6 +230,141 @@ def fake_retrieve_task_scoped_artifact_chunk_records(*_args, **_kwargs): } +def test_retrieve_semantic_task_artifact_chunks_endpoint_returns_payload(monkeypatch) -> None: + user_id = uuid4() + task_id = uuid4() + config_id = uuid4() + settings = Settings(database_url="postgresql://app") + + @contextmanager + def fake_user_connection(*_args, **_kwargs): + yield object() + + monkeypatch.setattr(main_module, "get_settings", lambda: settings) + monkeypatch.setattr(main_module, "user_connection", fake_user_connection) + monkeypatch.setattr( + main_module, + "retrieve_task_scoped_semantic_artifact_chunk_records", + lambda *_args, **_kwargs: { + "items": [], + "summary": { + "embedding_config_id": str(config_id), + "query_vector_dimensions": 3, + "limit": 5, + "returned_count": 0, + "searched_artifact_count": 1, + "similarity_metric": "cosine_similarity", + "order": ["score_desc", "relative_path_asc", "sequence_no_asc", "id_asc"], + "scope": {"kind": "task", "task_id": str(task_id)}, + }, + }, + ) + + response = main_module.retrieve_semantic_task_artifact_chunks( + task_id, + main_module.RetrieveSemanticArtifactChunksRequest( + user_id=user_id, + embedding_config_id=config_id, + query_vector=[1.0, 0.0, 0.0], + limit=5, + ), + ) + + assert response.status_code == 200 + assert json.loads(response.body) == { + "items": [], + "summary": { + "embedding_config_id": str(config_id), + "query_vector_dimensions": 3, + "limit": 5, + "returned_count": 0, + "searched_artifact_count": 1, + "similarity_metric": "cosine_similarity", + "order": ["score_desc", "relative_path_asc", "sequence_no_asc", "id_asc"], + "scope": {"kind": "task", "task_id": str(task_id)}, + }, + } + + +def test_retrieve_semantic_task_artifact_chunks_endpoint_maps_validation_to_400(monkeypatch) -> None: + user_id = uuid4() + task_id = uuid4() + config_id = uuid4() + settings = Settings(database_url="postgresql://app") + + @contextmanager + def fake_user_connection(*_args, **_kwargs): + yield object() + + def fake_retrieve_task_scoped_semantic_artifact_chunk_records(*_args, **_kwargs): + raise SemanticArtifactChunkRetrievalValidationError( + f"embedding_config_id must reference an existing embedding config owned by the user: {config_id}" + ) + + monkeypatch.setattr(main_module, "get_settings", lambda: settings) + monkeypatch.setattr(main_module, "user_connection", fake_user_connection) + monkeypatch.setattr( + main_module, + "retrieve_task_scoped_semantic_artifact_chunk_records", + fake_retrieve_task_scoped_semantic_artifact_chunk_records, + ) + + response = main_module.retrieve_semantic_task_artifact_chunks( + task_id, + main_module.RetrieveSemanticArtifactChunksRequest( + user_id=user_id, + embedding_config_id=config_id, + query_vector=[1.0, 0.0, 0.0], + limit=5, + ), + ) + + assert response.status_code == 400 + assert json.loads(response.body) == { + "detail": ( + "embedding_config_id must reference an existing embedding config owned by the user: " + f"{config_id}" + ) + } + + +def test_retrieve_semantic_artifact_chunk_endpoint_maps_not_found_to_404(monkeypatch) -> None: + user_id = uuid4() + task_artifact_id = uuid4() + config_id = uuid4() + settings = Settings(database_url="postgresql://app") + + @contextmanager + def fake_user_connection(*_args, **_kwargs): + yield object() + + def fake_retrieve_artifact_scoped_semantic_artifact_chunk_records(*_args, **_kwargs): + raise TaskArtifactNotFoundError(f"task artifact {task_artifact_id} was not found") + + monkeypatch.setattr(main_module, "get_settings", lambda: settings) + monkeypatch.setattr(main_module, "user_connection", fake_user_connection) + monkeypatch.setattr( + main_module, + "retrieve_artifact_scoped_semantic_artifact_chunk_records", + fake_retrieve_artifact_scoped_semantic_artifact_chunk_records, + ) + + response = main_module.retrieve_semantic_artifact_chunks_for_artifact( + task_artifact_id, + main_module.RetrieveSemanticArtifactChunksRequest( + user_id=user_id, + embedding_config_id=config_id, + query_vector=[1.0, 0.0, 0.0], + limit=5, + ), + ) + + assert response.status_code == 404 + assert json.loads(response.body) == { + "detail": f"task artifact {task_artifact_id} was not found" + } + + def test_retrieve_artifact_chunk_endpoint_maps_not_found_to_404(monkeypatch) -> None: user_id = uuid4() task_artifact_id = uuid4() diff --git a/tests/unit/test_main.py b/tests/unit/test_main.py index 9c7a926..0b3441d 100644 --- a/tests/unit/test_main.py +++ b/tests/unit/test_main.py @@ -139,6 +139,8 @@ def test_healthcheck_route_is_registered() -> None: assert "/v0/task-artifacts/{task_artifact_id}" in route_paths assert "/v0/task-artifacts/{task_artifact_id}/ingest" in route_paths assert "/v0/task-artifacts/{task_artifact_id}/chunks" in route_paths + assert "/v0/tasks/{task_id}/artifact-chunks/semantic-retrieval" in route_paths + assert "/v0/task-artifacts/{task_artifact_id}/chunks/semantic-retrieval" in route_paths assert "/v0/task-steps/{task_step_id}" in route_paths assert "/v0/task-steps/{task_step_id}/transition" in route_paths assert "/v0/entities/{entity_id}" in route_paths diff --git a/tests/unit/test_semantic_retrieval.py b/tests/unit/test_semantic_retrieval.py index 780b4e4..7f3b26f 100644 --- a/tests/unit/test_semantic_retrieval.py +++ b/tests/unit/test_semantic_retrieval.py @@ -5,11 +5,19 @@ import pytest -from alicebot_api.contracts import SemanticMemoryRetrievalRequestInput +from alicebot_api.contracts import ( + ArtifactScopedSemanticArtifactChunkRetrievalInput, + SemanticMemoryRetrievalRequestInput, + TaskScopedSemanticArtifactChunkRetrievalInput, +) from alicebot_api.semantic_retrieval import ( + SemanticArtifactChunkRetrievalValidationError, SemanticMemoryRetrievalValidationError, + retrieve_artifact_scoped_semantic_artifact_chunk_records, retrieve_semantic_memory_records, + retrieve_task_scoped_semantic_artifact_chunk_records, ) +from alicebot_api.tasks import TaskNotFoundError class SemanticRetrievalStoreStub: @@ -17,6 +25,10 @@ def __init__(self) -> None: self.base_time = datetime(2026, 3, 12, 9, 0, tzinfo=UTC) self.config_by_id: dict[UUID, dict[str, object]] = {} self.retrieval_rows: list[dict[str, object]] = [] + self.task_artifact_retrieval_rows: list[dict[str, object]] = [] + self.tasks: dict[UUID, dict[str, object]] = {} + self.artifacts_by_id: dict[UUID, dict[str, object]] = {} + self.artifacts_by_task_id: dict[UUID, list[dict[str, object]]] = {} self.last_query: dict[str, object] | None = None def get_embedding_config_optional(self, embedding_config_id: UUID) -> dict[str, object] | None: @@ -36,6 +48,49 @@ def retrieve_semantic_memory_matches( } return list(self.retrieval_rows[:limit]) + def get_task_optional(self, task_id: UUID) -> dict[str, object] | None: + return self.tasks.get(task_id) + + def get_task_artifact_optional(self, task_artifact_id: UUID) -> dict[str, object] | None: + return self.artifacts_by_id.get(task_artifact_id) + + def list_task_artifacts_for_task(self, task_id: UUID) -> list[dict[str, object]]: + return list(self.artifacts_by_task_id.get(task_id, [])) + + def retrieve_task_scoped_semantic_artifact_chunk_matches( + self, + *, + task_id: UUID, + embedding_config_id: UUID, + query_vector: list[float], + limit: int, + ) -> list[dict[str, object]]: + self.last_query = { + "scope": "task", + "task_id": task_id, + "embedding_config_id": embedding_config_id, + "query_vector": query_vector, + "limit": limit, + } + return list(self.task_artifact_retrieval_rows[:limit]) + + def retrieve_artifact_scoped_semantic_artifact_chunk_matches( + self, + *, + task_artifact_id: UUID, + embedding_config_id: UUID, + query_vector: list[float], + limit: int, + ) -> list[dict[str, object]]: + self.last_query = { + "scope": "artifact", + "task_artifact_id": task_artifact_id, + "embedding_config_id": embedding_config_id, + "query_vector": query_vector, + "limit": limit, + } + return list(self.task_artifact_retrieval_rows[:limit]) + def seed_config(store: SemanticRetrievalStoreStub, *, dimensions: int = 3) -> UUID: config_id = uuid4() @@ -67,6 +122,60 @@ def active_row( } +def seed_task(store: SemanticRetrievalStoreStub) -> UUID: + task_id = uuid4() + store.tasks[task_id] = {"id": task_id} + return task_id + + +def seed_artifact( + store: SemanticRetrievalStoreStub, + *, + task_id: UUID, + ingestion_status: str = "ingested", + relative_path: str = "docs/spec.txt", + media_type_hint: str | None = "text/plain", +) -> UUID: + task_artifact_id = uuid4() + artifact = { + "id": task_artifact_id, + "task_id": task_id, + "ingestion_status": ingestion_status, + "relative_path": relative_path, + "media_type_hint": media_type_hint, + } + store.artifacts_by_id[task_artifact_id] = artifact + store.artifacts_by_task_id.setdefault(task_id, []).append(artifact) + return task_artifact_id + + +def semantic_artifact_row( + store: SemanticRetrievalStoreStub, + *, + task_id: UUID, + task_artifact_id: UUID, + relative_path: str, + score: float, + sequence_no: int, +) -> dict[str, object]: + return { + "id": uuid4(), + "user_id": uuid4(), + "task_id": task_id, + "task_artifact_id": task_artifact_id, + "relative_path": relative_path, + "media_type_hint": "text/plain", + "sequence_no": sequence_no, + "char_start": 0, + "char_end_exclusive": 11, + "text": f"{relative_path}-chunk", + "created_at": store.base_time + timedelta(minutes=sequence_no), + "updated_at": store.base_time + timedelta(minutes=sequence_no + 1), + "embedding_config_id": uuid4(), + "score": score, + } + + def test_retrieve_semantic_memory_records_returns_stable_shape_and_summary() -> None: store = SemanticRetrievalStoreStub() config_id = seed_config(store, dimensions=3) @@ -174,3 +283,181 @@ def test_retrieve_semantic_memory_records_rejects_non_active_memory_rows() -> No query_vector=(0.1, 0.2, 0.3), ), ) + + +def test_retrieve_task_scoped_semantic_artifact_chunk_records_returns_stable_shape_and_summary() -> None: + store = SemanticRetrievalStoreStub() + config_id = seed_config(store, dimensions=3) + task_id = seed_task(store) + first_artifact_id = seed_artifact( + store, + task_id=task_id, + relative_path="docs/a.txt", + ) + second_artifact_id = seed_artifact( + store, + task_id=task_id, + relative_path="notes/b.txt", + ) + pending_artifact_id = seed_artifact( + store, + task_id=task_id, + ingestion_status="pending", + relative_path="notes/pending.txt", + ) + first_row = semantic_artifact_row( + store, + task_id=task_id, + task_artifact_id=first_artifact_id, + relative_path="docs/a.txt", + score=1.0, + sequence_no=1, + ) + second_row = semantic_artifact_row( + store, + task_id=task_id, + task_artifact_id=second_artifact_id, + relative_path="notes/b.txt", + score=0.25, + sequence_no=1, + ) + store.task_artifact_retrieval_rows = [first_row, second_row] + + payload = retrieve_task_scoped_semantic_artifact_chunk_records( + store, # type: ignore[arg-type] + user_id=uuid4(), + request=TaskScopedSemanticArtifactChunkRetrievalInput( + task_id=task_id, + embedding_config_id=config_id, + query_vector=(1.0, 0.0, 0.0), + limit=2, + ), + ) + + assert payload == { + "items": [ + { + "id": str(first_row["id"]), + "task_id": str(task_id), + "task_artifact_id": str(first_artifact_id), + "relative_path": "docs/a.txt", + "media_type": "text/plain", + "sequence_no": 1, + "char_start": 0, + "char_end_exclusive": 11, + "text": "docs/a.txt-chunk", + "score": 1.0, + }, + { + "id": str(second_row["id"]), + "task_id": str(task_id), + "task_artifact_id": str(second_artifact_id), + "relative_path": "notes/b.txt", + "media_type": "text/plain", + "sequence_no": 1, + "char_start": 0, + "char_end_exclusive": 11, + "text": "notes/b.txt-chunk", + "score": 0.25, + }, + ], + "summary": { + "embedding_config_id": str(config_id), + "query_vector_dimensions": 3, + "limit": 2, + "returned_count": 2, + "searched_artifact_count": 2, + "similarity_metric": "cosine_similarity", + "order": ["score_desc", "relative_path_asc", "sequence_no_asc", "id_asc"], + "scope": {"kind": "task", "task_id": str(task_id)}, + }, + } + assert pending_artifact_id in store.artifacts_by_id + assert store.last_query == { + "scope": "task", + "task_id": task_id, + "embedding_config_id": config_id, + "query_vector": [1.0, 0.0, 0.0], + "limit": 2, + } + + +def test_retrieve_task_scoped_semantic_artifact_chunk_records_rejects_missing_task_and_dimension_mismatch() -> None: + store = SemanticRetrievalStoreStub() + config_id = seed_config(store, dimensions=3) + + with pytest.raises(TaskNotFoundError, match="task .* was not found"): + retrieve_task_scoped_semantic_artifact_chunk_records( + store, # type: ignore[arg-type] + user_id=uuid4(), + request=TaskScopedSemanticArtifactChunkRetrievalInput( + task_id=uuid4(), + embedding_config_id=config_id, + query_vector=(1.0, 0.0, 0.0), + ), + ) + + task_id = seed_task(store) + seed_artifact(store, task_id=task_id) + with pytest.raises( + SemanticArtifactChunkRetrievalValidationError, + match="query_vector length must match embedding config dimensions \\(3\\): 2", + ): + retrieve_task_scoped_semantic_artifact_chunk_records( + store, # type: ignore[arg-type] + user_id=uuid4(), + request=TaskScopedSemanticArtifactChunkRetrievalInput( + task_id=task_id, + embedding_config_id=config_id, + query_vector=(1.0, 0.0), + ), + ) + + +def test_retrieve_artifact_scoped_semantic_artifact_chunk_records_returns_empty_for_pending_artifact() -> None: + store = SemanticRetrievalStoreStub() + config_id = seed_config(store, dimensions=3) + task_id = seed_task(store) + artifact_id = seed_artifact( + store, + task_id=task_id, + ingestion_status="pending", + relative_path="notes/pending.txt", + media_type_hint="text/markdown", + ) + + payload = retrieve_artifact_scoped_semantic_artifact_chunk_records( + store, # type: ignore[arg-type] + user_id=uuid4(), + request=ArtifactScopedSemanticArtifactChunkRetrievalInput( + task_artifact_id=artifact_id, + embedding_config_id=config_id, + query_vector=(0.0, 1.0, 0.0), + limit=5, + ), + ) + + assert payload == { + "items": [], + "summary": { + "embedding_config_id": str(config_id), + "query_vector_dimensions": 3, + "limit": 5, + "returned_count": 0, + "searched_artifact_count": 0, + "similarity_metric": "cosine_similarity", + "order": ["score_desc", "relative_path_asc", "sequence_no_asc", "id_asc"], + "scope": { + "kind": "artifact", + "task_id": str(task_id), + "task_artifact_id": str(artifact_id), + }, + }, + } + assert store.last_query == { + "scope": "artifact", + "task_artifact_id": artifact_id, + "embedding_config_id": config_id, + "query_vector": [0.0, 1.0, 0.0], + "limit": 5, + } diff --git a/tests/unit/test_task_artifact_chunk_embedding_store.py b/tests/unit/test_task_artifact_chunk_embedding_store.py index 227a191..08704fb 100644 --- a/tests/unit/test_task_artifact_chunk_embedding_store.py +++ b/tests/unit/test_task_artifact_chunk_embedding_store.py @@ -234,3 +234,85 @@ def test_task_artifact_chunk_embedding_store_optional_reads_return_none_when_row task_artifact_chunk_id=uuid4(), embedding_config_id=uuid4(), ) is None + + +def test_semantic_artifact_chunk_retrieval_store_methods_use_expected_queries() -> None: + task_id = uuid4() + task_artifact_id = uuid4() + task_artifact_chunk_id = uuid4() + embedding_config_id = uuid4() + created_at = datetime(2026, 3, 15, 9, 0, tzinfo=UTC) + cursor = RecordingCursor( + fetchone_results=[], + fetchall_results=[ + [ + { + "id": task_artifact_chunk_id, + "user_id": uuid4(), + "task_id": task_id, + "task_artifact_id": task_artifact_id, + "relative_path": "docs/spec.txt", + "media_type_hint": "text/plain", + "sequence_no": 1, + "char_start": 0, + "char_end_exclusive": 11, + "text": "alpha chunk", + "created_at": created_at, + "updated_at": created_at, + "embedding_config_id": embedding_config_id, + "score": 1.0, + } + ], + [ + { + "id": task_artifact_chunk_id, + "user_id": uuid4(), + "task_id": task_id, + "task_artifact_id": task_artifact_id, + "relative_path": "docs/spec.txt", + "media_type_hint": "text/plain", + "sequence_no": 1, + "char_start": 0, + "char_end_exclusive": 11, + "text": "alpha chunk", + "created_at": created_at, + "updated_at": created_at, + "embedding_config_id": embedding_config_id, + "score": 1.0, + } + ], + ], + ) + store = ContinuityStore(RecordingConnection(cursor)) + + task_rows = store.retrieve_task_scoped_semantic_artifact_chunk_matches( + task_id=task_id, + embedding_config_id=embedding_config_id, + query_vector=[1.0, 0.0, 0.0], + limit=5, + ) + artifact_rows = store.retrieve_artifact_scoped_semantic_artifact_chunk_matches( + task_artifact_id=task_artifact_id, + embedding_config_id=embedding_config_id, + query_vector=[1.0, 0.0, 0.0], + limit=3, + ) + + assert task_rows[0]["task_id"] == task_id + assert artifact_rows[0]["task_artifact_id"] == task_artifact_id + + task_query, task_params = cursor.executed[0] + assert "FROM task_artifact_chunk_embeddings AS embeddings" in task_query + assert "JOIN task_artifacts AS artifacts" in task_query + assert "artifacts.task_id = %s" in task_query + assert "artifacts.ingestion_status = 'ingested'" in task_query + assert "ORDER BY score DESC, artifacts.relative_path ASC, chunks.sequence_no ASC, chunks.id ASC" in task_query + assert task_params == ("[1.0,0.0,0.0]", embedding_config_id, 3, task_id, 5) + + artifact_query, artifact_params = cursor.executed[1] + assert "FROM task_artifact_chunk_embeddings AS embeddings" in artifact_query + assert "JOIN task_artifacts AS artifacts" in artifact_query + assert "artifacts.id = %s" in artifact_query + assert "artifacts.ingestion_status = 'ingested'" in artifact_query + assert "ORDER BY score DESC, artifacts.relative_path ASC, chunks.sequence_no ASC, chunks.id ASC" in artifact_query + assert artifact_params == ("[1.0,0.0,0.0]", embedding_config_id, 3, task_artifact_id, 3)