diff --git a/.ai/active/SPRINT_PACKET.md b/.ai/active/SPRINT_PACKET.md index c42b6a1..eb2e402 100644 --- a/.ai/active/SPRINT_PACKET.md +++ b/.ai/active/SPRINT_PACKET.md @@ -2,7 +2,7 @@ ## Sprint Title -Sprint 5L: PDF Artifact Parsing V0 +Sprint 5M: DOCX Artifact Parsing V0 ## Sprint Type @@ -10,15 +10,15 @@ feature ## Sprint Reason -Sprint 5K re-synchronized project truth through Sprint 5J, and the agreed next delivery focus is richer document parsing on top of the shipped rooted workspace, durable chunk, and hybrid artifact compile baseline. The narrowest safe next slice is PDF ingestion only, not a broad “rich documents” sprint that mixes PDF, DOCX, OCR, connectors, or UI. +Sprint 5L proved the richer-document-parsing seam can widen safely without changing the rooted workspace, durable chunk, retrieval, or compile contracts. The next safe slice is DOCX ingestion only, not broader PDF compatibility, OCR, connectors, or UI. ## Sprint Intent -Extend the existing artifact-ingestion seam so registered PDF artifacts can be ingested into the existing durable `task_artifact_chunks` substrate through deterministic text extraction, without changing retrieval contracts, compile contracts, connectors, or UI. +Extend the existing artifact-ingestion seam so registered DOCX artifacts can be ingested into the existing durable `task_artifact_chunks` substrate through deterministic local text extraction, without changing retrieval contracts, compile contracts, connectors, or UI. ## Git Instructions -- Branch Name: `codex/sprint-5l-pdf-artifact-parsing-v0` +- Branch Name: `codex/sprint-5m-docx-artifact-parsing-v0` - Base Branch: `main` - PR Strategy: one sprint branch, one PR, no stacked PRs unless Control Tower explicitly opens a follow-up sprint - Merge Policy: squash merge only after reviewer `PASS` and explicit Control Tower merge approval @@ -29,39 +29,39 @@ Extend the existing artifact-ingestion seam so registered PDF artifacts can be i - Sprint 5C shipped explicit task-artifact registration. - Sprint 5D shipped deterministic local text-artifact ingestion into durable chunk rows. - Sprint 5E through 5J shipped lexical retrieval, semantic retrieval, and hybrid compile-path artifact retrieval on top of those persisted chunk rows. -- Sprint 5K re-synchronized the truth artifacts and explicitly set richer document parsing as the next narrow move. -- The safest next step is PDF extraction only, feeding the already-shipped chunk and retrieval seams without expanding into OCR, DOCX, connectors, or UI. +- Sprint 5L extended the same ingestion seam to narrow PDF text extraction without changing retrieval or compile contracts. +- The next narrow richer-document move is a separate DOCX ingestion seam, which increases format coverage without widening into OCR, connector, or UI scope. 
## In Scope -- Extend schema and contracts only as narrowly needed to support PDF ingestion metadata, for example: +- Extend schema and contracts only as narrowly needed to support DOCX ingestion metadata, for example: - `task_artifacts.ingestion_status` reuse if no new status is required - optional deterministic extraction metadata on artifact detail or ingestion responses if needed - Define typed contracts for: - - PDF artifact-ingestion requests if they differ from the current generic artifact-ingestion path - - artifact-ingestion responses updated for PDF extraction metadata if needed - - artifact detail or chunk summary metadata updated for PDF ingestion if needed + - DOCX artifact-ingestion requests if they differ from the current generic artifact-ingestion path + - artifact-ingestion responses updated for DOCX extraction metadata if needed + - artifact detail or chunk summary metadata updated for DOCX ingestion if needed - Extend the existing ingestion seam so it: - - accepts already-registered visible PDF artifacts + - accepts already-registered visible DOCX artifacts - resolves rooted local file paths from persisted workspace plus artifact relative path - - supports one explicit PDF extraction path only - - extracts deterministic text from PDFs without OCR + - supports one explicit DOCX extraction path only + - extracts deterministic text from DOCX package contents without OCR or image extraction - normalizes extracted text before chunking - persists ordered chunk rows into the existing `task_artifact_chunks` table - updates artifact ingestion status deterministically - Add unit and integration tests for: - - supported PDF ingestion - - deterministic chunk ordering and chunk boundaries from extracted PDF text - - rooted path enforcement during PDF ingestion - - rejection of scanned-image or textless PDFs when no extractable text is present + - supported DOCX ingestion + - deterministic chunk ordering and chunk boundaries from extracted DOCX text + - rooted path enforcement during DOCX ingestion + - rejection of malformed or textless DOCX files when no extractable text is present - per-user isolation - stable response shape ## Out of Scope -- No DOCX ingestion. +- No broader PDF compatibility work. - No OCR. -- No image extraction. +- No image extraction from DOCX. - No changes to lexical retrieval contracts. - No changes to semantic retrieval contracts. - No compile contract changes. @@ -71,58 +71,58 @@ Extend the existing artifact-ingestion seam so registered PDF artifacts can be i ## Required Deliverables -- Narrow ingestion support for visible PDF artifacts using the existing artifact and chunk seams. -- Stable contract updates only where PDF extraction metadata is necessary. -- Unit and integration coverage for PDF extraction, rooted-path safety, deterministic chunk persistence, and isolation. +- Narrow ingestion support for visible DOCX artifacts using the existing artifact and chunk seams. +- Stable contract updates only where DOCX extraction metadata is necessary. +- Unit and integration coverage for DOCX extraction, rooted-path safety, deterministic chunk persistence, and isolation. - Updated `BUILD_REPORT.md` with exact verification results and explicit deferred scope. ## Acceptance Criteria -- A client can ingest one supported visible PDF artifact into durable ordered chunk rows using the existing artifact-ingestion seam. -- PDF ingestion reads only files rooted under the persisted task workspace boundary. 
+- A client can ingest one supported visible DOCX artifact into durable ordered chunk rows using the existing artifact-ingestion seam. +- DOCX ingestion reads only files rooted under the persisted task workspace boundary. - Extracted text is normalized and chunked deterministically into the existing `task_artifact_chunks` contract. -- Textless or unsupported PDFs are rejected deterministically rather than silently producing misleading chunks. +- Malformed or textless DOCX files are rejected deterministically rather than silently producing misleading chunks. - Existing lexical, semantic, and hybrid artifact retrieval contracts continue to operate over the persisted chunk rows without contract changes. - `./.venv/bin/python -m pytest tests/unit` passes. - `./.venv/bin/python -m pytest tests/integration` passes. -- No DOCX, OCR, connector, runner, compile-contract, or UI scope enters the sprint. +- No PDF-compatibility expansion, OCR, connector, runner, compile-contract, or UI scope enters the sprint. ## Implementation Constraints - Keep richer parsing narrow and boring. - Reuse the existing rooted `task_workspaces`, `task_artifacts`, and `task_artifact_chunks` seams rather than creating a parallel document store. -- Support PDF text extraction only; do not introduce OCR fallback in the same sprint. +- Support DOCX text extraction only; do not introduce OCR, image extraction, or document-layout reconstruction in the same sprint. - Preserve existing retrieval and compile contracts by feeding the already-shipped chunk substrate. - Keep extraction and chunking deterministic and testable from local files alone. ## Suggested Work Breakdown -1. Define any minimal PDF-ingestion contract updates needed. -2. Implement deterministic rooted PDF text extraction in the existing artifact-ingestion seam. +1. Define any minimal DOCX-ingestion contract updates needed. +2. Implement deterministic rooted DOCX text extraction in the existing artifact-ingestion seam. 3. Normalize extracted text and persist ordered chunk rows into the existing chunk store. -4. Add deterministic failure behavior for textless PDFs. +4. Add deterministic failure behavior for malformed or textless DOCX files. 5. Add unit and integration tests. 6. Update `BUILD_REPORT.md` with executed verification. 
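+
+As a reference for steps 2 and 3 above, a minimal sketch of the deterministic normalize-then-chunk behavior this packet reuses (rule `normalized_utf8_text_fixed_window_1000_chars_v1`); the helper names here are illustrative, not the repo's actual identifiers:
+
+```python
+def normalize_text(raw: str) -> str:
+    # Rewrite CRLF and lone CR line endings to LF before chunking.
+    return raw.replace("\r\n", "\n").replace("\r", "\n")
+
+
+def fixed_window_chunks(text: str, window: int = 1000) -> list[tuple[int, int, str]]:
+    # Emit ordered (char_start, char_end_exclusive, text) windows; only the
+    # final window may be shorter than `window` characters.
+    return [
+        (start, min(start + window, len(text)), text[start : start + window])
+        for start in range(0, len(text), window)
+    ]
+```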
## Build Report Requirements `BUILD_REPORT.md` must include: -- the exact PDF-ingestion contract changes introduced, if any -- the PDF extraction path and chunking rule used +- the exact DOCX-ingestion contract changes introduced, if any +- the DOCX extraction path and chunking rule used - exact commands run - unit and integration test results -- one example PDF artifact-ingestion response -- one example chunk list response produced from a PDF artifact +- one example DOCX artifact-ingestion response +- one example chunk list response produced from a DOCX artifact - what remains intentionally deferred to later milestones ## Review Focus `REVIEW_REPORT.md` should verify: -- the sprint stayed limited to PDF artifact parsing through the existing ingestion seam -- PDF ingestion reuses the existing rooted workspace, artifact, and chunk contracts +- the sprint stayed limited to DOCX artifact parsing through the existing ingestion seam +- DOCX ingestion reuses the existing rooted workspace, artifact, and chunk contracts - extraction determinism, chunk ordering, rooted-path safety, and isolation are test-backed -- no hidden DOCX, OCR, connector, runner, compile-contract, or UI scope entered the sprint +- no hidden PDF-compatibility expansion, OCR, connector, runner, compile-contract, or UI scope entered the sprint ## Exit Condition -This sprint is complete when the repo can ingest supported visible PDF artifacts into deterministic durable chunk rows through the existing artifact-ingestion seam, verify the full path with Postgres-backed tests, and still defer broader document parsing, connectors, and UI. +This sprint is complete when the repo can ingest supported visible DOCX artifacts into deterministic durable chunk rows through the existing artifact-ingestion seam, verify the full path with Postgres-backed tests, and still defer broader document parsing, connectors, and UI. diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index 168429a..faf17a5 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -2,16 +2,16 @@ ## Current Implemented Slice -AliceBot now implements the accepted repo slice through Sprint 5J. The shipped backend includes: +AliceBot now implements the accepted repo slice through Sprint 5M. 
The shipped backend includes: - foundation continuity storage over `users`, `threads`, `sessions`, and append-only `events` - deterministic tracing and context compilation over durable continuity, memory, entity, and entity-edge records - governed memory admission, explicit-preference extraction, memory review labels, review queue reads, evaluation summary reads, explicit embedding config and memory-embedding storage, direct semantic retrieval, and deterministic hybrid compile-path memory merge - deterministic prompt assembly and one no-tools response path that persists assistant replies as immutable continuity events - user-scoped consents, policies, policy evaluation, tool registry, allowlist evaluation, tool routing, approval request persistence, approval resolution, approved-only proxy execution through the in-process `proxy.echo` handler, durable execution review, and execution-budget lifecycle plus enforcement -- durable `tasks`, `task_steps`, `task_workspaces`, `task_artifacts`, `task_artifact_chunks`, and `task_artifact_chunk_embeddings`, deterministic task-step sequencing, explicit task-step transitions, explicit manual continuation with lineage through `parent_step_id`, `source_approval_id`, and `source_execution_id`, explicit `tool_executions.task_step_id` linkage for execution synchronization, deterministic rooted local task-workspace provisioning, explicit rooted local artifact registration, deterministic local text-artifact ingestion into durable chunk rows, deterministic lexical artifact-chunk retrieval over durable chunk rows, explicit user-scoped artifact-chunk embedding persistence tied to existing embedding configs, explicit task-scoped or artifact-scoped semantic artifact-chunk retrieval over those durable embeddings, and compile-path artifact retrieval that can include lexical results, semantic results, or one deterministic hybrid lexical-plus-semantic merged artifact section with per-chunk source provenance +- durable `tasks`, `task_steps`, `task_workspaces`, `task_artifacts`, `task_artifact_chunks`, and `task_artifact_chunk_embeddings`, deterministic task-step sequencing, explicit task-step transitions, explicit manual continuation with lineage through `parent_step_id`, `source_approval_id`, and `source_execution_id`, explicit `tool_executions.task_step_id` linkage for execution synchronization, deterministic rooted local task-workspace provisioning, explicit rooted local artifact registration, deterministic local plain-text, markdown, narrow PDF text, and narrow DOCX text ingestion into durable chunk rows, deterministic lexical artifact-chunk retrieval over durable chunk rows, explicit user-scoped artifact-chunk embedding persistence tied to existing embedding configs, explicit task-scoped or artifact-scoped semantic artifact-chunk retrieval over those durable embeddings, and compile-path artifact retrieval that can include lexical results, semantic results, or one deterministic hybrid lexical-plus-semantic merged artifact section with per-chunk source provenance -The current multi-step boundary is narrow and explicit. Manual continuation is implemented and review-passed. Approval resolution and proxy execution now both use explicit task-step linkage rather than first-step inference. 
Task workspaces are now implemented only as deterministic rooted local boundaries, and task artifacts are now implemented only as explicit rooted local-file registrations, narrow deterministic text ingestion under those workspaces, lexical retrieval over persisted chunk rows, explicit artifact-chunk embedding storage tied to existing embedding configs, direct semantic retrieval over those durable artifact-chunk embeddings for one visible task or one visible artifact at a time, and compile-path artifact retrieval that deterministically merges lexical and semantic candidates into one artifact section when both are requested for the same scope. Broader runner-style orchestration, automatic multi-step progression, richer document parsing, connectors, reranking beyond the current lexical-first hybrid merge, and new side-effect surfaces are still planned later and must not be described as live behavior. +The current multi-step boundary is narrow and explicit. Manual continuation is implemented and review-passed. Approval resolution and proxy execution now both use explicit task-step linkage rather than first-step inference. Task workspaces are now implemented only as deterministic rooted local boundaries, and task artifacts are now implemented only as explicit rooted local-file registrations, narrow deterministic artifact ingestion under those workspaces, lexical retrieval over persisted chunk rows, explicit artifact-chunk embedding storage tied to existing embedding configs, direct semantic retrieval over those durable artifact-chunk embeddings for one visible task or one visible artifact at a time, and compile-path artifact retrieval that deterministically merges lexical and semantic candidates into one artifact section when both are requested for the same scope. The live richer-document boundary is still intentionally narrow: plain text and markdown ingest directly, PDF support is limited to narrow local text extraction, and DOCX support is limited to narrow local text extraction from `word/document.xml`; OCR, image extraction, layout reconstruction, connectors, reranking beyond the current lexical-first hybrid merge, and new side-effect surfaces are still planned later and must not be described as live behavior. ## Implemented Now @@ -62,7 +62,7 @@ The current multi-step boundary is narrow and explicit. Manual continuation is i - `apps/web`: minimal shell only; no shipped workflow UI. - `workers`: scaffold only; no background jobs or runner logic are implemented. - `infra`: local development bootstrap assets only. -- `tests`: unit and Postgres-backed integration coverage for the shipped seams above, including Sprint 4O task-step lineage/manual continuation, Sprint 4S step-linked execution synchronization, Sprint 5A task-workspace provisioning, Sprint 5C task-artifact registration, Sprint 5D local artifact ingestion plus chunk reads, Sprint 5E lexical artifact-chunk retrieval, Sprint 5F compile-path artifact chunk integration, Sprint 5G artifact-chunk embedding persistence and reads, Sprint 5H direct semantic artifact-chunk retrieval, Sprint 5I compile-path semantic artifact retrieval, and Sprint 5J deterministic hybrid lexical-plus-semantic artifact merge in compile. 
+- `tests`: unit and Postgres-backed integration coverage for the shipped seams above, including Sprint 4O task-step lineage/manual continuation, Sprint 4S step-linked execution synchronization, Sprint 5A task-workspace provisioning, Sprint 5C task-artifact registration, Sprint 5D local artifact ingestion plus chunk reads, Sprint 5E lexical artifact-chunk retrieval, Sprint 5F compile-path artifact chunk integration, Sprint 5G artifact-chunk embedding persistence and reads, Sprint 5H direct semantic artifact-chunk retrieval, Sprint 5I compile-path semantic artifact retrieval, Sprint 5J deterministic hybrid lexical-plus-semantic artifact merge in compile, Sprint 5L narrow PDF artifact ingestion, and Sprint 5M narrow DOCX artifact ingestion. ## Core Flows Implemented Now @@ -198,14 +198,17 @@ The current multi-step boundary is narrow and explicit. Manual continuation is i 1. Accept a user-scoped `POST /v0/task-artifacts/{task_artifact_id}/ingest` request for one visible registered artifact. 2. Lock ingestion for that artifact before deciding whether work is needed. 3. Resolve the persisted workspace `local_path` plus persisted artifact `relative_path`, and reject any rooted-path escape deterministically. -4. Support only the narrow explicit text set: `text/plain` and `text/markdown`. -5. Read file bytes deterministically and require valid UTF-8 text. -6. Normalize line endings by rewriting `\r\n` and `\r` to `\n`. -7. Chunk normalized text deterministically with rule `normalized_utf8_text_fixed_window_1000_chars_v1`. -8. Persist ordered `task_artifact_chunks` rows with `sequence_no`, `char_start`, `char_end_exclusive`, and `text`. -9. Update the parent artifact to `ingestion_status = ingested`. -10. If the artifact is already ingested, return the existing artifact and chunk summary without reinserting chunks. -11. `GET /v0/task-artifacts/{task_artifact_id}/chunks` returns visible chunk rows in deterministic `sequence_no ASC, id ASC` order plus stable summary metadata. +4. Support only the current narrow explicit set: `text/plain`, `text/markdown`, narrow local `application/pdf` text extraction, and narrow local `application/vnd.openxmlformats-officedocument.wordprocessingml.document` text extraction from `word/document.xml`. +5. For plain text and markdown, read file bytes deterministically and require valid UTF-8 text. +6. For PDFs, extract only narrow local text content; OCR, image extraction, and broader PDF compatibility remain out of scope. +7. For DOCX, extract only narrow local text from `word/document.xml`; OCR, image extraction, headers/footers/comments expansion, and layout reconstruction remain out of scope. +8. Reject malformed or textless richer-document inputs deterministically instead of producing misleading chunks. +9. Normalize line endings by rewriting `\r\n` and `\r` to `\n`. +10. Chunk normalized text deterministically with rule `normalized_utf8_text_fixed_window_1000_chars_v1`. +11. Persist ordered `task_artifact_chunks` rows with `sequence_no`, `char_start`, `char_end_exclusive`, and `text`. +12. Update the parent artifact to `ingestion_status = ingested`. +13. If the artifact is already ingested, return the existing artifact and chunk summary without reinserting chunks. +14. `GET /v0/task-artifacts/{task_artifact_id}/chunks` returns visible chunk rows in deterministic `sequence_no ASC, id ASC` order plus stable summary metadata. 
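+
+A condensed sketch of the DOCX branch of this flow (steps 4 and 7), mirroring the shipped parser in `apps/api/src/alicebot_api/artifacts.py`; validation errors and exact failure messages are elided here:
+
+```python
+import io
+import xml.etree.ElementTree as ET
+import zipfile
+
+W = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
+
+
+def extract_docx_text(payload: bytes) -> str:
+    # Read only word/document.xml from the local DOCX package.
+    with zipfile.ZipFile(io.BytesIO(payload)) as archive:
+        root = ET.fromstring(archive.read("word/document.xml"))
+    paragraphs: list[str] = []
+    for paragraph in root.find(f"{W}body").iter(f"{W}p"):
+        fragments: list[str] = []
+        for element in paragraph.iter():
+            if element.tag == f"{W}t":  # literal run text
+                fragments.append(element.text or "")
+            elif element.tag == f"{W}tab":  # explicit tab
+                fragments.append("\t")
+            elif element.tag in {f"{W}br", f"{W}cr"}:  # explicit breaks
+                fragments.append("\n")
+        text = "".join(fragments)
+        if text != "":
+            paragraphs.append(text)
+    # Non-empty paragraphs join with "\n"; empty output is rejected upstream.
+    return "\n".join(paragraphs)
+```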
### Artifact Chunk Retrieval diff --git a/BUILD_REPORT.md b/BUILD_REPORT.md index d3fcf57..74a1fd5 100644 --- a/BUILD_REPORT.md +++ b/BUILD_REPORT.md @@ -2,50 +2,60 @@ ## sprint objective -Implement narrow PDF artifact parsing on the existing artifact-ingestion seam so already-registered visible PDF artifacts can be ingested into durable `task_artifact_chunks` rows without changing retrieval contracts, compile contracts, connectors, or UI. +Implement narrow DOCX artifact parsing on the existing artifact-ingestion seam so already-registered visible DOCX artifacts can be ingested into durable `task_artifact_chunks` rows without changing retrieval contracts, compile contracts, connectors, or UI. ## completed work -- Extended artifact media-type inference and validation to accept `application/pdf` and `.pdf` artifacts on the existing ingestion path. -- Implemented deterministic local PDF text extraction in `apps/api/src/alicebot_api/artifacts.py` by: - - parsing the PDF object graph from local bytes only - - walking the catalog/pages tree in page order - - reading `/Contents` streams only - - supporting unfiltered streams and `/FlateDecode` streams only - - extracting text from PDF text-show operators (`Tj`, `TJ`, `'`, `"`) only - - rejecting PDFs that are invalid, textless, or use unsupported stream filters/structures -- Kept ingestion status behavior unchanged: successful PDF ingestion updates `task_artifacts.ingestion_status` to `ingested`; rejected PDFs remain on the existing pending path and do not introduce a new status. -- Reused the existing normalization and chunking seam after extraction: - - line endings normalize through `normalize_artifact_text()` - - chunk persistence still uses `task_artifact_chunks` - - chunk rule remains `normalized_utf8_text_fixed_window_1000_chars_v1` +- Extended the existing artifact media-type support to accept DOCX artifacts: + - media type: `application/vnd.openxmlformats-officedocument.wordprocessingml.document` + - extension inference: `.docx` +- Implemented deterministic local DOCX text extraction in `apps/api/src/alicebot_api/artifacts.py` by: + - opening local DOCX bytes as a ZIP package + - reading `word/document.xml` only + - parsing WordprocessingML locally with `xml.etree.ElementTree` + - extracting paragraph text in document order from `w:t` + - preserving explicit DOCX tabs and line breaks via `w:tab`, `w:br`, and `w:cr` + - joining non-empty paragraphs with `\n` + - rejecting malformed packages/XML as invalid DOCX + - rejecting textless DOCX files when no extractable text is present +- Reused the existing ingestion seam after extraction: + - rooted workspace path enforcement remains unchanged + - normalization still runs through `normalize_artifact_text()` + - chunk persistence still targets `task_artifact_chunks` + - ingestion status still transitions from `pending` to `ingested` on success +- Kept retrieval/compile contracts unchanged while updating extension-based media-type inference for semantic artifact retrieval so `.docx` artifacts remain typed consistently when `media_type_hint` is absent. 
- Added unit coverage for: - - deterministic PDF chunk persistence - - textless PDF rejection - - PDF-rooted path enforcement - - updated unsupported-media validation + - deterministic DOCX chunk persistence + - stable unsupported-media validation text + - textless DOCX rejection + - malformed DOCX rejection + - rooted DOCX path enforcement - Added integration coverage for: - - supported PDF ingestion with stable response shape - - deterministic PDF chunk ordering and boundaries - - PDF per-user isolation - - textless PDF rejection - - rooted-path enforcement during PDF ingestion + - supported DOCX ingestion with stable response shape + - deterministic DOCX chunk ordering and boundaries + - per-user isolation for DOCX ingestion/chunk listing + - textless DOCX rejection + - malformed DOCX rejection + - rooted-path enforcement during DOCX ingestion -## exact PDF-ingestion contract changes introduced +## exact DOCX-ingestion contract changes introduced - No request contract changes. -- No response contract shape changes. +- No response shape changes. - No schema changes. -- The only contract-level behavior change is that the existing artifact-ingestion seam now accepts `application/pdf` as a supported artifact media type and continues returning the existing `TaskArtifactIngestionResponse` and `TaskArtifactChunkListResponse` shapes. +- Existing artifact-ingestion behavior now additionally accepts `application/vnd.openxmlformats-officedocument.wordprocessingml.document`. +- Extension-based media-type inference now recognizes `.docx` for the existing artifact and semantic-retrieval response paths. -## PDF extraction path and chunking rule used +## DOCX extraction path and chunking rule used - Extraction path: - existing `POST /v0/task-artifacts/{task_artifact_id}/ingest` - resolve persisted workspace `local_path` plus persisted artifact `relative_path` - enforce rooted workspace boundary before any read - - for PDFs, parse local file bytes, walk the PDF page tree, decode supported content streams, and extract text-show operations in deterministic stream order - - reject if no extractable text is found + - read the local DOCX package from disk + - extract text from `word/document.xml` only + - emit paragraph-ordered text from `w:t`, `w:tab`, `w:br`, and `w:cr` + - reject invalid or textless DOCX artifacts deterministically - Chunking rule: - normalize extracted text with CRLF/CR to LF conversion - split into fixed windows of 1000 characters @@ -54,11 +64,12 @@ Implement narrow PDF artifact parsing on the existing artifact-ingestion seam so ## incomplete work -- None within Sprint 5L scope. +- None within Sprint 5M scope. ## files changed - `apps/api/src/alicebot_api/artifacts.py` +- `apps/api/src/alicebot_api/semantic_retrieval.py` - `tests/unit/test_artifacts.py` - `tests/unit/test_artifacts_main.py` - `tests/integration/test_task_artifacts_api.py` @@ -67,23 +78,23 @@ Implement narrow PDF artifact parsing on the existing artifact-ingestion seam so ## tests run - `./.venv/bin/python -m pytest tests/unit/test_artifacts.py tests/unit/test_artifacts_main.py` - - Result: `40 passed in 0.45s` + - Result: `44 passed in 0.43s` - `./.venv/bin/python -m pytest tests/integration/test_task_artifacts_api.py` - - First sandboxed attempt failed because the suite could not reach the local Postgres instance (`psycopg.OperationalError: ... Operation not permitted`). + - Result: blocked in the sandbox because local Postgres access was denied (`psycopg.OperationalError: ... 
Operation not permitted`) - `./.venv/bin/python -m pytest tests/unit` - - Result: `382 passed in 0.73s` + - Result: `386 passed in 0.63s` - `./.venv/bin/python -m pytest tests/integration` - - Result: `120 passed in 34.65s` + - Result: `123 passed in 36.27s` ## unit and integration test results - Unit suite passed in full. -- Integration suite passed in full against Postgres-backed tests. -- The new PDF-specific integration coverage is included in the passing `tests/integration/test_task_artifacts_api.py` module. +- Integration suite passed in full against the Postgres-backed test path. +- The DOCX-specific API coverage is included in the passing `tests/integration/test_task_artifacts_api.py` module. -## one example PDF artifact-ingestion response +## one example DOCX artifact-ingestion response -Example verified by the PDF integration assertion: +Example verified by `test_task_artifact_docx_ingestion_and_chunk_endpoints_are_deterministic_and_isolated`: ```json { @@ -93,24 +104,24 @@ Example verified by the PDF integration assertion: "task_workspace_id": "", "status": "registered", "ingestion_status": "ingested", - "relative_path": "docs/spec.pdf", - "media_type_hint": "application/pdf", + "relative_path": "docs/spec.docx", + "media_type_hint": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "created_at": "", "updated_at": "" }, "summary": { "total_count": 2, "total_characters": 1006, - "media_type": "application/pdf", + "media_type": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "chunking_rule": "normalized_utf8_text_fixed_window_1000_chars_v1", "order": ["sequence_no_asc", "id_asc"] } } ``` -## one example chunk list response produced from a PDF artifact +## one example chunk list response produced from a DOCX artifact -Example verified by the PDF integration assertion: +Example verified by `test_task_artifact_docx_ingestion_and_chunk_endpoints_are_deterministic_and_isolated`: ```json { @@ -139,7 +150,7 @@ Example verified by the PDF integration assertion: "summary": { "total_count": 2, "total_characters": 1006, - "media_type": "application/pdf", + "media_type": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "chunking_rule": "normalized_utf8_text_fixed_window_1000_chars_v1", "order": ["sequence_no_asc", "id_asc"] } @@ -148,22 +159,23 @@ Example verified by the PDF integration assertion: ## blockers/issues -- No repo-level implementation blockers remained. -- The integration suite required elevated access to reach the local Postgres test instance from this environment; after rerunning with that access, the full integration suite passed. +- No implementation blockers remained. +- The first direct integration-test attempt from the sandbox could not reach local Postgres; rerunning the required integration suite with elevated local access succeeded. 
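+
+As a cross-check of the example numbers above (a standalone sketch, not repo code): the fixture joins the paragraphs `"A" * 998`, `"B" * 5`, and `"C"` with `\n` into 1006 characters, so the 1000-character window rule produces exactly the two chunks shown:
+
+```python
+text = "\n".join(["A" * 998, "B" * 5, "C"])
+assert len(text) == 1006
+assert text[:1000] == ("A" * 998) + "\n" + "B"  # chunk 1: [0, 1000)
+assert text[1000:] == "BBBB\nC"                 # chunk 2: [1000, 1006)
+```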
## what remains intentionally deferred to later milestones -- DOCX ingestion +- broader PDF compatibility work - OCR -- image extraction from PDFs +- image extraction from DOCX +- document-layout reconstruction +- headers/footers/comments/track-changes-specific DOCX extraction expansion - connector work - runner-style orchestration - retrieval-contract changes - semantic-contract changes - compile-contract changes - UI work -- broader PDF compatibility beyond the current narrow text-only local content-stream extraction path ## recommended next step -Open a follow-up sprint only if broader document coverage is needed, starting with an explicit decision on whether to extend PDF compatibility further or add a separate DOCX ingestion seam without changing the current retrieval and compile contracts. +If richer document support is needed later, open a separate sprint for either broader DOCX coverage beyond `word/document.xml` or broader PDF compatibility, but keep both on the existing rooted artifact/chunk seam. diff --git a/REVIEW_REPORT.md b/REVIEW_REPORT.md index 0fa3d0c..00d76b3 100644 --- a/REVIEW_REPORT.md +++ b/REVIEW_REPORT.md @@ -1,39 +1,33 @@ verdict: PASS criteria met -- The sprint stayed within the existing artifact-ingestion seam. The runtime diff is limited to [artifacts.py](/Users/samirusani/Desktop/Codex/AliceBot/apps/api/src/alicebot_api/artifacts.py), the artifact-focused tests, and report files; no connector, runner, compile-contract, or UI code entered scope. -- PDF ingestion reuses the rooted `task_workspaces`, `task_artifacts`, and `task_artifact_chunks` seams without schema or response-shape changes. `application/pdf` is accepted on the existing ingest path and the existing `TaskArtifactIngestionResponse` / chunk-list shapes are preserved. -- Rooted-path safety is enforced during PDF ingestion. Both unit and Postgres-backed integration coverage verify that a persisted relative-path escape is rejected deterministically. -- Extracted PDF text is normalized and chunked deterministically into ordered `task_artifact_chunks` rows. The new tests verify stable 1000-character boundaries, `sequence_no` ordering, and stable chunk-list summary metadata. -- Textless PDFs are rejected deterministically instead of producing misleading chunks. Unit and integration tests both assert the explicit `does not contain extractable PDF text` failure. -- Per-user isolation remains intact for PDF artifacts. The integration suite verifies that another user cannot ingest or list chunks for the owner’s registered PDF artifact. -- Acceptance-suite verification was rerun during review: -- `./.venv/bin/python -m pytest tests/unit/test_artifacts.py tests/unit/test_artifacts_main.py` -> `40 passed` -- `./.venv/bin/python -m pytest tests/integration/test_task_artifacts_api.py` -> `8 passed` -- `./.venv/bin/python -m pytest tests/unit` -> `382 passed` -- `./.venv/bin/python -m pytest tests/integration` -> `120 passed` +- Sprint 5M remains within the existing artifact-ingestion seam. The runtime changes are still limited to DOCX ingestion in `apps/api/src/alicebot_api/artifacts.py` plus the narrow `.docx` media-type fallback in `apps/api/src/alicebot_api/semantic_retrieval.py`; no connector, runner, compile-contract, or UI scope entered the sprint. +- DOCX ingestion still reuses the rooted `task_workspaces`, `task_artifacts`, and `task_artifact_chunks` seams without schema or response-shape changes. 
+- Rooted-path safety, deterministic chunk persistence, malformed/textless DOCX rejection, and per-user isolation remain covered by the unit and Postgres-backed integration tests already reviewed. +- The previously missing regression coverage is now present in `tests/unit/test_semantic_retrieval.py`: a `.docx` artifact with `media_type_hint=None` is exercised directly, and the semantic retrieval response is asserted to infer `application/vnd.openxmlformats-officedocument.wordprocessingml.document`. +- `ARCHITECTURE.md` now matches the shipped slice: it reports scope through Sprint 5M, describes the narrow PDF and DOCX ingestion boundary accurately, and keeps OCR/image/layout work explicitly deferred. +- Review verification: +- prior review verification still stands: `./.venv/bin/python -m pytest tests/unit` -> `386 passed in 0.56s` +- prior review verification still stands: `./.venv/bin/python -m pytest tests/integration` -> `123 passed in 38.04s` +- follow-up verification rerun in this review: `./.venv/bin/python -m pytest tests/unit/test_semantic_retrieval.py` -> `8 passed in 0.08s` criteria missed - None. quality issues -- No blocking implementation defects were found in the Sprint 5L scope. -- The PDF parser is intentionally narrow. It only covers direct local content-stream text extraction with unfiltered and `/FlateDecode` streams, and broader compatibility remains outside this sprint. That matches the packet’s narrow-slice intent and is documented as deferred, so it is not a blocker. +- No blocking implementation or coverage issues remain for Sprint 5M scope. regression risks -- Retrieval, semantic retrieval, and hybrid compile behavior remain on the unchanged chunk substrate and the full integration suite still passes, but there is no new PDF-specific end-to-end retrieval or compile assertion. That is a residual regression risk, not a current failure. -- Future widening of PDF support should treat parser compatibility as explicit scope. The implementation is deliberately not a general-purpose PDF engine. +- No new regression risks beyond the intentionally narrow richer-document boundaries already documented in the sprint packet and architecture notes. docs issues -- `BUILD_REPORT.md` is complete enough for the sprint packet and matches the shipped diff. -- Optional follow-up only: if the team wants archival clarity, spell out in future milestone docs that “supported PDF” currently means the narrow text-only extraction path implemented in [artifacts.py](/Users/samirusani/Desktop/Codex/AliceBot/apps/api/src/alicebot_api/artifacts.py), not general PDF compatibility. +- None. `BUILD_REPORT.md` and `ARCHITECTURE.md` are consistent with the implemented slice and the review expectations. should anything be added to RULES.md? - No. should anything update ARCHITECTURE.md? -- No. The sprint does not change the core architecture boundaries; it extends the existing artifact-ingestion seam without altering the workspace, artifact, chunk, retrieval, or compile contracts. +- No further update is required for this sprint. recommended next action -- Accept Sprint 5L as complete and merge after normal approval flow. -- If the next milestone stays on richer document parsing, add one PDF-backed retrieval or compile regression test before widening into broader PDF compatibility, DOCX, or OCR. +- Accept Sprint 5M as complete and merge after normal approval flow. 
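+
+For reference, the `.docx` media-type fallback cited above amounts to an extension lookup of this shape; this is a sketch around the reviewed mapping, and the helper name and signature are assumptions, not quotes from `semantic_retrieval.py`:
+
+```python
+from pathlib import PurePosixPath
+
+# Excerpt of the reviewed mapping; earlier entries are elided here.
+EXTENSION_MEDIA_TYPES = {
+    ".text": "text/plain",
+    ".md": "text/markdown",
+    ".markdown": "text/markdown",
+    ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+}
+
+
+def infer_media_type(relative_path: str, media_type_hint: str | None) -> str | None:
+    # Prefer the persisted hint; fall back to the registered file extension.
+    if media_type_hint is not None:
+        return media_type_hint
+    return EXTENSION_MEDIA_TYPES.get(PurePosixPath(relative_path).suffix.lower())
+```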
diff --git a/apps/api/src/alicebot_api/artifacts.py b/apps/api/src/alicebot_api/artifacts.py index 3e12638..734f251 100644 --- a/apps/api/src/alicebot_api/artifacts.py +++ b/apps/api/src/alicebot_api/artifacts.py @@ -1,11 +1,14 @@ from __future__ import annotations +import io import re import zlib from dataclasses import dataclass from pathlib import Path from typing import cast from uuid import UUID +import xml.etree.ElementTree as ET +import zipfile import psycopg @@ -40,9 +43,13 @@ SUPPORTED_TEXT_ARTIFACT_MEDIA_TYPES = ("text/plain", "text/markdown") SUPPORTED_PDF_ARTIFACT_MEDIA_TYPE = "application/pdf" +SUPPORTED_DOCX_ARTIFACT_MEDIA_TYPE = ( + "application/vnd.openxmlformats-officedocument.wordprocessingml.document" +) SUPPORTED_ARTIFACT_MEDIA_TYPES = ( *SUPPORTED_TEXT_ARTIFACT_MEDIA_TYPES, SUPPORTED_PDF_ARTIFACT_MEDIA_TYPE, + SUPPORTED_DOCX_ARTIFACT_MEDIA_TYPE, ) SUPPORTED_ARTIFACT_EXTENSIONS = { ".txt": "text/plain", @@ -50,6 +57,7 @@ ".md": "text/markdown", ".markdown": "text/markdown", ".pdf": SUPPORTED_PDF_ARTIFACT_MEDIA_TYPE, + ".docx": SUPPORTED_DOCX_ARTIFACT_MEDIA_TYPE, } TASK_ARTIFACT_CHUNK_MAX_CHARS = 1000 TASK_ARTIFACT_CHUNKING_RULE = "normalized_utf8_text_fixed_window_1000_chars_v1" @@ -132,6 +140,14 @@ b"W*", b"y", } +_DOCX_DOCUMENT_XML_PATH = "word/document.xml" +_DOCX_WORDPROCESSING_NAMESPACE = "http://schemas.openxmlformats.org/wordprocessingml/2006/main" +_DOCX_PARAGRAPH_TAG = f"{{{_DOCX_WORDPROCESSING_NAMESPACE}}}p" +_DOCX_TEXT_TAG = f"{{{_DOCX_WORDPROCESSING_NAMESPACE}}}t" +_DOCX_TAB_TAG = f"{{{_DOCX_WORDPROCESSING_NAMESPACE}}}tab" +_DOCX_BREAK_TAG = f"{{{_DOCX_WORDPROCESSING_NAMESPACE}}}br" +_DOCX_CARRIAGE_RETURN_TAG = f"{{{_DOCX_WORDPROCESSING_NAMESPACE}}}cr" +_DOCX_BODY_TAG = f"{{{_DOCX_WORDPROCESSING_NAMESPACE}}}body" @dataclass(frozen=True, slots=True) @@ -268,6 +284,49 @@ def _extract_text_from_utf8_artifact_bytes(*, relative_path: str, payload: bytes ) from exc +def _extract_text_from_docx_paragraph(paragraph: ET.Element) -> str: + fragments: list[str] = [] + for element in paragraph.iter(): + if element.tag == _DOCX_TEXT_TAG: + fragments.append(element.text or "") + continue + if element.tag == _DOCX_TAB_TAG: + fragments.append("\t") + continue + if element.tag in {_DOCX_BREAK_TAG, _DOCX_CARRIAGE_RETURN_TAG}: + fragments.append("\n") + return "".join(fragments) + + +def _extract_text_from_docx_artifact_bytes(*, relative_path: str, payload: bytes) -> str: + try: + with zipfile.ZipFile(io.BytesIO(payload)) as archive: + document_xml = archive.read(_DOCX_DOCUMENT_XML_PATH) + except (KeyError, zipfile.BadZipFile) as exc: + raise TaskArtifactValidationError(f"artifact {relative_path} is not a valid DOCX") from exc + + try: + document_root = ET.fromstring(document_xml) + except ET.ParseError as exc: + raise TaskArtifactValidationError(f"artifact {relative_path} is not a valid DOCX") from exc + + document_body = document_root.find(_DOCX_BODY_TAG) + if document_body is None: + raise TaskArtifactValidationError(f"artifact {relative_path} is not a valid DOCX") + + paragraphs = [ + paragraph_text + for paragraph in document_body.iter(_DOCX_PARAGRAPH_TAG) + if (paragraph_text := _extract_text_from_docx_paragraph(paragraph)) != "" + ] + extracted_text = "\n".join(paragraphs).strip() + if extracted_text == "": + raise TaskArtifactValidationError( + f"artifact {relative_path} does not contain extractable DOCX text" + ) + return extracted_text + + def _extract_pdf_name(dictionary: bytes, key: bytes) -> bytes | None: match = re.search(rb"/" + re.escape(key) + 
rb"\s*/([A-Za-z0-9_.#-]+)", dictionary) if match is None: @@ -742,6 +801,11 @@ def extract_artifact_text(*, row: TaskArtifactRow, artifact_path: Path, media_ty relative_path=row["relative_path"], payload=payload, ) + if media_type == SUPPORTED_DOCX_ARTIFACT_MEDIA_TYPE: + return _extract_text_from_docx_artifact_bytes( + relative_path=row["relative_path"], + payload=payload, + ) raise TaskArtifactValidationError( f"artifact {row['relative_path']} has unsupported media type {media_type}" ) diff --git a/apps/api/src/alicebot_api/semantic_retrieval.py b/apps/api/src/alicebot_api/semantic_retrieval.py index 50059a1..5145062 100644 --- a/apps/api/src/alicebot_api/semantic_retrieval.py +++ b/apps/api/src/alicebot_api/semantic_retrieval.py @@ -33,6 +33,7 @@ ".text": "text/plain", ".md": "text/markdown", ".markdown": "text/markdown", + ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", } diff --git a/tests/integration/test_task_artifacts_api.py b/tests/integration/test_task_artifacts_api.py index d23c5c7..618bc24 100644 --- a/tests/integration/test_task_artifacts_api.py +++ b/tests/integration/test_task_artifacts_api.py @@ -1,11 +1,14 @@ from __future__ import annotations +import io import json import zlib from pathlib import Path from typing import Any from urllib.parse import urlencode from uuid import UUID, uuid4 +from xml.sax.saxutils import escape +import zipfile import anyio import psycopg @@ -99,6 +102,72 @@ def _build_pdf_bytes( return bytes(document) +def _build_docx_bytes( + paragraphs: list[str], + *, + include_document_xml: bool = True, + malformed_document_xml: bool = False, +) -> bytes: + document_xml = ( + b"' + '' + "" + + "".join( + ( + "" + f"{escape(paragraph)}" + "" + ) + for paragraph in paragraphs + ) + + ( + "" + "" + "" + "" + "" + "" + ) + ) + ) + + archive_buffer = io.BytesIO() + with zipfile.ZipFile(archive_buffer, "w", compression=zipfile.ZIP_STORED) as archive: + entries = { + "[Content_Types].xml": ( + '' + '' + '' + '' + '' + "" + ).encode("utf-8"), + "_rels/.rels": ( + '' + '' + '' + "" + ).encode("utf-8"), + } + if include_document_xml: + entries["word/document.xml"] = document_xml + + for name, payload in entries.items(): + info = zipfile.ZipInfo(filename=name) + info.date_time = (2026, 3, 13, 10, 0, 0) + info.compress_type = zipfile.ZIP_STORED + archive.writestr(info, payload) + + return archive_buffer.getvalue() + + def invoke_request( method: str, path: str, @@ -526,7 +595,8 @@ def test_task_artifact_ingestion_and_chunk_endpoints_are_deterministic_and_isola assert unsupported_ingest_payload == { "detail": ( "artifact docs/manual.bin has unsupported media type application/octet-stream; " - "supported types: text/plain, text/markdown, application/pdf" + "supported types: text/plain, text/markdown, application/pdf, " + "application/vnd.openxmlformats-officedocument.wordprocessingml.document" ) } @@ -658,6 +728,133 @@ def test_task_artifact_pdf_ingestion_and_chunk_endpoints_are_deterministic_and_i } +def test_task_artifact_docx_ingestion_and_chunk_endpoints_are_deterministic_and_isolated( + migrated_database_urls, + monkeypatch, + tmp_path, +) -> None: + owner = seed_task(migrated_database_urls["app"], email="owner@example.com") + intruder = seed_task(migrated_database_urls["app"], email="intruder@example.com") + workspace_root = tmp_path / "task-workspaces" + monkeypatch.setattr( + main_module, + "get_settings", + lambda: Settings( + database_url=migrated_database_urls["app"], + task_workspace_root=str(workspace_root), + ), + ) + + 
workspace_status, workspace_payload = invoke_request( + "POST", + f"/v0/tasks/{owner['task_id']}/workspace", + payload={"user_id": str(owner["user_id"])}, + ) + assert workspace_status == 201 + + workspace_path = Path(workspace_payload["workspace"]["local_path"]) + docx_file = workspace_path / "docs" / "spec.docx" + docx_file.parent.mkdir(parents=True) + docx_file.write_bytes(_build_docx_bytes(["A" * 998, "B" * 5, "C"])) + + register_status, register_payload = invoke_request( + "POST", + f"/v0/task-workspaces/{workspace_payload['workspace']['id']}/artifacts", + payload={ + "user_id": str(owner["user_id"]), + "local_path": str(docx_file), + "media_type_hint": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + }, + ) + assert register_status == 201 + + ingest_status, ingest_payload = invoke_request( + "POST", + f"/v0/task-artifacts/{register_payload['artifact']['id']}/ingest", + payload={"user_id": str(owner["user_id"])}, + ) + chunk_list_status, chunk_list_payload = invoke_request( + "GET", + f"/v0/task-artifacts/{register_payload['artifact']['id']}/chunks", + query_params={"user_id": str(owner["user_id"])}, + ) + isolated_chunk_list_status, isolated_chunk_list_payload = invoke_request( + "GET", + f"/v0/task-artifacts/{register_payload['artifact']['id']}/chunks", + query_params={"user_id": str(intruder["user_id"])}, + ) + isolated_ingest_status, isolated_ingest_payload = invoke_request( + "POST", + f"/v0/task-artifacts/{register_payload['artifact']['id']}/ingest", + payload={"user_id": str(intruder["user_id"])}, + ) + + assert ingest_status == 200 + assert ingest_payload == { + "artifact": { + "id": register_payload["artifact"]["id"], + "task_id": str(owner["task_id"]), + "task_workspace_id": workspace_payload["workspace"]["id"], + "status": "registered", + "ingestion_status": "ingested", + "relative_path": "docs/spec.docx", + "media_type_hint": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "created_at": register_payload["artifact"]["created_at"], + "updated_at": ingest_payload["artifact"]["updated_at"], + }, + "summary": { + "total_count": 2, + "total_characters": 1006, + "media_type": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "chunking_rule": "normalized_utf8_text_fixed_window_1000_chars_v1", + "order": ["sequence_no_asc", "id_asc"], + }, + } + + assert chunk_list_status == 200 + assert chunk_list_payload == { + "items": [ + { + "id": chunk_list_payload["items"][0]["id"], + "task_artifact_id": register_payload["artifact"]["id"], + "sequence_no": 1, + "char_start": 0, + "char_end_exclusive": 1000, + "text": ("A" * 998) + "\n" + "B", + "created_at": chunk_list_payload["items"][0]["created_at"], + "updated_at": chunk_list_payload["items"][0]["updated_at"], + }, + { + "id": chunk_list_payload["items"][1]["id"], + "task_artifact_id": register_payload["artifact"]["id"], + "sequence_no": 2, + "char_start": 1000, + "char_end_exclusive": 1006, + "text": "BBBB\nC", + "created_at": chunk_list_payload["items"][1]["created_at"], + "updated_at": chunk_list_payload["items"][1]["updated_at"], + }, + ], + "summary": { + "total_count": 2, + "total_characters": 1006, + "media_type": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "chunking_rule": "normalized_utf8_text_fixed_window_1000_chars_v1", + "order": ["sequence_no_asc", "id_asc"], + }, + } + + assert isolated_chunk_list_status == 404 + assert isolated_chunk_list_payload == { + "detail": f"task artifact 
{register_payload['artifact']['id']} was not found" + } + + assert isolated_ingest_status == 404 + assert isolated_ingest_payload == { + "detail": f"task artifact {register_payload['artifact']['id']} was not found" + } + + def test_task_artifact_ingestion_supports_markdown_and_reingest_is_idempotent( migrated_database_urls, monkeypatch, @@ -862,6 +1059,78 @@ def test_task_artifact_ingestion_rejects_textless_pdf_content( } +def test_task_artifact_ingestion_rejects_textless_or_malformed_docx( + migrated_database_urls, + monkeypatch, + tmp_path, +) -> None: + owner = seed_task(migrated_database_urls["app"], email="owner@example.com") + workspace_root = tmp_path / "task-workspaces" + monkeypatch.setattr( + main_module, + "get_settings", + lambda: Settings( + database_url=migrated_database_urls["app"], + task_workspace_root=str(workspace_root), + ), + ) + + workspace_status, workspace_payload = invoke_request( + "POST", + f"/v0/tasks/{owner['task_id']}/workspace", + payload={"user_id": str(owner["user_id"])}, + ) + assert workspace_status == 201 + + workspace_path = Path(workspace_payload["workspace"]["local_path"]) + textless_docx = workspace_path / "docs" / "empty.docx" + textless_docx.parent.mkdir(parents=True) + textless_docx.write_bytes(_build_docx_bytes(["", ""])) + malformed_docx = workspace_path / "docs" / "broken.docx" + malformed_docx.write_bytes(_build_docx_bytes(["broken"], malformed_document_xml=True)) + + textless_register_status, textless_register_payload = invoke_request( + "POST", + f"/v0/task-workspaces/{workspace_payload['workspace']['id']}/artifacts", + payload={ + "user_id": str(owner["user_id"]), + "local_path": str(textless_docx), + "media_type_hint": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + }, + ) + malformed_register_status, malformed_register_payload = invoke_request( + "POST", + f"/v0/task-workspaces/{workspace_payload['workspace']['id']}/artifacts", + payload={ + "user_id": str(owner["user_id"]), + "local_path": str(malformed_docx), + "media_type_hint": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + }, + ) + assert textless_register_status == 201 + assert malformed_register_status == 201 + + textless_ingest_status, textless_ingest_payload = invoke_request( + "POST", + f"/v0/task-artifacts/{textless_register_payload['artifact']['id']}/ingest", + payload={"user_id": str(owner["user_id"])}, + ) + malformed_ingest_status, malformed_ingest_payload = invoke_request( + "POST", + f"/v0/task-artifacts/{malformed_register_payload['artifact']['id']}/ingest", + payload={"user_id": str(owner["user_id"])}, + ) + + assert textless_ingest_status == 400 + assert textless_ingest_payload == { + "detail": "artifact docs/empty.docx does not contain extractable DOCX text" + } + assert malformed_ingest_status == 400 + assert malformed_ingest_payload == { + "detail": "artifact docs/broken.docx is not a valid DOCX" + } + + def test_task_artifact_ingestion_enforces_rooted_workspace_paths( migrated_database_urls, monkeypatch, @@ -927,6 +1196,71 @@ def test_task_artifact_ingestion_enforces_rooted_workspace_paths( } +def test_task_artifact_docx_ingestion_enforces_rooted_workspace_paths( + migrated_database_urls, + monkeypatch, + tmp_path, +) -> None: + owner = seed_task(migrated_database_urls["app"], email="owner@example.com") + workspace_root = tmp_path / "task-workspaces" + monkeypatch.setattr( + main_module, + "get_settings", + lambda: Settings( + database_url=migrated_database_urls["app"], + 
task_workspace_root=str(workspace_root), + ), + ) + + workspace_status, workspace_payload = invoke_request( + "POST", + f"/v0/tasks/{owner['task_id']}/workspace", + payload={"user_id": str(owner["user_id"])}, + ) + assert workspace_status == 201 + + workspace_path = Path(workspace_payload["workspace"]["local_path"]) + safe_file = workspace_path / "docs" / "spec.docx" + safe_file.parent.mkdir(parents=True) + safe_file.write_bytes(_build_docx_bytes(["spec"])) + outside_file = tmp_path / "escape.docx" + outside_file.write_bytes(_build_docx_bytes(["escape"])) + + register_status, register_payload = invoke_request( + "POST", + f"/v0/task-workspaces/{workspace_payload['workspace']['id']}/artifacts", + payload={ + "user_id": str(owner["user_id"]), + "local_path": str(safe_file), + "media_type_hint": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + }, + ) + assert register_status == 201 + + with psycopg.connect(migrated_database_urls["admin"]) as conn: + with conn.cursor() as cur: + cur.execute( + """ + UPDATE task_artifacts + SET relative_path = '../../../escape.docx' + WHERE id = %s + """, + (register_payload["artifact"]["id"],), + ) + conn.commit() + + ingest_status, ingest_payload = invoke_request( + "POST", + f"/v0/task-artifacts/{register_payload['artifact']['id']}/ingest", + payload={"user_id": str(owner["user_id"])}, + ) + + assert ingest_status == 400 + assert ingest_payload == { + "detail": f"artifact path {outside_file.resolve()} escapes workspace root {workspace_path.resolve()}" + } + + def test_task_artifact_chunk_retrieval_endpoints_are_scoped_deterministic_and_isolated( migrated_database_urls, monkeypatch, diff --git a/tests/unit/test_artifacts.py b/tests/unit/test_artifacts.py index 35da2b0..442a08d 100644 --- a/tests/unit/test_artifacts.py +++ b/tests/unit/test_artifacts.py @@ -1,9 +1,12 @@ from __future__ import annotations +import io import zlib from datetime import UTC, datetime, timedelta from pathlib import Path from uuid import UUID, uuid4 +from xml.sax.saxutils import escape +import zipfile import pytest @@ -121,6 +124,72 @@ def _build_pdf_bytes( return bytes(document) +def _build_docx_bytes( + paragraphs: list[str], + *, + include_document_xml: bool = True, + malformed_document_xml: bool = False, +) -> bytes: + document_xml = ( + '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>' + '<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">' + "<w:body>" + + "".join( + ( + "<w:p>" + f"<w:r><w:t>{escape(paragraph)}</w:t></w:r>" + "</w:p>" + ) + for paragraph in paragraphs + ) + + ( + # Leaving <w:body> unclosed makes word/document.xml fail XML parsing. + "<w:sectPr>" + if malformed_document_xml + else "<w:sectPr></w:sectPr>" + "</w:body>" + "</w:document>" + ) + ).encode("utf-8") + + archive_buffer = io.BytesIO() + with zipfile.ZipFile(archive_buffer, "w", compression=zipfile.ZIP_STORED) as archive: + entries = { + "[Content_Types].xml": ( + '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>' + '<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">' + '<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>' + '<Default Extension="xml" ContentType="application/xml"/>' + '<Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>' + "</Types>" + ).encode("utf-8"), + "_rels/.rels": ( + '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>' + '<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">' + '<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>' + "</Relationships>" + ).encode("utf-8"), + } + if include_document_xml: + entries["word/document.xml"] = document_xml + + for name, payload in entries.items(): + info = zipfile.ZipInfo(filename=name) + info.date_time = (2026, 3, 13, 10, 0, 0) + info.compress_type = zipfile.ZIP_STORED + archive.writestr(info, payload) + + return archive_buffer.getvalue() + + class ArtifactStoreStub: def __init__(self) -> None: self.base_time = datetime(2026, 3, 13, 10, 0, tzinfo=UTC) @@ -638,6 +707,84 @@ def test_ingest_task_artifact_record_persists_deterministic_pdf_chunks(tmp_path) ] + +def test_ingest_task_artifact_record_persists_deterministic_docx_chunks(tmp_path) -> None: + store = ArtifactStoreStub() + user_id = uuid4() + task_id = uuid4() + task_workspace_id = uuid4() + workspace_path = tmp_path / "workspaces" / str(user_id) / str(task_id) + 
workspace_path.mkdir(parents=True) + artifact_path = workspace_path / "docs" / "spec.docx" + artifact_path.parent.mkdir(parents=True) + artifact_path.write_bytes(_build_docx_bytes(["A" * 998, "B" * 5, "C"])) + store.create_task_workspace( + task_workspace_id=task_workspace_id, + task_id=task_id, + user_id=user_id, + local_path=str(workspace_path), + ) + artifact = store.create_task_artifact( + task_id=task_id, + task_workspace_id=task_workspace_id, + status="registered", + ingestion_status="pending", + relative_path="docs/spec.docx", + media_type_hint="application/vnd.openxmlformats-officedocument.wordprocessingml.document", + ) + + response = ingest_task_artifact_record( + store, + user_id=user_id, + request=TaskArtifactIngestInput(task_artifact_id=artifact["id"]), + ) + + assert response == { + "artifact": { + "id": str(artifact["id"]), + "task_id": str(task_id), + "task_workspace_id": str(task_workspace_id), + "status": "registered", + "ingestion_status": "ingested", + "relative_path": "docs/spec.docx", + "media_type_hint": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "created_at": "2026-03-13T10:00:00+00:00", + "updated_at": "2026-03-13T10:30:00+00:00", + }, + "summary": { + "total_count": 2, + "total_characters": 1006, + "media_type": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "chunking_rule": TASK_ARTIFACT_CHUNKING_RULE, + "order": ["sequence_no_asc", "id_asc"], + }, + } + assert store.locked_artifact_ids == [artifact["id"]] + assert store.list_task_artifact_chunks(artifact["id"]) == [ + { + "id": store.artifact_chunks[0]["id"], + "user_id": user_id, + "task_artifact_id": artifact["id"], + "sequence_no": 1, + "char_start": 0, + "char_end_exclusive": 1000, + "text": ("A" * 998) + "\n" + "B", + "created_at": datetime(2026, 3, 13, 10, 0, tzinfo=UTC), + "updated_at": datetime(2026, 3, 13, 10, 0, tzinfo=UTC), + }, + { + "id": store.artifact_chunks[1]["id"], + "user_id": user_id, + "task_artifact_id": artifact["id"], + "sequence_no": 2, + "char_start": 1000, + "char_end_exclusive": 1006, + "text": "BBBB\nC", + "created_at": datetime(2026, 3, 13, 10, 0, 1, tzinfo=UTC), + "updated_at": datetime(2026, 3, 13, 10, 0, 1, tzinfo=UTC), + }, + ] + + def test_ingest_task_artifact_record_is_idempotent_for_already_ingested_artifact() -> None: store = ArtifactStoreStub() user_id = uuid4() @@ -724,7 +871,8 @@ def test_ingest_task_artifact_record_rejects_unsupported_media_type(tmp_path) -> TaskArtifactValidationError, match=( "artifact docs/spec.bin has unsupported media type application/octet-stream; " - "supported types: text/plain, text/markdown, application/pdf" + "supported types: text/plain, text/markdown, application/pdf, " + "application/vnd.openxmlformats-officedocument.wordprocessingml.document" ), ): ingest_task_artifact_record( @@ -770,6 +918,78 @@ def test_ingest_task_artifact_record_rejects_textless_pdf(tmp_path) -> None: ) +def test_ingest_task_artifact_record_rejects_textless_docx(tmp_path) -> None: + store = ArtifactStoreStub() + user_id = uuid4() + task_id = uuid4() + task_workspace_id = uuid4() + workspace_path = tmp_path / "workspaces" / str(user_id) / str(task_id) + workspace_path.mkdir(parents=True) + artifact_path = workspace_path / "docs" / "empty.docx" + artifact_path.parent.mkdir(parents=True) + artifact_path.write_bytes(_build_docx_bytes(["", ""])) + store.create_task_workspace( + task_workspace_id=task_workspace_id, + task_id=task_id, + user_id=user_id, + local_path=str(workspace_path), + ) + artifact = 
+        task_id=task_id,
+        task_workspace_id=task_workspace_id,
+        status="registered",
+        ingestion_status="pending",
+        relative_path="docs/empty.docx",
+        media_type_hint="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+    )
+
+    with pytest.raises(
+        TaskArtifactValidationError,
+        match="artifact docs/empty.docx does not contain extractable DOCX text",
+    ):
+        ingest_task_artifact_record(
+            store,
+            user_id=user_id,
+            request=TaskArtifactIngestInput(task_artifact_id=artifact["id"]),
+        )
+
+
+def test_ingest_task_artifact_record_rejects_malformed_docx(tmp_path) -> None:
+    store = ArtifactStoreStub()
+    user_id = uuid4()
+    task_id = uuid4()
+    task_workspace_id = uuid4()
+    workspace_path = tmp_path / "workspaces" / str(user_id) / str(task_id)
+    workspace_path.mkdir(parents=True)
+    artifact_path = workspace_path / "docs" / "broken.docx"
+    artifact_path.parent.mkdir(parents=True)
+    artifact_path.write_bytes(_build_docx_bytes(["broken"], malformed_document_xml=True))
+    store.create_task_workspace(
+        task_workspace_id=task_workspace_id,
+        task_id=task_id,
+        user_id=user_id,
+        local_path=str(workspace_path),
+    )
+    artifact = store.create_task_artifact(
+        task_id=task_id,
+        task_workspace_id=task_workspace_id,
+        status="registered",
+        ingestion_status="pending",
+        relative_path="docs/broken.docx",
+        media_type_hint="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+    )
+
+    with pytest.raises(
+        TaskArtifactValidationError,
+        match="artifact docs/broken.docx is not a valid DOCX",
+    ):
+        ingest_task_artifact_record(
+            store,
+            user_id=user_id,
+            request=TaskArtifactIngestInput(task_artifact_id=artifact["id"]),
+        )
+
+
 def test_ingest_task_artifact_record_rejects_invalid_utf8_content(tmp_path) -> None:
     store = ArtifactStoreStub()
     user_id = uuid4()
@@ -838,6 +1058,38 @@ def test_ingest_task_artifact_record_rejects_paths_outside_workspace(tmp_path) -
     )
 
 
+def test_ingest_task_artifact_record_rejects_docx_paths_outside_workspace(tmp_path) -> None:
+    store = ArtifactStoreStub()
+    user_id = uuid4()
+    task_id = uuid4()
+    task_workspace_id = uuid4()
+    workspace_path = tmp_path / "workspaces" / str(user_id) / str(task_id)
+    workspace_path.mkdir(parents=True)
+    outside_path = tmp_path / "escape.docx"
+    outside_path.write_bytes(_build_docx_bytes(["escape"]))
+    store.create_task_workspace(
+        task_workspace_id=task_workspace_id,
+        task_id=task_id,
+        user_id=user_id,
+        local_path=str(workspace_path),
+    )
+    artifact = store.create_task_artifact(
+        task_id=task_id,
+        task_workspace_id=task_workspace_id,
+        status="registered",
+        ingestion_status="pending",
+        relative_path="../escape.docx",
+        media_type_hint="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+    )
+
+    with pytest.raises(TaskArtifactValidationError, match="escapes workspace root"):
+        ingest_task_artifact_record(
+            store,
+            user_id=user_id,
+            request=TaskArtifactIngestInput(task_artifact_id=artifact["id"]),
+        )
+
+
 def test_list_task_artifact_chunk_records_are_deterministic() -> None:
     store = ArtifactStoreStub()
     user_id = uuid4()
diff --git a/tests/unit/test_artifacts_main.py b/tests/unit/test_artifacts_main.py
index f43b78c..dac0244 100644
--- a/tests/unit/test_artifacts_main.py
+++ b/tests/unit/test_artifacts_main.py
@@ -498,7 +498,8 @@ def fake_user_connection(*_args, **_kwargs):
     def fake_ingest_task_artifact_record(*_args, **_kwargs):
         raise TaskArtifactValidationError(
             "artifact docs/spec.bin has unsupported media type application/octet-stream; "
-            "supported types: text/plain, text/markdown, application/pdf"
+            "supported types: text/plain, text/markdown, application/pdf, "
+            "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
         )
 
     monkeypatch.setattr(main_module, "get_settings", lambda: settings)
@@ -514,7 +515,8 @@ def fake_ingest_task_artifact_record(*_args, **_kwargs):
     assert json.loads(response.body) == {
         "detail": (
             "artifact docs/spec.bin has unsupported media type application/octet-stream; "
-            "supported types: text/plain, text/markdown, application/pdf"
+            "supported types: text/plain, text/markdown, application/pdf, "
+            "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
         )
     }
 
diff --git a/tests/unit/test_semantic_retrieval.py b/tests/unit/test_semantic_retrieval.py
index 7f3b26f..4404e47 100644
--- a/tests/unit/test_semantic_retrieval.py
+++ b/tests/unit/test_semantic_retrieval.py
@@ -461,3 +461,51 @@ def test_retrieve_artifact_scoped_semantic_artifact_chunk_records_returns_empty_
         "query_vector": [0.0, 1.0, 0.0],
         "limit": 5,
     }
+
+
+def test_retrieve_task_scoped_semantic_artifact_chunk_records_infers_docx_media_type_without_hint() -> None:
+    store = SemanticRetrievalStoreStub()
+    config_id = seed_config(store, dimensions=3)
+    task_id = seed_task(store)
+    artifact_id = seed_artifact(
+        store,
+        task_id=task_id,
+        relative_path="docs/spec.docx",
+        media_type_hint=None,
+    )
+    docx_row = semantic_artifact_row(
+        store,
+        task_id=task_id,
+        task_artifact_id=artifact_id,
+        relative_path="docs/spec.docx",
+        score=0.9,
+        sequence_no=1,
+    )
+    docx_row["media_type_hint"] = None
+    store.task_artifact_retrieval_rows = [docx_row]
+
+    payload = retrieve_task_scoped_semantic_artifact_chunk_records(
+        store,  # type: ignore[arg-type]
+        user_id=uuid4(),
+        request=TaskScopedSemanticArtifactChunkRetrievalInput(
+            task_id=task_id,
+            embedding_config_id=config_id,
+            query_vector=(1.0, 0.0, 0.0),
+            limit=1,
+        ),
+    )
+
+    assert payload["items"] == [
+        {
+            "id": str(docx_row["id"]),
+            "task_id": str(task_id),
+            "task_artifact_id": str(artifact_id),
+            "relative_path": "docs/spec.docx",
+            "media_type": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+            "sequence_no": 1,
+            "char_start": 0,
+            "char_end_exclusive": 11,
+            "text": "docs/spec.docx-chunk",
+            "score": 0.9,
+        }
+    ]
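
Reviewer note (sketch, not part of the patch): the unit tests above pin the extraction behavior the seam must implement — paragraph texts joined with "\n", "is not a valid DOCX" for a bad package or unparseable word/document.xml, and "does not contain extractable DOCX text" for empty output — but the extractor itself sits outside these hunks. The following is a minimal sketch consistent with those tests; extract_docx_text and the ValueError messages are hypothetical stand-ins, not the shipped module.

import io
import zipfile
from xml.etree import ElementTree

# WordprocessingML namespace used by word/document.xml.
_W = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"


def extract_docx_text(payload: bytes) -> str:
    # Hypothetical sketch: read the main document part and parse it as XML.
    try:
        with zipfile.ZipFile(io.BytesIO(payload)) as archive:
            document_xml = archive.read("word/document.xml")
        root = ElementTree.fromstring(document_xml)
    except (zipfile.BadZipFile, KeyError, ElementTree.ParseError) as exc:
        raise ValueError("is not a valid DOCX") from exc
    # One output line per <w:p>, concatenating every <w:t> run inside it.
    lines = [
        "".join(node.text or "" for node in paragraph.iter(f"{_W}t"))
        for paragraph in root.iter(f"{_W}p")
    ]
    text = "\n".join(lines)
    if not text.strip():
        raise ValueError("does not contain extractable DOCX text")
    return text

Fed the _build_docx_bytes(["A" * 998, "B" * 5, "C"]) fixture, a function of this shape yields the 1006-character string whose 1000/6 split the deterministic-chunk test asserts.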