From d29e913e6ee7d8000ac4e0547a2d31549d12a654 Mon Sep 17 00:00:00 2001 From: Sami Rusani Date: Mon, 16 Mar 2026 12:11:01 +0100 Subject: [PATCH 1/2] Sprint 5N: RFC822 email artifact parsing packet --- .ai/active/SPRINT_PACKET.md | 91 +++++++++++++++++++------------------ 1 file changed, 48 insertions(+), 43 deletions(-) diff --git a/.ai/active/SPRINT_PACKET.md b/.ai/active/SPRINT_PACKET.md index eb2e402..53a0289 100644 --- a/.ai/active/SPRINT_PACKET.md +++ b/.ai/active/SPRINT_PACKET.md @@ -2,7 +2,7 @@ ## Sprint Title -Sprint 5M: DOCX Artifact Parsing V0 +Sprint 5N: RFC822 Email Artifact Parsing V0 ## Sprint Type @@ -10,15 +10,15 @@ feature ## Sprint Reason -Sprint 5L proved the richer-document-parsing seam can widen safely without changing the rooted workspace, durable chunk, retrieval, or compile contracts. The next safe slice is DOCX ingestion only, not broader PDF compatibility, OCR, connectors, or UI. +Sprint 5L and Sprint 5M proved the richer-document-parsing seam can widen safely without changing the rooted workspace, durable chunk, retrieval, or compile contracts. The next safe slice is RFC822 email ingestion only, which prepares the path for later read-only Gmail work without opening live connector, auth, or UI scope yet. ## Sprint Intent -Extend the existing artifact-ingestion seam so registered DOCX artifacts can be ingested into the existing durable `task_artifact_chunks` substrate through deterministic local text extraction, without changing retrieval contracts, compile contracts, connectors, or UI. +Extend the existing artifact-ingestion seam so registered RFC822 email artifacts can be ingested into the existing durable `task_artifact_chunks` substrate through deterministic local parsing of message headers and text bodies, without changing retrieval contracts, compile contracts, live connector scope, or UI. 
## Git Instructions -- Branch Name: `codex/sprint-5m-docx-artifact-parsing-v0` +- Branch Name: `codex/sprint-5n-rfc822-email-artifact-parsing-v0` - Base Branch: `main` - PR Strategy: one sprint branch, one PR, no stacked PRs unless Control Tower explicitly opens a follow-up sprint - Merge Policy: squash merge only after reviewer `PASS` and explicit Control Tower merge approval @@ -29,100 +29,105 @@ Extend the existing artifact-ingestion seam so registered DOCX artifacts can be - Sprint 5C shipped explicit task-artifact registration. - Sprint 5D shipped deterministic local text-artifact ingestion into durable chunk rows. - Sprint 5E through 5J shipped lexical retrieval, semantic retrieval, and hybrid compile-path artifact retrieval on top of those persisted chunk rows. -- Sprint 5L extended the same ingestion seam to narrow PDF text extraction without changing retrieval or compile contracts. -- The next narrow richer-document move is a separate DOCX ingestion seam, which increases format coverage without widening into OCR, connector, or UI scope. +- Sprint 5L extended the same ingestion seam to narrow PDF text extraction. +- Sprint 5M extended the same ingestion seam to narrow DOCX text extraction. +- The next narrow richer-document move is RFC822 email parsing, which advances the Gmail-adjacent path while still staying on the existing rooted artifact and chunk substrate instead of opening a live connector. 
## In Scope -- Extend schema and contracts only as narrowly needed to support DOCX ingestion metadata, for example: +- Extend schema and contracts only as narrowly needed to support RFC822 ingestion metadata, for example: - `task_artifacts.ingestion_status` reuse if no new status is required - optional deterministic extraction metadata on artifact detail or ingestion responses if needed - Define typed contracts for: - - DOCX artifact-ingestion requests if they differ from the current generic artifact-ingestion path - - artifact-ingestion responses updated for DOCX extraction metadata if needed - - artifact detail or chunk summary metadata updated for DOCX ingestion if needed + - email artifact-ingestion requests if they differ from the current generic artifact-ingestion path + - artifact-ingestion responses updated for email extraction metadata if needed + - artifact detail or chunk summary metadata updated for email ingestion if needed - Extend the existing ingestion seam so it: - - accepts already-registered visible DOCX artifacts + - accepts already-registered visible RFC822 email artifacts - resolves rooted local file paths from persisted workspace plus artifact relative path - - supports one explicit DOCX extraction path only - - extracts deterministic text from DOCX package contents without OCR or image extraction + - supports one explicit local email parsing path only + - parses deterministic text from message headers plus plain-text body parts + - handles multipart messages narrowly and predictably + - rejects unsupported body forms when no extractable text body is present - normalizes extracted text before chunking - persists ordered chunk rows into the existing `task_artifact_chunks` table - updates artifact ingestion status deterministically - Add unit and integration tests for: - - supported DOCX ingestion - - deterministic chunk ordering and chunk boundaries from extracted DOCX text - - rooted path enforcement during DOCX ingestion - - rejection of 
malformed or textless DOCX files when no extractable text is present + - supported RFC822 ingestion + - deterministic chunk ordering and chunk boundaries from extracted email text + - rooted path enforcement during email ingestion + - rejection of malformed or textless email artifacts when no extractable text is present - per-user isolation - stable response shape ## Out of Scope -- No broader PDF compatibility work. +- No live Gmail API or OAuth work. +- No Calendar connector scope. +- No HTML-to-text rendering beyond a narrow explicit rule if strictly needed. +- No attachment extraction. - No OCR. -- No image extraction from DOCX. - No changes to lexical retrieval contracts. - No changes to semantic retrieval contracts. - No compile contract changes. -- No Gmail or Calendar connector scope. - No runner-style orchestration. - No UI work. ## Required Deliverables -- Narrow ingestion support for visible DOCX artifacts using the existing artifact and chunk seams. -- Stable contract updates only where DOCX extraction metadata is necessary. -- Unit and integration coverage for DOCX extraction, rooted-path safety, deterministic chunk persistence, and isolation. +- Narrow ingestion support for visible RFC822 email artifacts using the existing artifact and chunk seams. +- Stable contract updates only where email extraction metadata is necessary. +- Unit and integration coverage for email extraction, rooted-path safety, deterministic chunk persistence, and isolation. - Updated `BUILD_REPORT.md` with exact verification results and explicit deferred scope. ## Acceptance Criteria -- A client can ingest one supported visible DOCX artifact into durable ordered chunk rows using the existing artifact-ingestion seam. -- DOCX ingestion reads only files rooted under the persisted task workspace boundary. -- Extracted text is normalized and chunked deterministically into the existing `task_artifact_chunks` contract. 
-- Malformed or textless DOCX files are rejected deterministically rather than silently producing misleading chunks. +- A client can ingest one supported visible RFC822 email artifact into durable ordered chunk rows using the existing artifact-ingestion seam. +- Email ingestion reads only files rooted under the persisted task workspace boundary. +- Extracted email text is normalized and chunked deterministically into the existing `task_artifact_chunks` contract. +- Malformed or textless email artifacts are rejected deterministically rather than silently producing misleading chunks. - Existing lexical, semantic, and hybrid artifact retrieval contracts continue to operate over the persisted chunk rows without contract changes. - `./.venv/bin/python -m pytest tests/unit` passes. - `./.venv/bin/python -m pytest tests/integration` passes. -- No PDF-compatibility expansion, OCR, connector, runner, compile-contract, or UI scope enters the sprint. +- No live Gmail connector, Calendar connector, OAuth, attachment extraction, compile-contract, runner, or UI scope enters the sprint. ## Implementation Constraints - Keep richer parsing narrow and boring. -- Reuse the existing rooted `task_workspaces`, `task_artifacts`, and `task_artifact_chunks` seams rather than creating a parallel document store. -- Support DOCX text extraction only; do not introduce OCR, image extraction, or document-layout reconstruction in the same sprint. +- Reuse the existing rooted `task_workspaces`, `task_artifacts`, and `task_artifact_chunks` seams rather than creating a parallel email store. +- Support deterministic local RFC822 parsing only; do not introduce live connector behavior in the same sprint. +- Prefer plain-text body extraction; if multipart handling is needed, keep the accepted body selection rule explicit and deterministic. - Preserve existing retrieval and compile contracts by feeding the already-shipped chunk substrate. 
-- Keep extraction and chunking deterministic and testable from local files alone. ## Suggested Work Breakdown -1. Define any minimal DOCX-ingestion contract updates needed. -2. Implement deterministic rooted DOCX text extraction in the existing artifact-ingestion seam. -3. Normalize extracted text and persist ordered chunk rows into the existing chunk store. -4. Add deterministic failure behavior for malformed or textless DOCX files. +1. Define any minimal RFC822-ingestion contract updates needed. +2. Implement deterministic rooted email parsing in the existing artifact-ingestion seam. +3. Normalize extracted email text and persist ordered chunk rows into the existing chunk store. +4. Add deterministic failure behavior for malformed or textless email artifacts. 5. Add unit and integration tests. 6. Update `BUILD_REPORT.md` with executed verification. ## Build Report Requirements `BUILD_REPORT.md` must include: -- the exact DOCX-ingestion contract changes introduced, if any -- the DOCX extraction path and chunking rule used +- the exact RFC822-ingestion contract changes introduced, if any +- the email extraction path and chunking rule used +- the header/body selection rule used - exact commands run - unit and integration test results -- one example DOCX artifact-ingestion response -- one example chunk list response produced from a DOCX artifact +- one example email artifact-ingestion response +- one example chunk list response produced from an email artifact - what remains intentionally deferred to later milestones ## Review Focus `REVIEW_REPORT.md` should verify: -- the sprint stayed limited to DOCX artifact parsing through the existing ingestion seam -- DOCX ingestion reuses the existing rooted workspace, artifact, and chunk contracts +- the sprint stayed limited to RFC822 email artifact parsing through the existing ingestion seam +- email ingestion reuses the existing rooted workspace, artifact, and chunk contracts - extraction determinism, chunk ordering, 
rooted-path safety, and isolation are test-backed -- no hidden PDF-compatibility expansion, OCR, connector, runner, compile-contract, or UI scope entered the sprint +- no hidden live Gmail connector, Calendar connector, OAuth, attachment extraction, compile-contract, runner, or UI scope entered the sprint ## Exit Condition -This sprint is complete when the repo can ingest supported visible DOCX artifacts into deterministic durable chunk rows through the existing artifact-ingestion seam, verify the full path with Postgres-backed tests, and still defer broader document parsing, connectors, and UI. +This sprint is complete when the repo can ingest supported visible RFC822 email artifacts into deterministic durable chunk rows through the existing artifact-ingestion seam, verify the full path with Postgres-backed tests, and still defer live connector work, broader email handling, and UI. From 66b8e7dad040cd087e7cdf86efc7655287e89668 Mon Sep 17 00:00:00 2001 From: Sami Rusani Date: Mon, 16 Mar 2026 13:13:05 +0100 Subject: [PATCH 2/2] Sprint 5N: RFC822 email artifact parsing v0 --- ARCHITECTURE.md | 25 +- BUILD_REPORT.md | 139 +++-- REVIEW_REPORT.md | 24 +- apps/api/src/alicebot_api/artifacts.py | 107 ++++ .../src/alicebot_api/semantic_retrieval.py | 1 + tests/integration/test_task_artifacts_api.py | 500 +++++++++++++++++- tests/unit/test_artifacts.py | 441 ++++++++++++++- tests/unit/test_artifacts_main.py | 6 +- tests/unit/test_semantic_retrieval.py | 48 ++ 9 files changed, 1206 insertions(+), 85 deletions(-) diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index faf17a5..acb2a51 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -2,16 +2,16 @@ ## Current Implemented Slice -AliceBot now implements the accepted repo slice through Sprint 5M. The shipped backend includes: +AliceBot now implements the accepted repo slice through Sprint 5N. 
The shipped backend includes: - foundation continuity storage over `users`, `threads`, `sessions`, and append-only `events` - deterministic tracing and context compilation over durable continuity, memory, entity, and entity-edge records - governed memory admission, explicit-preference extraction, memory review labels, review queue reads, evaluation summary reads, explicit embedding config and memory-embedding storage, direct semantic retrieval, and deterministic hybrid compile-path memory merge - deterministic prompt assembly and one no-tools response path that persists assistant replies as immutable continuity events - user-scoped consents, policies, policy evaluation, tool registry, allowlist evaluation, tool routing, approval request persistence, approval resolution, approved-only proxy execution through the in-process `proxy.echo` handler, durable execution review, and execution-budget lifecycle plus enforcement -- durable `tasks`, `task_steps`, `task_workspaces`, `task_artifacts`, `task_artifact_chunks`, and `task_artifact_chunk_embeddings`, deterministic task-step sequencing, explicit task-step transitions, explicit manual continuation with lineage through `parent_step_id`, `source_approval_id`, and `source_execution_id`, explicit `tool_executions.task_step_id` linkage for execution synchronization, deterministic rooted local task-workspace provisioning, explicit rooted local artifact registration, deterministic local plain-text, markdown, narrow PDF text, and narrow DOCX text ingestion into durable chunk rows, deterministic lexical artifact-chunk retrieval over durable chunk rows, explicit user-scoped artifact-chunk embedding persistence tied to existing embedding configs, explicit task-scoped or artifact-scoped semantic artifact-chunk retrieval over those durable embeddings, and compile-path artifact retrieval that can include lexical results, semantic results, or one deterministic hybrid lexical-plus-semantic merged artifact section with per-chunk source 
provenance +- durable `tasks`, `task_steps`, `task_workspaces`, `task_artifacts`, `task_artifact_chunks`, and `task_artifact_chunk_embeddings`, deterministic task-step sequencing, explicit task-step transitions, explicit manual continuation with lineage through `parent_step_id`, `source_approval_id`, and `source_execution_id`, explicit `tool_executions.task_step_id` linkage for execution synchronization, deterministic rooted local task-workspace provisioning, explicit rooted local artifact registration, deterministic local plain-text, markdown, narrow PDF text, narrow DOCX text, and narrow RFC822 email text ingestion into durable chunk rows, deterministic lexical artifact-chunk retrieval over durable chunk rows, explicit user-scoped artifact-chunk embedding persistence tied to existing embedding configs, explicit task-scoped or artifact-scoped semantic artifact-chunk retrieval over those durable embeddings, and compile-path artifact retrieval that can include lexical results, semantic results, or one deterministic hybrid lexical-plus-semantic merged artifact section with per-chunk source provenance -The current multi-step boundary is narrow and explicit. Manual continuation is implemented and review-passed. Approval resolution and proxy execution now both use explicit task-step linkage rather than first-step inference. Task workspaces are now implemented only as deterministic rooted local boundaries, and task artifacts are now implemented only as explicit rooted local-file registrations, narrow deterministic artifact ingestion under those workspaces, lexical retrieval over persisted chunk rows, explicit artifact-chunk embedding storage tied to existing embedding configs, direct semantic retrieval over those durable artifact-chunk embeddings for one visible task or one visible artifact at a time, and compile-path artifact retrieval that deterministically merges lexical and semantic candidates into one artifact section when both are requested for the same scope. 
The live richer-document boundary is still intentionally narrow: plain text and markdown ingest directly, PDF support is limited to narrow local text extraction, and DOCX support is limited to narrow local text extraction from `word/document.xml`; OCR, image extraction, layout reconstruction, connectors, reranking beyond the current lexical-first hybrid merge, and new side-effect surfaces are still planned later and must not be described as live behavior. +The current multi-step boundary is narrow and explicit. Manual continuation is implemented and review-passed. Approval resolution and proxy execution now both use explicit task-step linkage rather than first-step inference. Task workspaces are now implemented only as deterministic rooted local boundaries, and task artifacts are now implemented only as explicit rooted local-file registrations, narrow deterministic artifact ingestion under those workspaces, lexical retrieval over persisted chunk rows, explicit artifact-chunk embedding storage tied to existing embedding configs, direct semantic retrieval over those durable artifact-chunk embeddings for one visible task or one visible artifact at a time, and compile-path artifact retrieval that deterministically merges lexical and semantic candidates into one artifact section when both are requested for the same scope. The live richer-document boundary is still intentionally narrow: plain text and markdown ingest directly, PDF support is limited to narrow local text extraction, DOCX support is limited to narrow local text extraction from `word/document.xml`, and RFC822 email support is limited to top-level selected headers plus extractable plain-text body content while excluding nested `message/rfc822` content; OCR, image extraction, layout reconstruction, connectors, reranking beyond the current lexical-first hybrid merge, and new side-effect surfaces are still planned later and must not be described as live behavior. 
## Implemented Now @@ -62,7 +62,7 @@ The current multi-step boundary is narrow and explicit. Manual continuation is i - `apps/web`: minimal shell only; no shipped workflow UI. - `workers`: scaffold only; no background jobs or runner logic are implemented. - `infra`: local development bootstrap assets only. -- `tests`: unit and Postgres-backed integration coverage for the shipped seams above, including Sprint 4O task-step lineage/manual continuation, Sprint 4S step-linked execution synchronization, Sprint 5A task-workspace provisioning, Sprint 5C task-artifact registration, Sprint 5D local artifact ingestion plus chunk reads, Sprint 5E lexical artifact-chunk retrieval, Sprint 5F compile-path artifact chunk integration, Sprint 5G artifact-chunk embedding persistence and reads, Sprint 5H direct semantic artifact-chunk retrieval, Sprint 5I compile-path semantic artifact retrieval, Sprint 5J deterministic hybrid lexical-plus-semantic artifact merge in compile, Sprint 5L narrow PDF artifact ingestion, and Sprint 5M narrow DOCX artifact ingestion. +- `tests`: unit and Postgres-backed integration coverage for the shipped seams above, including Sprint 4O task-step lineage/manual continuation, Sprint 4S step-linked execution synchronization, Sprint 5A task-workspace provisioning, Sprint 5C task-artifact registration, Sprint 5D local artifact ingestion plus chunk reads, Sprint 5E lexical artifact-chunk retrieval, Sprint 5F compile-path artifact chunk integration, Sprint 5G artifact-chunk embedding persistence and reads, Sprint 5H direct semantic artifact-chunk retrieval, Sprint 5I compile-path semantic artifact retrieval, Sprint 5J deterministic hybrid lexical-plus-semantic artifact merge in compile, Sprint 5L narrow PDF artifact ingestion, Sprint 5M narrow DOCX artifact ingestion, and Sprint 5N narrow RFC822 email artifact ingestion. ## Core Flows Implemented Now @@ -198,17 +198,18 @@ The current multi-step boundary is narrow and explicit. Manual continuation is i 1. 
Accept a user-scoped `POST /v0/task-artifacts/{task_artifact_id}/ingest` request for one visible registered artifact. 2. Lock ingestion for that artifact before deciding whether work is needed. 3. Resolve the persisted workspace `local_path` plus persisted artifact `relative_path`, and reject any rooted-path escape deterministically. -4. Support only the current narrow explicit set: `text/plain`, `text/markdown`, narrow local `application/pdf` text extraction, and narrow local `application/vnd.openxmlformats-officedocument.wordprocessingml.document` text extraction from `word/document.xml`. +4. Support only the current narrow explicit set: `text/plain`, `text/markdown`, narrow local `application/pdf` text extraction, narrow local `application/vnd.openxmlformats-officedocument.wordprocessingml.document` text extraction from `word/document.xml`, and narrow local `message/rfc822` extraction. 5. For plain text and markdown, read file bytes deterministically and require valid UTF-8 text. 6. For PDFs, extract only narrow local text content; OCR, image extraction, and broader PDF compatibility remain out of scope. 7. For DOCX, extract only narrow local text from `word/document.xml`; OCR, image extraction, headers/footers/comments expansion, and layout reconstruction remain out of scope. -8. Reject malformed or textless richer-document inputs deterministically instead of producing misleading chunks. -9. Normalize line endings by rewriting `\r\n` and `\r` to `\n`. -10. Chunk normalized text deterministically with rule `normalized_utf8_text_fixed_window_1000_chars_v1`. -11. Persist ordered `task_artifact_chunks` rows with `sequence_no`, `char_start`, `char_end_exclusive`, and `text`. -12. Update the parent artifact to `ingestion_status = ingested`. -13. If the artifact is already ingested, return the existing artifact and chunk summary without reinserting chunks. -14. 
`GET /v0/task-artifacts/{task_artifact_id}/chunks` returns visible chunk rows in deterministic `sequence_no ASC, id ASC` order plus stable summary metadata. +8. For RFC822 email, extract only the selected top-level headers plus extractable plain-text body content; nested `message/rfc822` content, HTML rendering, and attachment extraction remain out of scope. +9. Reject malformed or textless richer-document inputs deterministically instead of producing misleading chunks. +10. Normalize line endings by rewriting `\r\n` and `\r` to `\n`. +11. Chunk normalized text deterministically with rule `normalized_utf8_text_fixed_window_1000_chars_v1`. +12. Persist ordered `task_artifact_chunks` rows with `sequence_no`, `char_start`, `char_end_exclusive`, and `text`. +13. Update the parent artifact to `ingestion_status = ingested`. +14. If the artifact is already ingested, return the existing artifact and chunk summary without reinserting chunks. +15. `GET /v0/task-artifacts/{task_artifact_id}/chunks` returns visible chunk rows in deterministic `sequence_no ASC, id ASC` order plus stable summary metadata. ### Artifact Chunk Retrieval diff --git a/BUILD_REPORT.md b/BUILD_REPORT.md index 74a1fd5..eca9fad 100644 --- a/BUILD_REPORT.md +++ b/BUILD_REPORT.md @@ -2,69 +2,93 @@ ## sprint objective -Implement narrow DOCX artifact parsing on the existing artifact-ingestion seam so already-registered visible DOCX artifacts can be ingested into durable `task_artifact_chunks` rows without changing retrieval contracts, compile contracts, connectors, or UI. +Implement narrow RFC822 email artifact parsing on the existing artifact-ingestion seam so already-registered visible RFC822 email artifacts can be ingested into durable `task_artifact_chunks` rows without changing retrieval contracts, compile contracts, connectors, or UI. 
## completed work -- Extended the existing artifact media-type support to accept DOCX artifacts: - - media type: `application/vnd.openxmlformats-officedocument.wordprocessingml.document` - - extension inference: `.docx` -- Implemented deterministic local DOCX text extraction in `apps/api/src/alicebot_api/artifacts.py` by: - - opening local DOCX bytes as a ZIP package - - reading `word/document.xml` only - - parsing WordprocessingML locally with `xml.etree.ElementTree` - - extracting paragraph text in document order from `w:t` - - preserving explicit DOCX tabs and line breaks via `w:tab`, `w:br`, and `w:cr` - - joining non-empty paragraphs with `\n` - - rejecting malformed packages/XML as invalid DOCX - - rejecting textless DOCX files when no extractable text is present +- Extended the existing artifact media-type support to accept RFC822 email artifacts: + - media type: `message/rfc822` + - extension inference: `.eml` +- Implemented deterministic local RFC822 parsing in `apps/api/src/alicebot_api/artifacts.py` by: + - parsing email bytes locally with Python's standard-library email parser under `raise_on_defect=True` + - extracting a narrow header block from top-level headers only + - extracting plain-text body content from `text/plain` leaf parts only + - excluding nested `message/rfc822` payloads from body extraction + - rejecting malformed RFC822 payloads deterministically + - rejecting textless or unsupported-body emails when no extractable plain-text body exists - Reused the existing ingestion seam after extraction: - rooted workspace path enforcement remains unchanged - normalization still runs through `normalize_artifact_text()` - chunk persistence still targets `task_artifact_chunks` - ingestion status still transitions from `pending` to `ingested` on success -- Kept retrieval/compile contracts unchanged while updating extension-based media-type inference for semantic artifact retrieval so `.docx` artifacts remain typed consistently when `media_type_hint` 
is absent. +- Kept request, response, schema, retrieval, and compile contracts unchanged while updating extension-based media-type inference for semantic artifact retrieval so `.eml` artifacts remain typed consistently when `media_type_hint` is absent. - Added unit coverage for: - - deterministic DOCX chunk persistence + - deterministic RFC822 chunk persistence + - multipart plain-text-part selection while ignoring HTML and attachments + - nested-email exclusion for encapsulated `message/rfc822` parts - stable unsupported-media validation text - - textless DOCX rejection - - malformed DOCX rejection - - rooted DOCX path enforcement + - textless RFC822 rejection + - malformed RFC822 rejection + - rooted RFC822 path enforcement - Added integration coverage for: - - supported DOCX ingestion with stable response shape - - deterministic DOCX chunk ordering and boundaries - - per-user isolation for DOCX ingestion/chunk listing - - textless DOCX rejection - - malformed DOCX rejection - - rooted-path enforcement during DOCX ingestion + - supported RFC822 ingestion with stable response shape + - deterministic RFC822 chunk ordering and boundaries + - per-user isolation for RFC822 ingestion and chunk listing + - nested-email exclusion for encapsulated `message/rfc822` parts + - textless RFC822 rejection + - malformed RFC822 rejection + - rooted-path enforcement during RFC822 ingestion -## exact DOCX-ingestion contract changes introduced +## exact RFC822-ingestion contract changes introduced - No request contract changes. - No response shape changes. - No schema changes. -- Existing artifact-ingestion behavior now additionally accepts `application/vnd.openxmlformats-officedocument.wordprocessingml.document`. -- Extension-based media-type inference now recognizes `.docx` for the existing artifact and semantic-retrieval response paths. +- Existing artifact-ingestion behavior now additionally accepts `message/rfc822`. 
+- Extension-based media-type inference now recognizes `.eml` for the existing artifact and semantic-retrieval response paths. -## DOCX extraction path and chunking rule used +## email extraction path and chunking rule used - Extraction path: - existing `POST /v0/task-artifacts/{task_artifact_id}/ingest` - resolve persisted workspace `local_path` plus persisted artifact `relative_path` - enforce rooted workspace boundary before any read - - read the local DOCX package from disk - - extract text from `word/document.xml` only - - emit paragraph-ordered text from `w:t`, `w:tab`, `w:br`, and `w:cr` - - reject invalid or textless DOCX artifacts deterministically + - read the local RFC822 email from disk + - parse it locally with the standard-library email parser + - extract a deterministic header block plus plain-text body text while excluding nested encapsulated emails + - reject invalid or textless RFC822 artifacts deterministically - Chunking rule: - normalize extracted text with CRLF/CR to LF conversion - split into fixed windows of 1000 characters - persist ordered rows with `sequence_no`, `char_start`, `char_end_exclusive`, and `text` - reported chunking rule string: `normalized_utf8_text_fixed_window_1000_chars_v1` +## header/body selection rule used + +- Header rule: + - include only these top-level headers, in this order, when present and non-empty: + - `From` + - `To` + - `Cc` + - `Bcc` + - `Reply-To` + - `Subject` + - `Date` + - `Message-ID` + - normalize header whitespace by collapsing internal whitespace runs to single spaces +- Body rule: + - recurse through multipart body structure only + - include only leaf `text/plain` parts + - skip multipart containers as extracted content + - skip parts marked as attachments + - skip parts with filenames + - skip nested descendant `message/*` parts, including `message/rfc822` + - strip each selected part and join non-empty body parts with a blank line + - reject the artifact if no extractable plain-text body part 
remains + ## incomplete work -- None within Sprint 5M scope. +- None within Sprint 5N scope. ## files changed @@ -72,29 +96,31 @@ Implement narrow DOCX artifact parsing on the existing artifact-ingestion seam s - `apps/api/src/alicebot_api/semantic_retrieval.py` - `tests/unit/test_artifacts.py` - `tests/unit/test_artifacts_main.py` +- `tests/unit/test_semantic_retrieval.py` - `tests/integration/test_task_artifacts_api.py` +- `ARCHITECTURE.md` - `BUILD_REPORT.md` ## tests run -- `./.venv/bin/python -m pytest tests/unit/test_artifacts.py tests/unit/test_artifacts_main.py` - - Result: `44 passed in 0.43s` +- `./.venv/bin/python -m pytest tests/unit/test_artifacts.py tests/unit/test_semantic_retrieval.py tests/unit/test_artifacts_main.py` + - Result: `59 passed in 0.31s` - `./.venv/bin/python -m pytest tests/integration/test_task_artifacts_api.py` - - Result: blocked in the sandbox because local Postgres access was denied (`psycopg.OperationalError: ... Operation not permitted`) + - Result: `15 passed in 5.08s` - `./.venv/bin/python -m pytest tests/unit` - - Result: `386 passed in 0.63s` + - Result: `394 passed in 0.61s` - `./.venv/bin/python -m pytest tests/integration` - - Result: `123 passed in 36.27s` + - Result: `127 passed in 37.01s` ## unit and integration test results - Unit suite passed in full. - Integration suite passed in full against the Postgres-backed test path. -- The DOCX-specific API coverage is included in the passing `tests/integration/test_task_artifacts_api.py` module. +- The RFC822-specific API coverage now includes nested-email exclusion and is included in the passing `tests/integration/test_task_artifacts_api.py` module. 
-## one example DOCX artifact-ingestion response +## one example email artifact-ingestion response -Example verified by `test_task_artifact_docx_ingestion_and_chunk_endpoints_are_deterministic_and_isolated`: +Example verified by `test_task_artifact_rfc822_ingestion_and_chunk_endpoints_are_deterministic_and_isolated`: ```json { @@ -104,24 +130,24 @@ Example verified by `test_task_artifact_docx_ingestion_and_chunk_endpoints_are_d "task_workspace_id": "", "status": "registered", "ingestion_status": "ingested", - "relative_path": "docs/spec.docx", - "media_type_hint": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "relative_path": "mail/update.eml", + "media_type_hint": "message/rfc822", "created_at": "", "updated_at": "" }, "summary": { "total_count": 2, "total_characters": 1006, - "media_type": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "media_type": "message/rfc822", "chunking_rule": "normalized_utf8_text_fixed_window_1000_chars_v1", "order": ["sequence_no_asc", "id_asc"] } } ``` -## one example chunk list response produced from a DOCX artifact +## one example chunk list response produced from an email artifact -Example verified by `test_task_artifact_docx_ingestion_and_chunk_endpoints_are_deterministic_and_isolated`: +Example verified by `test_task_artifact_rfc822_ingestion_and_chunk_endpoints_are_deterministic_and_isolated`: ```json { @@ -132,7 +158,7 @@ Example verified by `test_task_artifact_docx_ingestion_and_chunk_endpoints_are_d "sequence_no": 1, "char_start": 0, "char_end_exclusive": 1000, - "text": "<998 times 'A'>\nB", + "text": "From: Alice \nTo: Bob \nSubject: Sprint Update\n\n<916 times 'A'>\nB", "created_at": "", "updated_at": "" }, @@ -150,7 +176,7 @@ Example verified by `test_task_artifact_docx_ingestion_and_chunk_endpoints_are_d "summary": { "total_count": 2, "total_characters": 1006, - "media_type": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + 
"media_type": "message/rfc822", "chunking_rule": "normalized_utf8_text_fixed_window_1000_chars_v1", "order": ["sequence_no_asc", "id_asc"] } @@ -160,22 +186,21 @@ Example verified by `test_task_artifact_docx_ingestion_and_chunk_endpoints_are_d ## blockers/issues - No implementation blockers remained. -- The first direct integration-test attempt from the sandbox could not reach local Postgres; rerunning the required integration suite with elevated local access succeeded. ## what remains intentionally deferred to later milestones -- broader PDF compatibility work +- live Gmail connector work +- OAuth +- Calendar connector work +- HTML-to-text rendering beyond the current explicit plain-text-only rule +- attachment extraction - OCR -- image extraction from DOCX -- document-layout reconstruction -- headers/footers/comments/track-changes-specific DOCX extraction expansion -- connector work -- runner-style orchestration - retrieval-contract changes -- semantic-contract changes +- semantic-retrieval-contract changes - compile-contract changes +- runner-style orchestration - UI work ## recommended next step -If richer document support is needed later, open a separate sprint for either broader DOCX coverage beyond `word/document.xml` or broader PDF compatibility, but keep both on the existing rooted artifact/chunk seam. +If the product needs to move from local RFC822 files to inbox access, open a separate sprint for read-only Gmail connector and auth work while keeping this extracted-text-to-chunk seam unchanged. diff --git a/REVIEW_REPORT.md b/REVIEW_REPORT.md index 00d76b3..e1ddfdb 100644 --- a/REVIEW_REPORT.md +++ b/REVIEW_REPORT.md @@ -1,27 +1,27 @@ verdict: PASS criteria met -- Sprint 5M remains within the existing artifact-ingestion seam. 
The runtime changes are still limited to DOCX ingestion in `apps/api/src/alicebot_api/artifacts.py` plus the narrow `.docx` media-type fallback in `apps/api/src/alicebot_api/semantic_retrieval.py`; no connector, runner, compile-contract, or UI scope entered the sprint. -- DOCX ingestion still reuses the rooted `task_workspaces`, `task_artifacts`, and `task_artifact_chunks` seams without schema or response-shape changes. -- Rooted-path safety, deterministic chunk persistence, malformed/textless DOCX rejection, and per-user isolation remain covered by the unit and Postgres-backed integration tests already reviewed. -- The previously missing regression coverage is now present in `tests/unit/test_semantic_retrieval.py`: a `.docx` artifact with `media_type_hint=None` is exercised directly, and the semantic retrieval response is asserted to infer `application/vnd.openxmlformats-officedocument.wordprocessingml.document`. -- `ARCHITECTURE.md` now matches the shipped slice: it reports scope through Sprint 5M, describes the narrow PDF and DOCX ingestion boundary accurately, and keeps OCR/image/layout work explicitly deferred. +- Sprint 5N stays within the existing artifact-ingestion seam. The runtime changes remain limited to RFC822 ingestion in `apps/api/src/alicebot_api/artifacts.py` plus `.eml` media-type inference in `apps/api/src/alicebot_api/semantic_retrieval.py`; no live Gmail, Calendar, OAuth, runner, compile-contract, or UI scope entered the sprint. +- RFC822 ingestion reuses the existing rooted `task_workspaces`, `task_artifacts`, and `task_artifact_chunks` seams without schema or response-shape changes. +- Rooted-path safety, deterministic chunk persistence, malformed/textless RFC822 rejection, per-user isolation, and stable response shapes are covered by unit and Postgres-backed integration tests. +- The prior scope bug is fixed. 
The RFC822 extractor no longer descends into nested `message/*` parts when selecting body text, so encapsulated `message/rfc822` payloads do not contribute persisted chunk text. +- Regression coverage for nested-email exclusion is now present in `tests/unit/test_artifacts.py` and `tests/integration/test_task_artifacts_api.py`. +- `BUILD_REPORT.md` and `ARCHITECTURE.md` now reflect Sprint 5N and describe the RFC822 boundary accurately, including exclusion of nested `message/rfc822` content. - Review verification: -- prior review verification still stands: `./.venv/bin/python -m pytest tests/unit` -> `386 passed in 0.56s` -- prior review verification still stands: `./.venv/bin/python -m pytest tests/integration` -> `123 passed in 38.04s` -- follow-up verification rerun in this review: `./.venv/bin/python -m pytest tests/unit/test_semantic_retrieval.py` -> `8 passed in 0.08s` + - `./.venv/bin/python -m pytest tests/unit` -> `394 passed in 0.64s` + - `./.venv/bin/python -m pytest tests/integration` -> `127 passed in 38.15s` criteria missed - None. quality issues -- No blocking implementation or coverage issues remain for Sprint 5M scope. +- No blocking implementation or test issues remain for Sprint 5N scope. regression risks -- No new regression risks beyond the intentionally narrow richer-document boundaries already documented in the sprint packet and architecture notes. +- Residual risk remains limited to the intentionally narrow richer-document boundary already documented in the sprint packet and architecture notes: HTML rendering, attachment extraction, and live connector behavior are still deferred. docs issues -- None. `BUILD_REPORT.md` and `ARCHITECTURE.md` are consistent with the implemented slice and the review expectations. +- None. `BUILD_REPORT.md` and `ARCHITECTURE.md` are consistent with the implemented slice and the corrected RFC822 extraction rule. should anything be added to RULES.md? - No. @@ -30,4 +30,4 @@ should anything update ARCHITECTURE.md? 
- No further update is required for this sprint. recommended next action -- Accept Sprint 5M as complete and merge after normal approval flow. +- Accept Sprint 5N as complete and merge after normal approval flow. diff --git a/apps/api/src/alicebot_api/artifacts.py b/apps/api/src/alicebot_api/artifacts.py index 734f251..72bab4f 100644 --- a/apps/api/src/alicebot_api/artifacts.py +++ b/apps/api/src/alicebot_api/artifacts.py @@ -1,6 +1,10 @@ from __future__ import annotations import io +from email import policy +from email.errors import MessageDefect, MessageError +from email.message import EmailMessage +from email.parser import BytesParser import re import zlib from dataclasses import dataclass @@ -46,10 +50,12 @@ SUPPORTED_DOCX_ARTIFACT_MEDIA_TYPE = ( "application/vnd.openxmlformats-officedocument.wordprocessingml.document" ) +SUPPORTED_RFC822_ARTIFACT_MEDIA_TYPE = "message/rfc822" SUPPORTED_ARTIFACT_MEDIA_TYPES = ( *SUPPORTED_TEXT_ARTIFACT_MEDIA_TYPES, SUPPORTED_PDF_ARTIFACT_MEDIA_TYPE, SUPPORTED_DOCX_ARTIFACT_MEDIA_TYPE, + SUPPORTED_RFC822_ARTIFACT_MEDIA_TYPE, ) SUPPORTED_ARTIFACT_EXTENSIONS = { ".txt": "text/plain", @@ -58,6 +64,7 @@ ".markdown": "text/markdown", ".pdf": SUPPORTED_PDF_ARTIFACT_MEDIA_TYPE, ".docx": SUPPORTED_DOCX_ARTIFACT_MEDIA_TYPE, + ".eml": SUPPORTED_RFC822_ARTIFACT_MEDIA_TYPE, } TASK_ARTIFACT_CHUNK_MAX_CHARS = 1000 TASK_ARTIFACT_CHUNKING_RULE = "normalized_utf8_text_fixed_window_1000_chars_v1" @@ -148,6 +155,17 @@ _DOCX_BREAK_TAG = f"{{{_DOCX_WORDPROCESSING_NAMESPACE}}}br" _DOCX_CARRIAGE_RETURN_TAG = f"{{{_DOCX_WORDPROCESSING_NAMESPACE}}}cr" _DOCX_BODY_TAG = f"{{{_DOCX_WORDPROCESSING_NAMESPACE}}}body" +_RFC822_EMAIL_PARSE_POLICY = policy.default.clone(raise_on_defect=True) +_RFC822_EXTRACTED_HEADER_NAMES = ( + "From", + "To", + "Cc", + "Bcc", + "Reply-To", + "Subject", + "Date", + "Message-ID", +) @dataclass(frozen=True, slots=True) @@ -327,6 +345,90 @@ def _extract_text_from_docx_artifact_bytes(*, relative_path: str, payload: bytes return 
extracted_text +def _normalize_rfc822_header_value(value: str) -> str: + return re.sub(r"\s+", " ", value).strip() + + +def _parse_rfc822_email(*, relative_path: str, payload: bytes) -> EmailMessage: + try: + message = BytesParser(policy=_RFC822_EMAIL_PARSE_POLICY).parsebytes(payload) + except (MessageDefect, MessageError, ValueError, TypeError) as exc: + raise TaskArtifactValidationError( + f"artifact {relative_path} is not a valid RFC822 email" + ) from exc + return cast(EmailMessage, message) + + +def _extract_rfc822_header_lines(message: EmailMessage) -> list[str]: + header_lines: list[str] = [] + for header_name in _RFC822_EXTRACTED_HEADER_NAMES: + for header_value in message.get_all(header_name, failobj=[]): + normalized_value = _normalize_rfc822_header_value(str(header_value)) + if normalized_value != "": + header_lines.append(f"{header_name}: {normalized_value}") + return header_lines + + +def _is_extractable_rfc822_text_part(part: EmailMessage) -> bool: + if part.is_multipart(): + return False + if part.get_content_type() != "text/plain": + return False + if part.get_content_disposition() == "attachment": + return False + return part.get_filename() is None + + +def _extract_rfc822_part_text(*, relative_path: str, part: EmailMessage) -> str: + try: + payload = part.get_content() + except (MessageError, LookupError, UnicodeError, ValueError, TypeError) as exc: + raise TaskArtifactValidationError( + f"artifact {relative_path} is not a valid RFC822 email" + ) from exc + if not isinstance(payload, str): + raise TaskArtifactValidationError( + f"artifact {relative_path} is not a valid RFC822 email" + ) + return payload.strip() + + +def _iter_extractable_rfc822_text_parts(message: EmailMessage) -> list[EmailMessage]: + if _is_extractable_rfc822_text_part(message): + return [message] + if not message.is_multipart(): + return [] + + extractable_parts: list[EmailMessage] = [] + for child_part in message.iter_parts(): + child_email_part = cast(EmailMessage, 
child_part) + if child_email_part.get_content_maintype() == "message": + continue + extractable_parts.extend(_iter_extractable_rfc822_text_parts(child_email_part)) + return extractable_parts + + +def _extract_text_from_rfc822_artifact_bytes(*, relative_path: str, payload: bytes) -> str: + message = _parse_rfc822_email(relative_path=relative_path, payload=payload) + header_lines = _extract_rfc822_header_lines(message) + body_parts = [ + body_text + for part in _iter_extractable_rfc822_text_parts(message) + if (body_text := _extract_rfc822_part_text(relative_path=relative_path, part=part)) + != "" + ] + if not body_parts: + raise TaskArtifactValidationError( + f"artifact {relative_path} does not contain extractable RFC822 email text" + ) + + sections: list[str] = [] + if header_lines: + sections.append("\n".join(header_lines)) + sections.append("\n\n".join(body_parts)) + return "\n\n".join(sections) + + def _extract_pdf_name(dictionary: bytes, key: bytes) -> bytes | None: match = re.search(rb"/" + re.escape(key) + rb"\s*/([A-Za-z0-9_.#-]+)", dictionary) if match is None: @@ -806,6 +908,11 @@ def extract_artifact_text(*, row: TaskArtifactRow, artifact_path: Path, media_ty relative_path=row["relative_path"], payload=payload, ) + if media_type == SUPPORTED_RFC822_ARTIFACT_MEDIA_TYPE: + return _extract_text_from_rfc822_artifact_bytes( + relative_path=row["relative_path"], + payload=payload, + ) raise TaskArtifactValidationError( f"artifact {row['relative_path']} has unsupported media type {media_type}" ) diff --git a/apps/api/src/alicebot_api/semantic_retrieval.py b/apps/api/src/alicebot_api/semantic_retrieval.py index 5145062..16e9c85 100644 --- a/apps/api/src/alicebot_api/semantic_retrieval.py +++ b/apps/api/src/alicebot_api/semantic_retrieval.py @@ -34,6 +34,7 @@ ".md": "text/markdown", ".markdown": "text/markdown", ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + ".eml": "message/rfc822", } diff --git 
a/tests/integration/test_task_artifacts_api.py b/tests/integration/test_task_artifacts_api.py index 618bc24..2c61dd3 100644 --- a/tests/integration/test_task_artifacts_api.py +++ b/tests/integration/test_task_artifacts_api.py @@ -168,6 +168,116 @@ def _build_docx_bytes( return archive_buffer.getvalue() +def _build_rfc822_email_bytes( + *, + headers: list[tuple[str, str]] | None = None, + plain_body: str | None = None, + plain_parts: list[str] | None = None, + html_body: str | None = None, + attachment_text: str | None = None, + nested_message_bytes: bytes | None = None, + malformed_multipart: bool = False, +) -> bytes: + header_lines = [ + f"{name}: {value}" + for name, value in ( + headers + if headers is not None + else [ + ("From", "Alice "), + ("To", "Bob "), + ("Subject", "Sprint Update"), + ] + ) + ] + if malformed_multipart: + return ( + "\r\n".join( + [ + *header_lines, + "MIME-Version: 1.0", + "Content-Type: multipart/mixed", + "", + "--broken-boundary", + 'Content-Type: text/plain; charset="utf-8"', + "", + "broken", + "--broken-boundary--", + "", + ] + ).encode("utf-8") + ) + + if ( + plain_parts is None + and html_body is None + and attachment_text is None + and nested_message_bytes is None + ): + return ( + "\r\n".join( + [ + *header_lines, + 'Content-Type: text/plain; charset="utf-8"', + "Content-Transfer-Encoding: 8bit", + "", + plain_body or "", + ] + ).encode("utf-8") + ) + + boundary = "alicebot-boundary-001" + lines = [ + *header_lines, + "MIME-Version: 1.0", + f'Content-Type: multipart/mixed; boundary="{boundary}"', + "", + ] + for part_text in plain_parts or []: + lines.extend( + [ + f"--{boundary}", + 'Content-Type: text/plain; charset="utf-8"', + "Content-Transfer-Encoding: 8bit", + "", + part_text, + ] + ) + if html_body is not None: + lines.extend( + [ + f"--{boundary}", + 'Content-Type: text/html; charset="utf-8"', + "Content-Transfer-Encoding: 8bit", + "", + html_body, + ] + ) + if attachment_text is not None: + lines.extend( + [ + 
f"--{boundary}", + 'Content-Type: text/plain; charset="utf-8"', + 'Content-Disposition: attachment; filename="note.txt"', + "Content-Transfer-Encoding: 8bit", + "", + attachment_text, + ] + ) + if nested_message_bytes is not None: + lines.extend( + [ + f"--{boundary}", + "Content-Type: message/rfc822", + "Content-Transfer-Encoding: 8bit", + "", + nested_message_bytes.decode("utf-8"), + ] + ) + lines.extend([f"--{boundary}--", ""]) + return "\r\n".join(lines).encode("utf-8") + + def invoke_request( method: str, path: str, @@ -596,7 +706,8 @@ def test_task_artifact_ingestion_and_chunk_endpoints_are_deterministic_and_isola "detail": ( "artifact docs/manual.bin has unsupported media type application/octet-stream; " "supported types: text/plain, text/markdown, application/pdf, " - "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + "application/vnd.openxmlformats-officedocument.wordprocessingml.document, " + "message/rfc822" ) } @@ -855,6 +966,256 @@ def test_task_artifact_docx_ingestion_and_chunk_endpoints_are_deterministic_and_ } +def test_task_artifact_rfc822_ingestion_and_chunk_endpoints_are_deterministic_and_isolated( + migrated_database_urls, + monkeypatch, + tmp_path, +) -> None: + owner = seed_task(migrated_database_urls["app"], email="owner@example.com") + intruder = seed_task(migrated_database_urls["app"], email="intruder@example.com") + workspace_root = tmp_path / "task-workspaces" + monkeypatch.setattr( + main_module, + "get_settings", + lambda: Settings( + database_url=migrated_database_urls["app"], + task_workspace_root=str(workspace_root), + ), + ) + + workspace_status, workspace_payload = invoke_request( + "POST", + f"/v0/tasks/{owner['task_id']}/workspace", + payload={"user_id": str(owner["user_id"])}, + ) + assert workspace_status == 201 + + workspace_path = Path(workspace_payload["workspace"]["local_path"]) + email_file = workspace_path / "mail" / "update.eml" + email_file.parent.mkdir(parents=True) + email_file.write_bytes( + 
_build_rfc822_email_bytes( + plain_body=("A" * 916) + "\r\n" + ("B" * 5) + "\rC", + ) + ) + + register_status, register_payload = invoke_request( + "POST", + f"/v0/task-workspaces/{workspace_payload['workspace']['id']}/artifacts", + payload={ + "user_id": str(owner["user_id"]), + "local_path": str(email_file), + "media_type_hint": "message/rfc822", + }, + ) + assert register_status == 201 + + ingest_status, ingest_payload = invoke_request( + "POST", + f"/v0/task-artifacts/{register_payload['artifact']['id']}/ingest", + payload={"user_id": str(owner["user_id"])}, + ) + chunk_list_status, chunk_list_payload = invoke_request( + "GET", + f"/v0/task-artifacts/{register_payload['artifact']['id']}/chunks", + query_params={"user_id": str(owner["user_id"])}, + ) + isolated_chunk_list_status, isolated_chunk_list_payload = invoke_request( + "GET", + f"/v0/task-artifacts/{register_payload['artifact']['id']}/chunks", + query_params={"user_id": str(intruder["user_id"])}, + ) + isolated_ingest_status, isolated_ingest_payload = invoke_request( + "POST", + f"/v0/task-artifacts/{register_payload['artifact']['id']}/ingest", + payload={"user_id": str(intruder["user_id"])}, + ) + + header_block = ( + "From: Alice \n" + "To: Bob \n" + "Subject: Sprint Update\n\n" + ) + assert ingest_status == 200 + assert ingest_payload == { + "artifact": { + "id": register_payload["artifact"]["id"], + "task_id": str(owner["task_id"]), + "task_workspace_id": workspace_payload["workspace"]["id"], + "status": "registered", + "ingestion_status": "ingested", + "relative_path": "mail/update.eml", + "media_type_hint": "message/rfc822", + "created_at": register_payload["artifact"]["created_at"], + "updated_at": ingest_payload["artifact"]["updated_at"], + }, + "summary": { + "total_count": 2, + "total_characters": 1006, + "media_type": "message/rfc822", + "chunking_rule": "normalized_utf8_text_fixed_window_1000_chars_v1", + "order": ["sequence_no_asc", "id_asc"], + }, + } + + assert chunk_list_status == 200 + 
assert chunk_list_payload == { + "items": [ + { + "id": chunk_list_payload["items"][0]["id"], + "task_artifact_id": register_payload["artifact"]["id"], + "sequence_no": 1, + "char_start": 0, + "char_end_exclusive": 1000, + "text": header_block + ("A" * 916) + "\n" + "B", + "created_at": chunk_list_payload["items"][0]["created_at"], + "updated_at": chunk_list_payload["items"][0]["updated_at"], + }, + { + "id": chunk_list_payload["items"][1]["id"], + "task_artifact_id": register_payload["artifact"]["id"], + "sequence_no": 2, + "char_start": 1000, + "char_end_exclusive": 1006, + "text": "BBBB\nC", + "created_at": chunk_list_payload["items"][1]["created_at"], + "updated_at": chunk_list_payload["items"][1]["updated_at"], + }, + ], + "summary": { + "total_count": 2, + "total_characters": 1006, + "media_type": "message/rfc822", + "chunking_rule": "normalized_utf8_text_fixed_window_1000_chars_v1", + "order": ["sequence_no_asc", "id_asc"], + }, + } + + assert isolated_chunk_list_status == 404 + assert isolated_chunk_list_payload == { + "detail": f"task artifact {register_payload['artifact']['id']} was not found" + } + + assert isolated_ingest_status == 404 + assert isolated_ingest_payload == { + "detail": f"task artifact {register_payload['artifact']['id']} was not found" + } + + +def test_task_artifact_rfc822_ingestion_excludes_nested_email_bodies( + migrated_database_urls, + monkeypatch, + tmp_path, +) -> None: + owner = seed_task(migrated_database_urls["app"], email="owner@example.com") + workspace_root = tmp_path / "task-workspaces" + monkeypatch.setattr( + main_module, + "get_settings", + lambda: Settings( + database_url=migrated_database_urls["app"], + task_workspace_root=str(workspace_root), + ), + ) + + workspace_status, workspace_payload = invoke_request( + "POST", + f"/v0/tasks/{owner['task_id']}/workspace", + payload={"user_id": str(owner["user_id"])}, + ) + assert workspace_status == 201 + + workspace_path = Path(workspace_payload["workspace"]["local_path"]) + 
email_file = workspace_path / "mail" / "forwarded.eml" + email_file.parent.mkdir(parents=True) + email_file.write_bytes( + _build_rfc822_email_bytes( + plain_parts=["Outer body"], + nested_message_bytes=_build_rfc822_email_bytes( + headers=[ + ("From", "Nested "), + ("To", "Team "), + ("Subject", "Nested"), + ], + plain_body="Inner body", + ), + ) + ) + + register_status, register_payload = invoke_request( + "POST", + f"/v0/task-workspaces/{workspace_payload['workspace']['id']}/artifacts", + payload={ + "user_id": str(owner["user_id"]), + "local_path": str(email_file), + "media_type_hint": "message/rfc822", + }, + ) + assert register_status == 201 + + ingest_status, ingest_payload = invoke_request( + "POST", + f"/v0/task-artifacts/{register_payload['artifact']['id']}/ingest", + payload={"user_id": str(owner["user_id"])}, + ) + chunk_list_status, chunk_list_payload = invoke_request( + "GET", + f"/v0/task-artifacts/{register_payload['artifact']['id']}/chunks", + query_params={"user_id": str(owner["user_id"])}, + ) + + expected_text = ( + "From: Alice \n" + "To: Bob \n" + "Subject: Sprint Update\n\n" + "Outer body" + ) + assert ingest_status == 200 + assert ingest_payload == { + "artifact": { + "id": register_payload["artifact"]["id"], + "task_id": str(owner["task_id"]), + "task_workspace_id": workspace_payload["workspace"]["id"], + "status": "registered", + "ingestion_status": "ingested", + "relative_path": "mail/forwarded.eml", + "media_type_hint": "message/rfc822", + "created_at": register_payload["artifact"]["created_at"], + "updated_at": ingest_payload["artifact"]["updated_at"], + }, + "summary": { + "total_count": 1, + "total_characters": len(expected_text), + "media_type": "message/rfc822", + "chunking_rule": "normalized_utf8_text_fixed_window_1000_chars_v1", + "order": ["sequence_no_asc", "id_asc"], + }, + } + + assert chunk_list_status == 200 + assert chunk_list_payload == { + "items": [ + { + "id": chunk_list_payload["items"][0]["id"], + "task_artifact_id": 
register_payload["artifact"]["id"], + "sequence_no": 1, + "char_start": 0, + "char_end_exclusive": len(expected_text), + "text": expected_text, + "created_at": chunk_list_payload["items"][0]["created_at"], + "updated_at": chunk_list_payload["items"][0]["updated_at"], + } + ], + "summary": { + "total_count": 1, + "total_characters": len(expected_text), + "media_type": "message/rfc822", + "chunking_rule": "normalized_utf8_text_fixed_window_1000_chars_v1", + "order": ["sequence_no_asc", "id_asc"], + }, + } + + def test_task_artifact_ingestion_supports_markdown_and_reingest_is_idempotent( migrated_database_urls, monkeypatch, @@ -1131,6 +1492,78 @@ def test_task_artifact_ingestion_rejects_textless_or_malformed_docx( } +def test_task_artifact_ingestion_rejects_textless_or_malformed_rfc822_email( + migrated_database_urls, + monkeypatch, + tmp_path, +) -> None: + owner = seed_task(migrated_database_urls["app"], email="owner@example.com") + workspace_root = tmp_path / "task-workspaces" + monkeypatch.setattr( + main_module, + "get_settings", + lambda: Settings( + database_url=migrated_database_urls["app"], + task_workspace_root=str(workspace_root), + ), + ) + + workspace_status, workspace_payload = invoke_request( + "POST", + f"/v0/tasks/{owner['task_id']}/workspace", + payload={"user_id": str(owner["user_id"])}, + ) + assert workspace_status == 201 + + workspace_path = Path(workspace_payload["workspace"]["local_path"]) + textless_email = workspace_path / "mail" / "empty.eml" + textless_email.parent.mkdir(parents=True) + textless_email.write_bytes(_build_rfc822_email_bytes(html_body="
<p>html only</p>
")) + malformed_email = workspace_path / "mail" / "broken.eml" + malformed_email.write_bytes(_build_rfc822_email_bytes(malformed_multipart=True)) + + textless_register_status, textless_register_payload = invoke_request( + "POST", + f"/v0/task-workspaces/{workspace_payload['workspace']['id']}/artifacts", + payload={ + "user_id": str(owner["user_id"]), + "local_path": str(textless_email), + "media_type_hint": "message/rfc822", + }, + ) + malformed_register_status, malformed_register_payload = invoke_request( + "POST", + f"/v0/task-workspaces/{workspace_payload['workspace']['id']}/artifacts", + payload={ + "user_id": str(owner["user_id"]), + "local_path": str(malformed_email), + "media_type_hint": "message/rfc822", + }, + ) + assert textless_register_status == 201 + assert malformed_register_status == 201 + + textless_ingest_status, textless_ingest_payload = invoke_request( + "POST", + f"/v0/task-artifacts/{textless_register_payload['artifact']['id']}/ingest", + payload={"user_id": str(owner["user_id"])}, + ) + malformed_ingest_status, malformed_ingest_payload = invoke_request( + "POST", + f"/v0/task-artifacts/{malformed_register_payload['artifact']['id']}/ingest", + payload={"user_id": str(owner["user_id"])}, + ) + + assert textless_ingest_status == 400 + assert textless_ingest_payload == { + "detail": "artifact mail/empty.eml does not contain extractable RFC822 email text" + } + assert malformed_ingest_status == 400 + assert malformed_ingest_payload == { + "detail": "artifact mail/broken.eml is not a valid RFC822 email" + } + + def test_task_artifact_ingestion_enforces_rooted_workspace_paths( migrated_database_urls, monkeypatch, @@ -1261,6 +1694,71 @@ def test_task_artifact_docx_ingestion_enforces_rooted_workspace_paths( } +def test_task_artifact_rfc822_ingestion_enforces_rooted_workspace_paths( + migrated_database_urls, + monkeypatch, + tmp_path, +) -> None: + owner = seed_task(migrated_database_urls["app"], email="owner@example.com") + workspace_root = tmp_path / 
"task-workspaces" + monkeypatch.setattr( + main_module, + "get_settings", + lambda: Settings( + database_url=migrated_database_urls["app"], + task_workspace_root=str(workspace_root), + ), + ) + + workspace_status, workspace_payload = invoke_request( + "POST", + f"/v0/tasks/{owner['task_id']}/workspace", + payload={"user_id": str(owner["user_id"])}, + ) + assert workspace_status == 201 + + workspace_path = Path(workspace_payload["workspace"]["local_path"]) + safe_file = workspace_path / "mail" / "update.eml" + safe_file.parent.mkdir(parents=True) + safe_file.write_bytes(_build_rfc822_email_bytes(plain_body="spec")) + outside_file = tmp_path / "escape.eml" + outside_file.write_bytes(_build_rfc822_email_bytes(plain_body="escape")) + + register_status, register_payload = invoke_request( + "POST", + f"/v0/task-workspaces/{workspace_payload['workspace']['id']}/artifacts", + payload={ + "user_id": str(owner["user_id"]), + "local_path": str(safe_file), + "media_type_hint": "message/rfc822", + }, + ) + assert register_status == 201 + + with psycopg.connect(migrated_database_urls["admin"]) as conn: + with conn.cursor() as cur: + cur.execute( + """ + UPDATE task_artifacts + SET relative_path = '../../../escape.eml' + WHERE id = %s + """, + (register_payload["artifact"]["id"],), + ) + conn.commit() + + ingest_status, ingest_payload = invoke_request( + "POST", + f"/v0/task-artifacts/{register_payload['artifact']['id']}/ingest", + payload={"user_id": str(owner["user_id"])}, + ) + + assert ingest_status == 400 + assert ingest_payload == { + "detail": f"artifact path {outside_file.resolve()} escapes workspace root {workspace_path.resolve()}" + } + + def test_task_artifact_chunk_retrieval_endpoints_are_scoped_deterministic_and_isolated( migrated_database_urls, monkeypatch, diff --git a/tests/unit/test_artifacts.py b/tests/unit/test_artifacts.py index 442a08d..df3de49 100644 --- a/tests/unit/test_artifacts.py +++ b/tests/unit/test_artifacts.py @@ -190,6 +190,116 @@ def 
_build_docx_bytes( return archive_buffer.getvalue() +def _build_rfc822_email_bytes( + *, + headers: list[tuple[str, str]] | None = None, + plain_body: str | None = None, + plain_parts: list[str] | None = None, + html_body: str | None = None, + attachment_text: str | None = None, + nested_message_bytes: bytes | None = None, + malformed_multipart: bool = False, +) -> bytes: + header_lines = [ + f"{name}: {value}" + for name, value in ( + headers + if headers is not None + else [ + ("From", "Alice "), + ("To", "Bob "), + ("Subject", "Sprint Update"), + ] + ) + ] + if malformed_multipart: + return ( + "\r\n".join( + [ + *header_lines, + "MIME-Version: 1.0", + "Content-Type: multipart/mixed", + "", + "--broken-boundary", + 'Content-Type: text/plain; charset="utf-8"', + "", + "broken", + "--broken-boundary--", + "", + ] + ).encode("utf-8") + ) + + if ( + plain_parts is None + and html_body is None + and attachment_text is None + and nested_message_bytes is None + ): + return ( + "\r\n".join( + [ + *header_lines, + 'Content-Type: text/plain; charset="utf-8"', + "Content-Transfer-Encoding: 8bit", + "", + plain_body or "", + ] + ).encode("utf-8") + ) + + boundary = "alicebot-boundary-001" + lines = [ + *header_lines, + "MIME-Version: 1.0", + f'Content-Type: multipart/mixed; boundary="{boundary}"', + "", + ] + for part_text in plain_parts or []: + lines.extend( + [ + f"--{boundary}", + 'Content-Type: text/plain; charset="utf-8"', + "Content-Transfer-Encoding: 8bit", + "", + part_text, + ] + ) + if html_body is not None: + lines.extend( + [ + f"--{boundary}", + 'Content-Type: text/html; charset="utf-8"', + "Content-Transfer-Encoding: 8bit", + "", + html_body, + ] + ) + if attachment_text is not None: + lines.extend( + [ + f"--{boundary}", + 'Content-Type: text/plain; charset="utf-8"', + 'Content-Disposition: attachment; filename="note.txt"', + "Content-Transfer-Encoding: 8bit", + "", + attachment_text, + ] + ) + if nested_message_bytes is not None: + lines.extend( + [ + 
f"--{boundary}", + "Content-Type: message/rfc822", + "Content-Transfer-Encoding: 8bit", + "", + nested_message_bytes.decode("utf-8"), + ] + ) + lines.extend([f"--{boundary}--", ""]) + return "\r\n".join(lines).encode("utf-8") + + class ArtifactStoreStub: def __init__(self) -> None: self.base_time = datetime(2026, 3, 13, 10, 0, tzinfo=UTC) @@ -785,6 +895,230 @@ def test_ingest_task_artifact_record_persists_deterministic_docx_chunks(tmp_path ] +def test_ingest_task_artifact_record_persists_deterministic_rfc822_chunks(tmp_path) -> None: + store = ArtifactStoreStub() + user_id = uuid4() + task_id = uuid4() + task_workspace_id = uuid4() + workspace_path = tmp_path / "workspaces" / str(user_id) / str(task_id) + workspace_path.mkdir(parents=True) + artifact_path = workspace_path / "mail" / "update.eml" + artifact_path.parent.mkdir(parents=True) + artifact_path.write_bytes( + _build_rfc822_email_bytes( + plain_body=("A" * 916) + "\r\n" + ("B" * 5) + "\rC", + ) + ) + store.create_task_workspace( + task_workspace_id=task_workspace_id, + task_id=task_id, + user_id=user_id, + local_path=str(workspace_path), + ) + artifact = store.create_task_artifact( + task_id=task_id, + task_workspace_id=task_workspace_id, + status="registered", + ingestion_status="pending", + relative_path="mail/update.eml", + media_type_hint="message/rfc822", + ) + + response = ingest_task_artifact_record( + store, + user_id=user_id, + request=TaskArtifactIngestInput(task_artifact_id=artifact["id"]), + ) + + header_block = ( + "From: Alice \n" + "To: Bob \n" + "Subject: Sprint Update\n\n" + ) + assert response == { + "artifact": { + "id": str(artifact["id"]), + "task_id": str(task_id), + "task_workspace_id": str(task_workspace_id), + "status": "registered", + "ingestion_status": "ingested", + "relative_path": "mail/update.eml", + "media_type_hint": "message/rfc822", + "created_at": "2026-03-13T10:00:00+00:00", + "updated_at": "2026-03-13T10:30:00+00:00", + }, + "summary": { + "total_count": 2, + 
"total_characters": 1006, + "media_type": "message/rfc822", + "chunking_rule": TASK_ARTIFACT_CHUNKING_RULE, + "order": ["sequence_no_asc", "id_asc"], + }, + } + assert store.locked_artifact_ids == [artifact["id"]] + assert store.list_task_artifact_chunks(artifact["id"]) == [ + { + "id": store.artifact_chunks[0]["id"], + "user_id": user_id, + "task_artifact_id": artifact["id"], + "sequence_no": 1, + "char_start": 0, + "char_end_exclusive": 1000, + "text": header_block + ("A" * 916) + "\n" + "B", + "created_at": datetime(2026, 3, 13, 10, 0, tzinfo=UTC), + "updated_at": datetime(2026, 3, 13, 10, 0, tzinfo=UTC), + }, + { + "id": store.artifact_chunks[1]["id"], + "user_id": user_id, + "task_artifact_id": artifact["id"], + "sequence_no": 2, + "char_start": 1000, + "char_end_exclusive": 1006, + "text": "BBBB\nC", + "created_at": datetime(2026, 3, 13, 10, 0, 1, tzinfo=UTC), + "updated_at": datetime(2026, 3, 13, 10, 0, 1, tzinfo=UTC), + }, + ] + + +def test_ingest_task_artifact_record_extracts_plain_text_parts_from_multipart_rfc822_email( + tmp_path, +) -> None: + store = ArtifactStoreStub() + user_id = uuid4() + task_id = uuid4() + task_workspace_id = uuid4() + workspace_path = tmp_path / "workspaces" / str(user_id) / str(task_id) + workspace_path.mkdir(parents=True) + artifact_path = workspace_path / "mail" / "multipart.eml" + artifact_path.parent.mkdir(parents=True) + artifact_path.write_bytes( + _build_rfc822_email_bytes( + plain_parts=["Alpha\r\nBeta", "Gamma"], + html_body="

ignored

", + attachment_text="ignored attachment", + ) + ) + store.create_task_workspace( + task_workspace_id=task_workspace_id, + task_id=task_id, + user_id=user_id, + local_path=str(workspace_path), + ) + artifact = store.create_task_artifact( + task_id=task_id, + task_workspace_id=task_workspace_id, + status="registered", + ingestion_status="pending", + relative_path="mail/multipart.eml", + media_type_hint="message/rfc822", + ) + + response = ingest_task_artifact_record( + store, + user_id=user_id, + request=TaskArtifactIngestInput(task_artifact_id=artifact["id"]), + ) + + assert response["summary"] == { + "total_count": 1, + "total_characters": 99, + "media_type": "message/rfc822", + "chunking_rule": TASK_ARTIFACT_CHUNKING_RULE, + "order": ["sequence_no_asc", "id_asc"], + } + assert store.list_task_artifact_chunks(artifact["id"]) == [ + { + "id": store.artifact_chunks[0]["id"], + "user_id": user_id, + "task_artifact_id": artifact["id"], + "sequence_no": 1, + "char_start": 0, + "char_end_exclusive": 99, + "text": ( + "From: Alice \n" + "To: Bob \n" + "Subject: Sprint Update\n\n" + "Alpha\nBeta\n\nGamma" + ), + "created_at": datetime(2026, 3, 13, 10, 0, tzinfo=UTC), + "updated_at": datetime(2026, 3, 13, 10, 0, tzinfo=UTC), + } + ] + + +def test_ingest_task_artifact_record_excludes_nested_rfc822_message_bodies(tmp_path) -> None: + store = ArtifactStoreStub() + user_id = uuid4() + task_id = uuid4() + task_workspace_id = uuid4() + workspace_path = tmp_path / "workspaces" / str(user_id) / str(task_id) + workspace_path.mkdir(parents=True) + artifact_path = workspace_path / "mail" / "forwarded.eml" + artifact_path.parent.mkdir(parents=True) + artifact_path.write_bytes( + _build_rfc822_email_bytes( + plain_parts=["Outer body"], + nested_message_bytes=_build_rfc822_email_bytes( + headers=[ + ("From", "Nested "), + ("To", "Team "), + ("Subject", "Nested"), + ], + plain_body="Inner body", + ), + ) + ) + store.create_task_workspace( + task_workspace_id=task_workspace_id, + 
task_id=task_id, + user_id=user_id, + local_path=str(workspace_path), + ) + artifact = store.create_task_artifact( + task_id=task_id, + task_workspace_id=task_workspace_id, + status="registered", + ingestion_status="pending", + relative_path="mail/forwarded.eml", + media_type_hint="message/rfc822", + ) + + response = ingest_task_artifact_record( + store, + user_id=user_id, + request=TaskArtifactIngestInput(task_artifact_id=artifact["id"]), + ) + + expected_text = ( + "From: Alice <alice@example.com>\n" + "To: Bob <bob@example.com>\n" + "Subject: Sprint Update\n\n" + "Outer body" + ) + assert response["summary"] == { + "total_count": 1, + "total_characters": len(expected_text), + "media_type": "message/rfc822", + "chunking_rule": TASK_ARTIFACT_CHUNKING_RULE, + "order": ["sequence_no_asc", "id_asc"], + } + assert store.list_task_artifact_chunks(artifact["id"]) == [ + { + "id": store.artifact_chunks[0]["id"], + "user_id": user_id, + "task_artifact_id": artifact["id"], + "sequence_no": 1, + "char_start": 0, + "char_end_exclusive": len(expected_text), + "text": expected_text, + "created_at": datetime(2026, 3, 13, 10, 0, tzinfo=UTC), + "updated_at": datetime(2026, 3, 13, 10, 0, tzinfo=UTC), + } + ] + + def test_ingest_task_artifact_record_is_idempotent_for_already_ingested_artifact() -> None: store = ArtifactStoreStub() user_id = uuid4() @@ -872,7 +1206,8 @@ def test_ingest_task_artifact_record_rejects_unsupported_media_type(tmp_path) -> match=( "artifact docs/spec.bin has unsupported media type application/octet-stream; " "supported types: text/plain, text/markdown, application/pdf, " - "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + "application/vnd.openxmlformats-officedocument.wordprocessingml.document, " + "message/rfc822" ), ): ingest_task_artifact_record( @@ -990,6 +1325,78 @@ def test_ingest_task_artifact_record_rejects_malformed_docx(tmp_path) -> None: ) +def test_ingest_task_artifact_record_rejects_textless_rfc822_email(tmp_path) -> None: + store = ArtifactStoreStub() 
+ user_id = uuid4() + task_id = uuid4() + task_workspace_id = uuid4() + workspace_path = tmp_path / "workspaces" / str(user_id) / str(task_id) + workspace_path.mkdir(parents=True) + artifact_path = workspace_path / "mail" / "empty.eml" + artifact_path.parent.mkdir(parents=True) + artifact_path.write_bytes(_build_rfc822_email_bytes(html_body="<p>html only</p>")) + store.create_task_workspace( + task_workspace_id=task_workspace_id, + task_id=task_id, + user_id=user_id, + local_path=str(workspace_path), + ) + artifact = store.create_task_artifact( + task_id=task_id, + task_workspace_id=task_workspace_id, + status="registered", + ingestion_status="pending", + relative_path="mail/empty.eml", + media_type_hint="message/rfc822", + ) + + with pytest.raises( + TaskArtifactValidationError, + match="artifact mail/empty.eml does not contain extractable RFC822 email text", + ): + ingest_task_artifact_record( + store, + user_id=user_id, + request=TaskArtifactIngestInput(task_artifact_id=artifact["id"]), + ) + + +def test_ingest_task_artifact_record_rejects_malformed_rfc822_email(tmp_path) -> None: + store = ArtifactStoreStub() + user_id = uuid4() + task_id = uuid4() + task_workspace_id = uuid4() + workspace_path = tmp_path / "workspaces" / str(user_id) / str(task_id) + workspace_path.mkdir(parents=True) + artifact_path = workspace_path / "mail" / "broken.eml" + artifact_path.parent.mkdir(parents=True) + artifact_path.write_bytes(_build_rfc822_email_bytes(malformed_multipart=True)) + store.create_task_workspace( + task_workspace_id=task_workspace_id, + task_id=task_id, + user_id=user_id, + local_path=str(workspace_path), + ) + artifact = store.create_task_artifact( + task_id=task_id, + task_workspace_id=task_workspace_id, + status="registered", + ingestion_status="pending", + relative_path="mail/broken.eml", + media_type_hint="message/rfc822", + ) + + with pytest.raises( + TaskArtifactValidationError, + match="artifact mail/broken.eml is not a valid RFC822 email", + ): + ingest_task_artifact_record( + store, + user_id=user_id, + request=TaskArtifactIngestInput(task_artifact_id=artifact["id"]), + ) + + def test_ingest_task_artifact_record_rejects_invalid_utf8_content(tmp_path) -> None: store = ArtifactStoreStub() user_id = uuid4() @@ -1090,6 +1497,38 @@ def test_ingest_task_artifact_record_rejects_docx_paths_outside_workspa 
) +def test_ingest_task_artifact_record_rejects_rfc822_paths_outside_workspace(tmp_path) -> None: + store = ArtifactStoreStub() + user_id = uuid4() + task_id = uuid4() + task_workspace_id = uuid4() + workspace_path = tmp_path / "workspaces" / str(user_id) / str(task_id) + workspace_path.mkdir(parents=True) + outside_path = tmp_path / "escape.eml" + outside_path.write_bytes(_build_rfc822_email_bytes(plain_body="escape")) + store.create_task_workspace( + task_workspace_id=task_workspace_id, + task_id=task_id, + user_id=user_id, + local_path=str(workspace_path), + ) + artifact = store.create_task_artifact( + task_id=task_id, + task_workspace_id=task_workspace_id, + status="registered", + ingestion_status="pending", + relative_path="../escape.eml", + media_type_hint="message/rfc822", + ) + + with pytest.raises(TaskArtifactValidationError, match="escapes workspace root"): + ingest_task_artifact_record( + store, + user_id=user_id, + request=TaskArtifactIngestInput(task_artifact_id=artifact["id"]), + ) + + def test_list_task_artifact_chunk_records_are_deterministic() -> None: store = ArtifactStoreStub() user_id = uuid4() diff --git a/tests/unit/test_artifacts_main.py b/tests/unit/test_artifacts_main.py index dac0244..439d47f 100644 --- a/tests/unit/test_artifacts_main.py +++ b/tests/unit/test_artifacts_main.py @@ -499,7 +499,8 @@ def fake_ingest_task_artifact_record(*_args, **_kwargs): raise TaskArtifactValidationError( "artifact docs/spec.bin has unsupported media type application/octet-stream; " "supported types: text/plain, text/markdown, application/pdf, " - "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + "application/vnd.openxmlformats-officedocument.wordprocessingml.document, " + "message/rfc822" ) monkeypatch.setattr(main_module, "get_settings", lambda: settings) @@ -516,7 +517,8 @@ def fake_ingest_task_artifact_record(*_args, **_kwargs): "detail": ( "artifact docs/spec.bin has unsupported media type application/octet-stream; " 
"supported types: text/plain, text/markdown, application/pdf, " - "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + "application/vnd.openxmlformats-officedocument.wordprocessingml.document, " + "message/rfc822" ) } diff --git a/tests/unit/test_semantic_retrieval.py b/tests/unit/test_semantic_retrieval.py index 4404e47..3a3058a 100644 --- a/tests/unit/test_semantic_retrieval.py +++ b/tests/unit/test_semantic_retrieval.py @@ -509,3 +509,51 @@ def test_retrieve_task_scoped_semantic_artifact_chunk_records_infers_docx_media_ "score": 0.9, } ] + + +def test_retrieve_task_scoped_semantic_artifact_chunk_records_infers_rfc822_media_type_without_hint() -> None: + store = SemanticRetrievalStoreStub() + config_id = seed_config(store, dimensions=3) + task_id = seed_task(store) + artifact_id = seed_artifact( + store, + task_id=task_id, + relative_path="mail/update.eml", + media_type_hint=None, + ) + email_row = semantic_artifact_row( + store, + task_id=task_id, + task_artifact_id=artifact_id, + relative_path="mail/update.eml", + score=0.85, + sequence_no=1, + ) + email_row["media_type_hint"] = None + store.task_artifact_retrieval_rows = [email_row] + + payload = retrieve_task_scoped_semantic_artifact_chunk_records( + store, # type: ignore[arg-type] + user_id=uuid4(), + request=TaskScopedSemanticArtifactChunkRetrievalInput( + task_id=task_id, + embedding_config_id=config_id, + query_vector=(1.0, 0.0, 0.0), + limit=1, + ), + ) + + assert payload["items"] == [ + { + "id": str(email_row["id"]), + "task_id": str(task_id), + "task_artifact_id": str(artifact_id), + "relative_path": "mail/update.eml", + "media_type": "message/rfc822", + "sequence_no": 1, + "char_start": 0, + "char_end_exclusive": 11, + "text": "mail/update.eml-chunk", + "score": 0.85, + } + ]