Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 84 additions & 0 deletions libs/extractor-api-lib/tests/modality_contract_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
"""Contract tests for modality metadata consistency across extraction paths."""

from __future__ import annotations

import json
from pathlib import Path
from typing import Any

import pytest


# Folder holding the JSON contract fixtures, located relative to this module.
CONTRACT_DIR = Path(__file__).parent.joinpath("test_data", "modality_contract")
# Values accepted for a piece's "type" field.
ALLOWED_MODALITIES = {"IMAGE", "TABLE", "TEXT"}


def _load_fixture(path: Path) -> dict[str, Any]:
    """Read one contract fixture file and return its decoded JSON object.

    Args:
        path: Location of a UTF-8 encoded JSON fixture file.

    Returns:
        The top-level JSON object stored in the file.

    Raises:
        ValueError: If the file is not valid JSON (``json.JSONDecodeError``
            is a ``ValueError`` subclass) or if its top-level value is not
            a JSON object.
    """
    with path.open("r", encoding="utf-8") as handle:
        fixture = json.load(handle)

    # Callers index the result by key, so fail here with a message that names
    # the file instead of letting a list/str fixture blow up later.
    if not isinstance(fixture, dict):
        raise ValueError(
            f"Fixture {path.name} must contain a JSON object at the top level, "
            f"got {type(fixture).__name__}"
        )
    return fixture


def _validate_piece(piece: dict[str, Any]) -> list[str]:
    """Check one piece against the modality contract and collect violations.

    The checks are:
      * ``type`` is one of ``ALLOWED_MODALITIES``;
      * ``page_content`` is a string;
      * ``metadata`` is an object containing ``document``, ``page``, ``id``
        and ``related``, with ``related`` being a list;
      * for ``IMAGE`` pieces, metadata carries a non-empty ``base64_image``,
        ``image_url`` or ``image_ref``, and ``image_mime`` accompanies
        ``image_url``/``image_ref``.

    Args:
        piece: The decoded piece to validate.

    Returns:
        Human-readable error messages in check order; an empty list means
        the piece satisfies the contract.
    """
    # A malformed piece is a contract violation to report, not a crash.
    if not isinstance(piece, dict):
        return ["piece must be an object"]

    errors: list[str] = []

    modality = piece.get("type")
    metadata = piece.get("metadata")
    page_content = piece.get("page_content")

    # Test the type before the membership check: decoded JSON lists/dicts are
    # unhashable and would raise TypeError when looked up in a set.
    if not (isinstance(modality, str) and modality in ALLOWED_MODALITIES):
        errors.append("type must be one of TEXT/TABLE/IMAGE")

    if not isinstance(page_content, str):
        errors.append("page_content must be a string")

    # The remaining checks all read from metadata, so stop if it is unusable.
    if not isinstance(metadata, dict):
        errors.append("metadata must be an object")
        return errors

    required_common = ("document", "page", "id", "related")
    errors.extend(
        f'metadata missing required key: "{key}"'
        for key in required_common
        if key not in metadata
    )

    if "related" in metadata and not isinstance(metadata["related"], list):
        errors.append('metadata["related"] must be a list')

    if modality == "IMAGE":
        has_legacy = bool(metadata.get("base64_image"))
        has_reference = bool(metadata.get("image_url")) or bool(metadata.get("image_ref"))

        if not (has_legacy or has_reference):
            errors.append("IMAGE metadata must contain base64_image or image_url/image_ref")

        if has_reference and not metadata.get("image_mime"):
            errors.append("IMAGE metadata with image_url/image_ref must include image_mime")

    return errors


@pytest.mark.parametrize(
    "fixture_path",
    sorted(CONTRACT_DIR.glob("*.json")),
    ids=lambda p: p.stem,
)
def test_modality_contract_fixture(fixture_path: Path) -> None:
    """Validate every contract fixture against the shared modality schema.

    Each fixture supplies a ``piece`` to validate and a ``valid`` flag saying
    whether validation is expected to succeed. Fixtures marked valid must
    produce no errors; fixtures marked invalid must produce at least one.

    Args:
        fixture_path: Path of the JSON fixture for this parametrized case.
    """
    fixture = _load_fixture(fixture_path)

    # Check the fixture's own shape first so a malformed fixture reports what
    # is wrong with it instead of raising a bare KeyError.
    assert isinstance(fixture, dict) and "piece" in fixture, (
        f'Fixture {fixture_path.name} must be a JSON object with a "piece" key.'
    )
    # Require a real boolean: coercing with bool() would treat a string such
    # as "false" as True and silently invert the expectation.
    assert isinstance(fixture.get("valid"), bool), (
        f'Fixture {fixture_path.name} must declare "valid" as a JSON boolean.'
    )

    errors = _validate_piece(fixture["piece"])

    if fixture["valid"]:
        assert errors == [], f"Fixture {fixture_path.name} failed contract checks: {errors}"
    else:
        assert errors, f"Fixture {fixture_path.name} was expected to fail but passed."


def test_modality_contract_fixture_names_are_unique() -> None:
    """Ensure fixture names stay unique for clear test diagnostics.

    Also fails when no fixtures are found or when a fixture lacks a non-empty
    string ``name``, since either would make the uniqueness check meaningless.
    """
    # Function-scope import so this test does not rely on the module header.
    from collections import Counter

    paths = sorted(CONTRACT_DIR.glob("*.json"))
    # Without this guard an empty or mislocated fixture folder passes vacuously.
    assert paths, f"No contract fixtures found in {CONTRACT_DIR}"

    names_by_file = {path.name: _load_fixture(path).get("name") for path in paths}

    unnamed = [
        file_name
        for file_name, name in names_by_file.items()
        if not isinstance(name, str) or not name
    ]
    assert not unnamed, f'Fixtures without a non-empty string "name": {unnamed}'

    # Report the colliding names rather than only a length mismatch.
    counts = Counter(names_by_file.values())
    duplicates = sorted(name for name, count in counts.items() if count > 1)
    assert not duplicates, f"Duplicate fixture names: {duplicates}"
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"name": "image_invalid_missing_payload",
"valid": false,
"piece": {
"type": "IMAGE",
"page_content": "caption only",
"metadata": {
"document": "file:image.png",
"page": 1,
"id": "img-bad-1",
"related": []
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{
"name": "image_legacy_base64_valid",
"valid": true,
"piece": {
"type": "IMAGE",
"page_content": "diagram describing architecture",
"metadata": {
"document": "file:diagram.png",
"page": 1,
"id": "img-legacy-1",
"related": [],
"base64_image": "iVBORw0KGgoAAAANSUhEUgAA..."
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
{
"name": "image_reference_valid",
"valid": true,
"piece": {
"type": "IMAGE",
"page_content": "OCR fallback text",
"metadata": {
"document": "file:screenshot.jpg",
"page": 1,
"id": "img-ref-1",
"related": [],
"image_ref": "s3://documents/screenshot.jpg",
"image_mime": "image/jpeg"
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"name": "table_valid",
"valid": true,
"piece": {
"type": "TABLE",
"page_content": "| A | B |\n| --- | --- |\n| 1 | 2 |",
"metadata": {
"document": "file:sample.csv",
"page": 1,
"id": "tbl-123",
"related": ["chunk-1"]
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"name": "text_valid",
"valid": true,
"piece": {
"type": "TEXT",
"page_content": "example body text",
"metadata": {
"document": "file:sample.txt",
"page": 1,
"id": "abc123",
"related": []
}
}
}