diff --git a/server/Justfile b/server/Justfile index 6f96d22..cf80594 100644 --- a/server/Justfile +++ b/server/Justfile @@ -82,21 +82,23 @@ cli *ARGS: PG_USER := env("PG_USER", "postgres") PG_PASS := env("PG_PASS", "osa") PG_HOST := env("PG_HOST", "localhost") -PG_PORT := env("PG_PORT", "5432") +# Dedicated host port for the integration-test DB, distinct from dev (5432) +# so both can coexist. Override with TEST_PG_PORT if 55432 is also taken. +TEST_PG_PORT := env("TEST_PG_PORT", "55432") TEST_DB := "osa_test" -TEST_DB_URL := "postgresql+asyncpg://" + PG_USER + ":" + PG_PASS + "@" + PG_HOST + ":" + PG_PORT + "/" + TEST_DB +TEST_DB_URL := "postgresql+asyncpg://" + PG_USER + ":" + PG_PASS + "@" + PG_HOST + ":" + TEST_PG_PORT + "/" + TEST_DB # Create test database (idempotent) test-db-create: - PGPASSWORD={{PG_PASS}} psql -h {{PG_HOST}} -p {{PG_PORT}} -U {{PG_USER}} \ + PGPASSWORD={{PG_PASS}} psql -h {{PG_HOST}} -p {{TEST_PG_PORT}} -U {{PG_USER}} \ -tc "SELECT 1 FROM pg_database WHERE datname='{{TEST_DB}}'" \ | grep -q 1 || \ - PGPASSWORD={{PG_PASS}} psql -h {{PG_HOST}} -p {{PG_PORT}} -U {{PG_USER}} \ + PGPASSWORD={{PG_PASS}} psql -h {{PG_HOST}} -p {{TEST_PG_PORT}} -U {{PG_USER}} \ -c "CREATE DATABASE {{TEST_DB}}" # Drop test database test-db-drop: - PGPASSWORD={{PG_PASS}} psql -h {{PG_HOST}} -p {{PG_PORT}} -U {{PG_USER}} \ + PGPASSWORD={{PG_PASS}} psql -h {{PG_HOST}} -p {{TEST_PG_PORT}} -U {{PG_USER}} \ -c "DROP DATABASE IF EXISTS {{TEST_DB}} WITH (FORCE)" # Run integration tests (persistence tests skip if PG is not available) @@ -105,7 +107,7 @@ test-integration: # Run integration tests with PG: ensure DB running → wipe → create → migrate → test → wipe test-integration-pg: - just --justfile ../Justfile db-up + POSTGRES_PORT={{TEST_PG_PORT}} just --justfile ../Justfile db-up @just test-db-drop @just test-db-create OSA_DATABASE__URL="{{TEST_DB_URL}}" \ diff --git a/server/migrations/versions/076_add_feature_tables_record_srn_fks.py b/server/migrations/versions/076_add_feature_tables_record_srn_fks.py new file mode 100644 index 0000000..b61ee14 --- /dev/null +++ b/server/migrations/versions/076_add_feature_tables_record_srn_fks.py @@ -0,0 +1,95 @@ +"""076_add_feature_tables_record_srn_fks + +For each row currently registered in the ``public.feature_tables`` catalog, +add a foreign-key constraint on ``features..record_srn`` referencing +``records.srn`` with ``ON DELETE CASCADE``. Bundles GitHub #75. + +Idempotent: skips any hook whose FK is already present (detected by naming +convention). No-op on greenfield deployments where the catalog is empty. + +Revision ID: 076_feature_fks +Revises: 076_records_schema_srn +Create Date: 2026-04-19 + +""" + +import re +from typing import Sequence, Union + +from alembic import op + +# revision identifiers, used by Alembic. +revision: str = "076_feature_fks" +down_revision: Union[str, Sequence[str], None] = "076_records_schema_srn" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +FK_NAME_TEMPLATE = "fk_features_{hook}_record_srn" + +# Defense-in-depth: hook names read from ``feature_tables`` are interpolated +# into raw DDL below. Application code constrains hooks to this shape at write +# time, but the migration should not trust that invariant — a stray ``"`` in a +# stored name would break out of quoting. Mirrors the ``_safe_ident`` check in +# ``osa.infrastructure.persistence.metadata_store``. 
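+# Illustrative only: for a hypothetical hook named "protein_sites", the DDL
+# emitted by _add_fk_sql below is roughly
+#   ALTER TABLE features."protein_sites"
+#     ADD CONSTRAINT "fk_features_protein_sites_record_srn"
+#     FOREIGN KEY (record_srn) REFERENCES records(srn) ON DELETE CASCADE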
+_PG_IDENT_RE = re.compile(r"^[a-z][a-z0-9_]{0,62}$") + + +def _safe_ident(name: str) -> str: + if not _PG_IDENT_RE.match(name): + raise ValueError(f"Refusing to interpolate unsafe PG identifier {name!r} into DDL") + return name + + +def upgrade() -> None: + conn = op.get_bind() + rows = conn.execute(_select_hooks()).fetchall() + + for row in rows: + hook = _safe_ident(row[0]) + fk_name = _safe_ident(FK_NAME_TEMPLATE.format(hook=hook)) + exists = conn.execute(_check_constraint(fk_name)).scalar() + if exists: + continue + + conn.execute(_add_fk_sql(hook, fk_name)) + + +def downgrade() -> None: + conn = op.get_bind() + rows = conn.execute(_select_hooks()).fetchall() + for row in rows: + hook = _safe_ident(row[0]) + fk_name = _safe_ident(FK_NAME_TEMPLATE.format(hook=hook)) + exists = conn.execute(_check_constraint(fk_name)).scalar() + if not exists: + continue + conn.execute(_drop_fk_sql(hook, fk_name)) + + +def _select_hooks(): + from sqlalchemy import text + + return text("SELECT hook_name FROM feature_tables") + + +def _check_constraint(fk_name: str): + from sqlalchemy import text + + return text("SELECT 1 FROM pg_constraint WHERE conname = :fk_name").bindparams(fk_name=fk_name) + + +def _add_fk_sql(hook: str, fk_name: str): + from sqlalchemy import text + + return text( + f'ALTER TABLE features."{hook}" ' + f'ADD CONSTRAINT "{fk_name}" ' + f"FOREIGN KEY (record_srn) REFERENCES records(srn) ON DELETE CASCADE" + ) + + +def _drop_fk_sql(hook: str, fk_name: str): + from sqlalchemy import text + + return text(f'ALTER TABLE features."{hook}" DROP CONSTRAINT "{fk_name}"') diff --git a/server/migrations/versions/076_add_metadata_schema_and_catalog.py b/server/migrations/versions/076_add_metadata_schema_and_catalog.py new file mode 100644 index 0000000..e425c95 --- /dev/null +++ b/server/migrations/versions/076_add_metadata_schema_and_catalog.py @@ -0,0 +1,47 @@ +"""076_add_metadata_schema_and_catalog + +Create the ``metadata`` PostgreSQL schema and the ``public.metadata_tables`` +catalog table. Dynamic per-schema metadata tables will live inside the +``metadata`` schema; the catalog indexes them by short schema id + major. + +Revision ID: 076_metadata_catalog +Revises: add_deliver_after +Create Date: 2026-04-19 + +""" + +from typing import Sequence, Union + +import sqlalchemy as sa +from alembic import op +from sqlalchemy.dialects.postgresql import JSONB + +# revision identifiers, used by Alembic. 
+revision: str = "076_metadata_catalog" +down_revision: Union[str, Sequence[str], None] = "add_deliver_after" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.execute('CREATE SCHEMA IF NOT EXISTS "metadata"') + + op.create_table( + "metadata_tables", + sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True), + sa.Column("schema_id", sa.Text(), nullable=False), + sa.Column("schema_slug", sa.Text(), nullable=False), + sa.Column("schema_major", sa.Integer(), nullable=False), + sa.Column("schema_versions", JSONB(), nullable=False), + sa.Column("pg_table", sa.Text(), nullable=False), + sa.Column("metadata_schema", JSONB(), nullable=False), + sa.Column("created_at", sa.DateTime(timezone=True), nullable=False), + sa.Column("updated_at", sa.DateTime(timezone=True), nullable=False), + sa.UniqueConstraint("schema_id", "schema_major", name="uq_metadata_tables_id_major"), + sa.UniqueConstraint("pg_table", name="uq_metadata_tables_pg_table"), + ) + + +def downgrade() -> None: + op.drop_table("metadata_tables") + op.execute('DROP SCHEMA IF EXISTS "metadata" CASCADE') diff --git a/server/migrations/versions/076_add_records_schema_srn.py b/server/migrations/versions/076_add_records_schema_srn.py new file mode 100644 index 0000000..585cb0b --- /dev/null +++ b/server/migrations/versions/076_add_records_schema_srn.py @@ -0,0 +1,40 @@ +"""076_add_records_schema_id + +Add ``records.schema_id`` + ``records.schema_version`` so a Record's typed +linkage is first-class (FR-008). + +Greenfield only: no backfill from the linked convention. If this runs +against a populated ``records`` table it fails at ``SET NOT NULL`` with a +clear constraint error, which is the correct signal that the data predates +this schema. + +Revision ID: 076_records_schema_srn +Revises: 076_schemas_to_id +Create Date: 2026-04-19 + +""" + +from typing import Sequence, Union + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision: str = "076_records_schema_srn" +down_revision: Union[str, Sequence[str], None] = "076_schemas_to_id" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.add_column("records", sa.Column("schema_id", sa.Text(), nullable=True)) + op.add_column("records", sa.Column("schema_version", sa.Text(), nullable=True)) + op.alter_column("records", "schema_id", nullable=False) + op.alter_column("records", "schema_version", nullable=False) + op.create_index("idx_records_schema_id", "records", ["schema_id"]) + + +def downgrade() -> None: + op.drop_index("idx_records_schema_id", table_name="records") + op.drop_column("records", "schema_version") + op.drop_column("records", "schema_id") diff --git a/server/migrations/versions/076_schemas_to_id.py b/server/migrations/versions/076_schemas_to_id.py new file mode 100644 index 0000000..c05f5e2 --- /dev/null +++ b/server/migrations/versions/076_schemas_to_id.py @@ -0,0 +1,66 @@ +"""076_schemas_to_id + +Replace URN-keyed ``schemas`` and ``conventions`` columns with short-form +``(id, version)`` pairs. After this migration, internal code works entirely +in ``SchemaId``; full URNs are reserved for federation edges. + +Changes: +- ``schemas.srn`` → ``schemas.id`` + ``schemas.version``. Composite PK. +- ``conventions.schema_srn`` → ``conventions.schema_id`` + ``conventions.schema_version``. + +Greenfield only: no backfill from the old URN columns. 
If this runs against +a populated DB it fails at ``SET NOT NULL`` with a clear constraint error, +which is the correct signal that the data predates this schema. + +Revision ID: 076_schemas_to_id +Revises: 076_metadata_catalog +Create Date: 2026-04-20 + +""" + +from typing import Sequence, Union + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision: str = "076_schemas_to_id" +down_revision: Union[str, Sequence[str], None] = "076_metadata_catalog" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # schemas: drop old SRN PK, add id + version, recompose PK. + op.add_column("schemas", sa.Column("id", sa.String(), nullable=True)) + op.add_column("schemas", sa.Column("version", sa.String(), nullable=True)) + op.alter_column("schemas", "id", nullable=False) + op.alter_column("schemas", "version", nullable=False) + op.drop_constraint("schemas_pkey", "schemas", type_="primary") + op.drop_column("schemas", "srn") + op.create_primary_key("schemas_pkey", "schemas", ["id", "version"]) + op.create_index("idx_schemas_id", "schemas", ["id"]) + + # conventions: split schema_srn into schema_id + schema_version. + op.add_column("conventions", sa.Column("schema_id", sa.String(), nullable=True)) + op.add_column("conventions", sa.Column("schema_version", sa.String(), nullable=True)) + op.alter_column("conventions", "schema_id", nullable=False) + op.alter_column("conventions", "schema_version", nullable=False) + op.drop_column("conventions", "schema_srn") + + +def downgrade() -> None: + # conventions back to schema_srn + op.add_column("conventions", sa.Column("schema_srn", sa.String(), nullable=True)) + op.alter_column("conventions", "schema_srn", nullable=False) + op.drop_column("conventions", "schema_version") + op.drop_column("conventions", "schema_id") + + # schemas back to srn + op.drop_index("idx_schemas_id", table_name="schemas") + op.drop_constraint("schemas_pkey", "schemas", type_="primary") + op.add_column("schemas", sa.Column("srn", sa.String(), nullable=True)) + op.alter_column("schemas", "srn", nullable=False) + op.create_primary_key("schemas_pkey", "schemas", ["srn"]) + op.drop_column("schemas", "version") + op.drop_column("schemas", "id") diff --git a/server/osa/application/api/v1/routes/discovery.py b/server/osa/application/api/v1/routes/discovery.py index cdf511b..fb4295a 100644 --- a/server/osa/application/api/v1/routes/discovery.py +++ b/server/osa/application/api/v1/routes/discovery.py @@ -6,10 +6,7 @@ from fastapi import APIRouter from pydantic import BaseModel, Field -from osa.domain.discovery.model.value import ( - Filter, - SortOrder, -) +from osa.domain.discovery.model.value import FilterExpr, SortOrder from osa.domain.discovery.query.get_feature_catalog import ( GetFeatureCatalog, GetFeatureCatalogHandler, @@ -25,6 +22,8 @@ SearchRecordsHandler, SearchRecordsResult, ) +from osa.domain.shared.error import ValidationError +from osa.domain.shared.model.srn import ConventionSRN, SchemaId router = APIRouter( prefix="/discovery", @@ -37,7 +36,11 @@ class RecordSearchRequest(BaseModel): - filters: list[Filter] = [] + schema: str | None = None + """Short-form schema identity: ``"@"`` (e.g. 
``"pdb-structure@1.0.0"``).""" + + convention_srn: ConventionSRN | None = None + filter: FilterExpr | None = None q: str | None = None sort: str = "published_at" order: SortOrder = SortOrder.DESC @@ -56,7 +59,10 @@ class FeatureCatalogResponse(BaseModel): class FeatureSearchRequest(BaseModel): - filters: list[Filter] = [] + schema: str | None = None + """Short-form schema identity, optional. See RecordSearchRequest.schema.""" + + filter: FilterExpr | None = None record_srn: str | None = None sort: str = "id" order: SortOrder = SortOrder.DESC @@ -70,6 +76,24 @@ class FeatureSearchResponse(BaseModel): has_more: bool +def _parse_schema(value: str | None) -> SchemaId | None: + if value is None: + return None + if "@" not in value: + raise ValidationError( + f"Schema {value!r} must be fully qualified as '@' " + "(e.g. 'pdb-structure@1.0.0'). Family-level scoping " + "(id alone, resolving to the latest version across a schema family) " + "is planned but not yet supported.", + field="schema", + code="cross_scope_not_yet_supported", + ) + try: + return SchemaId.parse(value) + except ValueError as exc: + raise ValidationError(str(exc), field="schema") from exc + + # ── Routes ── @@ -81,7 +105,9 @@ async def search_records( """Search and filter published records.""" result: SearchRecordsResult = await handler.run( SearchRecords( - filters=body.filters, + filter_expr=body.filter, + schema_id=_parse_schema(body.schema), + convention_srn=body.convention_srn, q=body.q, sort=body.sort, order=body.order, @@ -115,7 +141,8 @@ async def search_features( result: SearchFeaturesResult = await handler.run( SearchFeatures( hook_name=hook_name, - filters=body.filters, + filter_expr=body.filter, + schema_id=_parse_schema(body.schema), record_srn=body.record_srn, sort=body.sort, order=body.order, diff --git a/server/osa/application/api/v1/routes/schemas.py b/server/osa/application/api/v1/routes/schemas.py index 421a943..6d00ddf 100644 --- a/server/osa/application/api/v1/routes/schemas.py +++ b/server/osa/application/api/v1/routes/schemas.py @@ -18,7 +18,8 @@ ListSchemasHandler, SchemaList, ) -from osa.domain.shared.model.srn import SchemaSRN +from osa.domain.shared.error import ValidationError +from osa.domain.shared.model.srn import SchemaId router = APIRouter(prefix="/schemas", tags=["Schemas"], route_class=DishkaRoute) @@ -31,12 +32,17 @@ async def create_schema( return await handler.run(body) -@router.get("/{srn:path}", response_model=SchemaDetail) +@router.get("/{schema:path}", response_model=SchemaDetail) async def get_schema( - srn: str, + schema: str, handler: FromDishka[GetSchemaHandler], ) -> SchemaDetail: - return await handler.run(GetSchema(srn=SchemaSRN.parse(srn))) + """Fetch a schema by its short id+version, e.g. 
``"pdb-structure@1.0.0"``.""" + try: + sid = SchemaId.parse(schema) + except ValueError as exc: + raise ValidationError(str(exc), field="schema") from exc + return await handler.run(GetSchema(schema_id=sid)) @router.get("", response_model=SchemaList) diff --git a/server/osa/application/di.py b/server/osa/application/di.py index f7e7ff3..5227635 100644 --- a/server/osa/application/di.py +++ b/server/osa/application/di.py @@ -8,6 +8,7 @@ from osa.domain.deposition.util.di import DepositionProvider from osa.domain.discovery.util.di import DiscoveryProvider from osa.domain.feature.util.di import FeatureProvider +from osa.domain.metadata.util.di import MetadataProvider from osa.domain.semantics.util.di.provider import SemanticsProvider from osa.domain.shared.event import EventHandler from osa.domain.validation.util.di import ValidationProvider @@ -49,6 +50,7 @@ def create_container( HttpProvider(), DepositionProvider(), FeatureProvider(), + MetadataProvider(), SemanticsProvider(), ValidationProvider(), AuthProvider(), diff --git a/server/osa/config.py b/server/osa/config.py index ffeda85..46e0924 100644 --- a/server/osa/config.py +++ b/server/osa/config.py @@ -241,6 +241,11 @@ class Config(BaseSettings): runner: RunnerConfig = RunnerConfig() host_data_dir: str | None = None # Host path for OSA_DATA_DIR (sibling container mounts) + # Discovery filter-tree bounds (feature 076) + discovery_max_filter_depth: int = 10 + discovery_max_predicates: int = 200 + discovery_max_cross_domain_joins: int = 10 + model_config = { "env_prefix": "OSA_", "env_file": ".env", diff --git a/server/osa/domain/deposition/command/create_convention.py b/server/osa/domain/deposition/command/create_convention.py index c50059b..499f68c 100644 --- a/server/osa/domain/deposition/command/create_convention.py +++ b/server/osa/domain/deposition/command/create_convention.py @@ -9,12 +9,20 @@ from osa.domain.shared.command import Command, CommandHandler, Result from osa.domain.shared.model.hook import HookDefinition from osa.domain.shared.model.source import IngesterDefinition -from osa.domain.shared.model.srn import ConventionSRN, SchemaSRN +from osa.domain.shared.model.srn import ConventionSRN, SchemaId, SchemaIdentifier class CreateConvention(Command): model_config = ConfigDict(populate_by_name=True) + id: SchemaIdentifier + """Schema slug — becomes the ```` in ``schema_id = @``. + + A convention is a bundle of (schema + validators + file requirements), and + the caller supplies the slug of the embedded schema here. The convention + itself gets an opaque server-generated SRN. 
+ """ + title: str version: str schema_fields: list[FieldDefinition] = Field(alias="schema") @@ -28,7 +36,7 @@ class ConventionCreated(Result): srn: ConventionSRN title: str description: str | None - schema_srn: SchemaSRN + schema_id: SchemaId created_at: datetime @@ -38,6 +46,7 @@ class CreateConventionHandler(CommandHandler[CreateConvention, ConventionCreated async def run(self, cmd: CreateConvention) -> ConventionCreated: convention = await self.convention_service.create_convention( + id=cmd.id, title=cmd.title, version=cmd.version, schema=cmd.schema_fields, @@ -50,6 +59,6 @@ async def run(self, cmd: CreateConvention) -> ConventionCreated: srn=convention.srn, title=convention.title, description=convention.description, - schema_srn=convention.schema_srn, + schema_id=convention.schema_id, created_at=convention.created_at, ) diff --git a/server/osa/domain/deposition/command/upload_spreadsheet.py b/server/osa/domain/deposition/command/upload_spreadsheet.py index c092dd4..ad29269 100644 --- a/server/osa/domain/deposition/command/upload_spreadsheet.py +++ b/server/osa/domain/deposition/command/upload_spreadsheet.py @@ -34,9 +34,9 @@ async def run(self, cmd: UploadSpreadsheet) -> SpreadsheetUploaded: if convention is None: raise NotFoundError(f"Convention not found: {dep.convention_srn}") - schema = await self.schema_reader.get_schema(convention.schema_srn) + schema = await self.schema_reader.get_schema(convention.schema_id) if schema is None: - raise NotFoundError(f"Schema not found: {convention.schema_srn}") + raise NotFoundError(f"Schema not found: {convention.schema_id}") parse_result = self.spreadsheet.parse_upload(schema, cmd.content) diff --git a/server/osa/domain/deposition/event/convention_registered.py b/server/osa/domain/deposition/event/convention_registered.py index 7a318a7..62d2112 100644 --- a/server/osa/domain/deposition/event/convention_registered.py +++ b/server/osa/domain/deposition/event/convention_registered.py @@ -1,17 +1,24 @@ """ConventionRegistered event - emitted when a new convention is created.""" +from osa.domain.semantics.model.value import FieldDefinition from osa.domain.shared.event import Event, EventId from osa.domain.shared.model.hook import HookDefinition -from osa.domain.shared.model.srn import ConventionSRN +from osa.domain.shared.model.srn import ConventionSRN, SchemaId class ConventionRegistered(Event): """Emitted when a convention is created via deploy. - Carries hook definitions so downstream handlers (e.g. CreateFeatureTables) - can create feature tables without querying the convention repository. + Carries hook definitions so ``CreateFeatureTables`` can create feature + tables without querying the convention repository. + + Carries ``schema_id`` and ``schema_fields`` so ``EnsureMetadataTable`` can + create and evolve typed metadata tables without traversing the semantics + repository. 
""" id: EventId convention_srn: ConventionSRN + schema_id: SchemaId + schema_fields: list[FieldDefinition] = [] hooks: list[HookDefinition] = [] diff --git a/server/osa/domain/deposition/model/convention.py b/server/osa/domain/deposition/model/convention.py index 63c1e77..53bc3ac 100644 --- a/server/osa/domain/deposition/model/convention.py +++ b/server/osa/domain/deposition/model/convention.py @@ -4,7 +4,7 @@ from osa.domain.shared.model.aggregate import Aggregate from osa.domain.shared.model.hook import HookDefinition from osa.domain.shared.model.source import IngesterDefinition -from osa.domain.shared.model.srn import ConventionSRN, SchemaSRN +from osa.domain.shared.model.srn import ConventionSRN, SchemaId class Convention(Aggregate): @@ -13,7 +13,7 @@ class Convention(Aggregate): srn: ConventionSRN title: str description: str | None = None - schema_srn: SchemaSRN + schema_id: SchemaId file_requirements: FileRequirements hooks: list[HookDefinition] = [] ingester: IngesterDefinition | None = None diff --git a/server/osa/domain/deposition/port/schema_reader.py b/server/osa/domain/deposition/port/schema_reader.py index 36f1e00..fb790c3 100644 --- a/server/osa/domain/deposition/port/schema_reader.py +++ b/server/osa/domain/deposition/port/schema_reader.py @@ -1,7 +1,7 @@ from abc import abstractmethod from typing import TYPE_CHECKING, Protocol -from osa.domain.shared.model.srn import SchemaSRN +from osa.domain.shared.model.srn import SchemaId from osa.domain.shared.port import Port if TYPE_CHECKING: @@ -12,7 +12,7 @@ class SchemaReader(Port, Protocol): """Read-only cross-domain port for reading schemas from the deposition domain.""" @abstractmethod - async def get_schema(self, srn: SchemaSRN) -> "Schema | None": ... + async def get_schema(self, schema_id: SchemaId) -> "Schema | None": ... @abstractmethod - async def schema_exists(self, srn: SchemaSRN) -> bool: ... + async def schema_exists(self, schema_id: SchemaId) -> bool: ... 
diff --git a/server/osa/domain/deposition/query/download_template.py b/server/osa/domain/deposition/query/download_template.py index eab466a..acaa504 100644 --- a/server/osa/domain/deposition/query/download_template.py +++ b/server/osa/domain/deposition/query/download_template.py @@ -33,9 +33,9 @@ async def run(self, cmd: DownloadTemplate) -> TemplateResult: if convention is None: raise NotFoundError(f"Convention not found: {cmd.convention_srn}") - schema = await self.schema_reader.get_schema(convention.schema_srn) + schema = await self.schema_reader.get_schema(convention.schema_id) if schema is None: - raise NotFoundError(f"Schema not found: {convention.schema_srn}") + raise NotFoundError(f"Schema not found: {convention.schema_id}") # Collect ontology terms for fields that reference ontologies ontology_terms_by_srn: dict[str, list[str]] = {} diff --git a/server/osa/domain/deposition/query/get_convention.py b/server/osa/domain/deposition/query/get_convention.py index b39e467..7bf9d46 100644 --- a/server/osa/domain/deposition/query/get_convention.py +++ b/server/osa/domain/deposition/query/get_convention.py @@ -5,7 +5,7 @@ from osa.domain.shared.authorization.gate import public from osa.domain.shared.model.hook import HookDefinition from osa.domain.shared.model.source import IngesterDefinition -from osa.domain.shared.model.srn import ConventionSRN, SchemaSRN +from osa.domain.shared.model.srn import ConventionSRN, SchemaId from osa.domain.shared.query import Query, QueryHandler, Result @@ -17,7 +17,7 @@ class ConventionDetail(Result): srn: ConventionSRN title: str description: str | None - schema_srn: SchemaSRN + schema_id: SchemaId file_requirements: FileRequirements hooks: list[HookDefinition] ingester: IngesterDefinition | None = None @@ -34,7 +34,7 @@ async def run(self, cmd: GetConvention) -> ConventionDetail: srn=conv.srn, title=conv.title, description=conv.description, - schema_srn=conv.schema_srn, + schema_id=conv.schema_id, file_requirements=conv.file_requirements, hooks=conv.hooks, ingester=conv.ingester, diff --git a/server/osa/domain/deposition/query/list_conventions.py b/server/osa/domain/deposition/query/list_conventions.py index f38a07e..0838bd0 100644 --- a/server/osa/domain/deposition/query/list_conventions.py +++ b/server/osa/domain/deposition/query/list_conventions.py @@ -4,7 +4,7 @@ from osa.domain.deposition.service.convention import ConventionService from osa.domain.shared.authorization.gate import public -from osa.domain.shared.model.srn import ConventionSRN, SchemaSRN +from osa.domain.shared.model.srn import ConventionSRN, SchemaId from osa.domain.shared.query import Query, QueryHandler, Result @@ -16,7 +16,7 @@ class ConventionSummary(BaseModel): srn: ConventionSRN title: str description: str | None - schema_srn: SchemaSRN + schema_id: SchemaId created_at: datetime @@ -36,7 +36,7 @@ async def run(self, cmd: ListConventions) -> ConventionList: srn=c.srn, title=c.title, description=c.description, - schema_srn=c.schema_srn, + schema_id=c.schema_id, created_at=c.created_at, ) for c in conventions diff --git a/server/osa/domain/deposition/service/convention.py b/server/osa/domain/deposition/service/convention.py index 79492e9..366ffbe 100644 --- a/server/osa/domain/deposition/service/convention.py +++ b/server/osa/domain/deposition/service/convention.py @@ -5,13 +5,20 @@ from osa.domain.deposition.model.convention import Convention from osa.domain.deposition.model.value import FileRequirements from osa.domain.deposition.port.convention_repository import 
ConventionRepository +from osa.domain.metadata.service.metadata import MetadataService from osa.domain.semantics.model.value import FieldDefinition from osa.domain.semantics.service.schema import SchemaService from osa.domain.shared.error import NotFoundError from osa.domain.shared.event import EventId from osa.domain.shared.model.hook import HookDefinition from osa.domain.shared.model.source import IngesterDefinition -from osa.domain.shared.model.srn import ConventionSRN, Domain, LocalId, Semver +from osa.domain.shared.model.srn import ( + ConventionSRN, + Domain, + LocalId, + SchemaIdentifier, + Semver, +) from osa.domain.shared.outbox import Outbox from osa.domain.shared.service import Service @@ -19,11 +26,13 @@ class ConventionService(Service): convention_repo: ConventionRepository schema_service: SchemaService + metadata_service: MetadataService outbox: Outbox node_domain: Domain async def create_convention( self, + id: SchemaIdentifier, title: str, version: str, schema: list[FieldDefinition], @@ -35,18 +44,27 @@ async def create_convention( """Create a convention with an inline schema. The schema is created as a separate Schema row internally, - and the convention references it via schema_srn. + and the convention references it via schema_id. Feature table creation is handled asynchronously by the CreateFeatureTables handler reacting to ConventionRegistered. """ # Create Schema row from inline field definitions created_schema = await self.schema_service.create_schema( + id=id, title=title, version=version, fields=schema, ) + # Create (or additively evolve) the typed metadata table in the same + # transaction — no async window where records can publish against a + # convention whose typed table doesn't exist yet. + await self.metadata_service.ensure_table( + schema_id=created_schema.id, + fields=created_schema.fields, + ) + srn = ConventionSRN( domain=self.node_domain, id=LocalId(str(uuid4())[:20]), @@ -56,7 +74,7 @@ async def create_convention( srn=srn, title=title, description=description, - schema_srn=created_schema.srn, + schema_id=created_schema.id, file_requirements=file_requirements, hooks=hooks or [], ingester=ingester, @@ -68,6 +86,8 @@ async def create_convention( ConventionRegistered( id=EventId(uuid4()), convention_srn=srn, + schema_id=created_schema.id, + schema_fields=created_schema.fields, hooks=convention.hooks, ) ) diff --git a/server/osa/domain/deposition/util/di/provider.py b/server/osa/domain/deposition/util/di/provider.py index 66ea290..a38e7a5 100644 --- a/server/osa/domain/deposition/util/di/provider.py +++ b/server/osa/domain/deposition/util/di/provider.py @@ -20,6 +20,7 @@ from osa.domain.deposition.query.list_depositions import ListDepositionsHandler from osa.domain.deposition.service.convention import ConventionService from osa.domain.deposition.service.deposition import DepositionService +from osa.domain.metadata.service.metadata import MetadataService from osa.domain.semantics.service.schema import SchemaService from osa.domain.shared.model.srn import Domain from osa.domain.shared.outbox import Outbox @@ -51,12 +52,14 @@ def get_convention_service( self, convention_repo: ConventionRepository, schema_service: SchemaService, + metadata_service: MetadataService, outbox: Outbox, config: Config, ) -> ConventionService: return ConventionService( convention_repo=convention_repo, schema_service=schema_service, + metadata_service=metadata_service, outbox=outbox, node_domain=Domain(config.domain), ) diff --git a/server/osa/domain/discovery/model/refs.py 
b/server/osa/domain/discovery/model/refs.py new file mode 100644 index 0000000..f5783a0 --- /dev/null +++ b/server/osa/domain/discovery/model/refs.py @@ -0,0 +1,78 @@ +"""Typed field references used inside Predicate.field. + +Two kinds of references are supported: + +- :class:`MetadataFieldRef` — resolves to a column in ``metadata._v``. +- :class:`FeatureFieldRef` — resolves to a column in ``features.``. + +Wire format is a dotted path (``metadata.`` or +``features..``). :func:`parse_field_ref` parses the wire form +into a typed reference and validates identifier shape. +""" + +from __future__ import annotations + +import re +from typing import Literal, Union + +from pydantic import BaseModel + +_IDENT = re.compile(r"^[a-z][a-z0-9_]*$") + + +class MetadataFieldRef(BaseModel): + path: Literal["metadata"] = "metadata" + field: str + + def dotted(self) -> str: + return f"metadata.{self.field}" + + +class FeatureFieldRef(BaseModel): + path: Literal["features"] = "features" + hook: str + column: str + + def dotted(self) -> str: + return f"features.{self.hook}.{self.column}" + + +FieldRef = Union[MetadataFieldRef, FeatureFieldRef] + + +def parse_field_ref(dotted: str) -> FieldRef: + """Parse a dotted-path field reference into its typed form. + + Raises :class:`ValueError` when the path shape or identifier doesn't match + the documented grammar. + """ + if not isinstance(dotted, str): + raise ValueError(f"Expected dotted string, got {type(dotted).__name__}") + + parts = dotted.split(".") + if not parts: + raise ValueError(f"Empty field reference: {dotted!r}") + + head = parts[0] + if head == "metadata": + if len(parts) != 2: + raise ValueError(f"metadata.* refs must be exactly two dotted parts, got {dotted!r}") + field = parts[1] + if not _IDENT.match(field): + raise ValueError(f"Invalid metadata field identifier: {field!r}") + return MetadataFieldRef(field=field) + + if head == "features": + if len(parts) != 3: + raise ValueError(f"features.* refs must be exactly three dotted parts, got {dotted!r}") + hook, column = parts[1], parts[2] + if not _IDENT.match(hook): + raise ValueError(f"Invalid hook identifier: {hook!r}") + if not _IDENT.match(column): + raise ValueError(f"Invalid feature column identifier: {column!r}") + return FeatureFieldRef(hook=hook, column=column) + + raise ValueError( + f"Unknown field reference prefix {head!r} in {dotted!r}. " + "Expected 'metadata.' or 'features..'." + ) diff --git a/server/osa/domain/discovery/model/value.py b/server/osa/domain/discovery/model/value.py index 1abab81..65549a0 100644 --- a/server/osa/domain/discovery/model/value.py +++ b/server/osa/domain/discovery/model/value.py @@ -1,4 +1,10 @@ -"""Discovery domain value objects — filters, cursors, result types.""" +"""Discovery domain value objects — filters, cursors, result types. + +Feature 076 replaces the flat ``Filter`` list with a compound ``FilterExpr`` +discriminated union (``And``/``Or``/``Not``/``Predicate``). Field references +inside predicates are typed (:class:`MetadataFieldRef` or +:class:`FeatureFieldRef`); the dotted wire form is parsed at the API boundary. 
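+
+Illustrative wire form (field names are hypothetical)::
+
+    {"kind": "and", "operands": [
+        {"kind": "predicate", "field": "metadata.organism", "op": "eq", "value": "E. coli"},
+        {"kind": "not", "operand": {"kind": "predicate",
+            "field": "features.binding_sites.score", "op": "gte", "value": 0.8}}]}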
+""" from __future__ import annotations @@ -6,19 +12,29 @@ import json from datetime import datetime from enum import StrEnum -from typing import Any +from typing import Annotated, Any, Literal, Union -from pydantic import BaseModel +from pydantic import BaseModel, Field, model_validator +from osa.domain.discovery.model.refs import ( + FeatureFieldRef, + MetadataFieldRef, + parse_field_ref, +) from osa.domain.semantics.model.value import FieldType from osa.domain.shared.model.srn import RecordSRN class FilterOperator(StrEnum): EQ = "eq" - CONTAINS = "contains" + NEQ = "neq" + GT = "gt" GTE = "gte" + LT = "lt" LTE = "lte" + IN = "in" + CONTAINS = "contains" + IS_NULL = "is_null" class SortOrder(StrEnum): @@ -26,19 +42,136 @@ class SortOrder(StrEnum): DESC = "desc" -class Filter(BaseModel): - field: str - operator: FilterOperator - value: str | float | bool +FieldRef = Annotated[ + Union[MetadataFieldRef, FeatureFieldRef], + Field(discriminator="path"), +] + + +PredicateValue = Union[str, int, float, bool, list[str], list[float], None] + + +class Predicate(BaseModel): + kind: Literal["predicate"] = "predicate" + field: FieldRef + op: FilterOperator + value: PredicateValue = None + + @model_validator(mode="before") + @classmethod + def _coerce_field(cls, data: Any) -> Any: + """Accept dotted-path strings for ``field`` and parse them into the typed form.""" + if isinstance(data, dict): + raw = data.get("field") + if isinstance(raw, str): + data = {**data, "field": parse_field_ref(raw)} + return data + + +class And(BaseModel): + kind: Literal["and"] = "and" + operands: list["FilterExpr"] = Field(min_length=2) +class Or(BaseModel): + kind: Literal["or"] = "or" + operands: list["FilterExpr"] = Field(min_length=2) + + +class Not(BaseModel): + kind: Literal["not"] = "not" + operand: "FilterExpr" + + +FilterExpr = Annotated[ + Union[And, Or, Not, Predicate], + Field(discriminator="kind"), +] + +# Resolve forward references +And.model_rebuild() +Or.model_rebuild() +Not.model_rebuild() + + +# Operators valid per column type for metadata/feature column validation. 
VALID_OPERATORS: dict[FieldType, set[FilterOperator]] = { - FieldType.TEXT: {FilterOperator.EQ, FilterOperator.CONTAINS}, - FieldType.URL: {FilterOperator.EQ, FilterOperator.CONTAINS}, - FieldType.NUMBER: {FilterOperator.EQ, FilterOperator.GTE, FilterOperator.LTE}, - FieldType.DATE: {FilterOperator.EQ, FilterOperator.GTE, FilterOperator.LTE}, - FieldType.BOOLEAN: {FilterOperator.EQ}, - FieldType.TERM: {FilterOperator.EQ}, + FieldType.TEXT: { + FilterOperator.EQ, + FilterOperator.NEQ, + FilterOperator.IN, + FilterOperator.CONTAINS, + FilterOperator.IS_NULL, + }, + FieldType.URL: { + FilterOperator.EQ, + FilterOperator.NEQ, + FilterOperator.IN, + FilterOperator.CONTAINS, + FilterOperator.IS_NULL, + }, + FieldType.TERM: { + FilterOperator.EQ, + FilterOperator.NEQ, + FilterOperator.IN, + FilterOperator.IS_NULL, + }, + FieldType.NUMBER: { + FilterOperator.EQ, + FilterOperator.NEQ, + FilterOperator.GT, + FilterOperator.GTE, + FilterOperator.LT, + FilterOperator.LTE, + FilterOperator.IN, + FilterOperator.IS_NULL, + }, + FieldType.DATE: { + FilterOperator.EQ, + FilterOperator.NEQ, + FilterOperator.GT, + FilterOperator.GTE, + FilterOperator.LT, + FilterOperator.LTE, + FilterOperator.IN, + FilterOperator.IS_NULL, + }, + FieldType.BOOLEAN: {FilterOperator.EQ, FilterOperator.IS_NULL}, +} + +# Operators valid against raw JSON-schema primitive types (used for feature columns +# whose Column.json_type is a JSON Schema primitive rather than a semantic FieldType). +JSON_TYPE_OPERATORS: dict[str, set[FilterOperator]] = { + "string": { + FilterOperator.EQ, + FilterOperator.NEQ, + FilterOperator.IN, + FilterOperator.CONTAINS, + FilterOperator.IS_NULL, + }, + "number": { + FilterOperator.EQ, + FilterOperator.NEQ, + FilterOperator.GT, + FilterOperator.GTE, + FilterOperator.LT, + FilterOperator.LTE, + FilterOperator.IN, + FilterOperator.IS_NULL, + }, + "integer": { + FilterOperator.EQ, + FilterOperator.NEQ, + FilterOperator.GT, + FilterOperator.GTE, + FilterOperator.LT, + FilterOperator.LTE, + FilterOperator.IN, + FilterOperator.IS_NULL, + }, + "boolean": {FilterOperator.EQ, FilterOperator.IS_NULL}, + "array": {FilterOperator.EQ, FilterOperator.IS_NULL}, + "object": {FilterOperator.EQ, FilterOperator.IS_NULL}, } diff --git a/server/osa/domain/discovery/port/field_definition_reader.py b/server/osa/domain/discovery/port/field_definition_reader.py index b763c8a..c2a6bfa 100644 --- a/server/osa/domain/discovery/port/field_definition_reader.py +++ b/server/osa/domain/discovery/port/field_definition_reader.py @@ -6,6 +6,7 @@ if TYPE_CHECKING: from osa.domain.semantics.model.value import FieldType + from osa.domain.shared.model.srn import SchemaId class FieldDefinitionReader(Protocol): @@ -15,3 +16,13 @@ async def get_all_field_types(self) -> dict[str, FieldType]: Raises ValidationError if same field name has conflicting types across schemas. """ ... + + async def get_fields_for_schema(self, schema_id: "SchemaId") -> dict[str, FieldType]: + """Return field_name -> FieldType for a specific schema's current major version. + + Returns an empty dict when the schema is unknown to the node. Callers + that treat "unknown schema" as an error condition must check for an + empty map and raise ``NotFoundError`` themselves — the port stays + neutral so that non-user-facing callers can handle absence explicitly. + """ + ... 
diff --git a/server/osa/domain/discovery/port/read_store.py b/server/osa/domain/discovery/port/read_store.py index 6ac054d..762364e 100644 --- a/server/osa/domain/discovery/port/read_store.py +++ b/server/osa/domain/discovery/port/read_store.py @@ -1,4 +1,4 @@ -"""DiscoveryReadStore port — read-only access to records and feature data.""" +"""DiscoveryReadStore port — read-only access to records, features, metadata.""" from __future__ import annotations @@ -8,49 +8,43 @@ from osa.domain.discovery.model.value import ( FeatureCatalogEntry, FeatureRow, - Filter, + FilterExpr, RecordSummary, SortOrder, ) from osa.domain.semantics.model.value import FieldType - from osa.domain.shared.model.srn import RecordSRN + from osa.domain.shared.model.srn import ConventionSRN, RecordSRN, SchemaId class DiscoveryReadStore(Protocol): async def search_records( self, - filters: list[Filter], + filter_expr: "FilterExpr | None", + schema_id: "SchemaId | None", + convention_srn: "ConventionSRN | None", text_fields: list[str], q: str | None, sort: str, - order: SortOrder, + order: "SortOrder", cursor: dict | None, limit: int, - field_types: dict[str, FieldType] | None = None, - ) -> list[RecordSummary]: - """Search and filter published records.""" + field_types: "dict[str, FieldType] | None" = None, + ) -> "list[RecordSummary]": + """Search published records with a compound filter.""" ... - async def get_feature_catalog(self) -> list[FeatureCatalogEntry]: - """List all feature tables with column schemas and record counts.""" - ... - - async def get_feature_table_schema(self, hook_name: str) -> FeatureCatalogEntry | None: - """Look up a single feature table's schema by hook name. + async def get_feature_catalog(self) -> "list[FeatureCatalogEntry]": ... - Returns None if the hook_name is not found. - """ - ... + async def get_feature_table_schema(self, hook_name: str) -> "FeatureCatalogEntry | None": ... async def search_features( self, hook_name: str, - filters: list[Filter], - record_srn: RecordSRN | None, + filter_expr: "FilterExpr | None", + schema_id: "SchemaId | None", + record_srn: "RecordSRN | None", sort: str, - order: SortOrder, + order: "SortOrder", cursor: dict | None, limit: int, - ) -> list[FeatureRow]: - """Search and filter feature rows.""" - ... + ) -> "list[FeatureRow]": ... 
diff --git a/server/osa/domain/discovery/query/search_features.py b/server/osa/domain/discovery/query/search_features.py index 4019dcf..42dde9a 100644 --- a/server/osa/domain/discovery/query/search_features.py +++ b/server/osa/domain/discovery/query/search_features.py @@ -2,19 +2,20 @@ from osa.domain.discovery.model.value import ( FeatureSearchResult, - Filter, + FilterExpr, SortOrder, ) from osa.domain.discovery.service.discovery import DiscoveryService from osa.domain.shared.authorization.gate import public from osa.domain.shared.error import ValidationError -from osa.domain.shared.model.srn import RecordSRN +from osa.domain.shared.model.srn import RecordSRN, SchemaId from osa.domain.shared.query import Query, QueryHandler, Result class SearchFeatures(Query): hook_name: str - filters: list[Filter] = [] + filter_expr: FilterExpr | None = None + schema_id: SchemaId | None = None record_srn: str | None = None sort: str = "id" order: SortOrder = SortOrder.DESC @@ -41,7 +42,8 @@ async def run(self, cmd: SearchFeatures) -> SearchFeaturesResult: raise ValidationError(str(exc), field="record_srn") from exc result: FeatureSearchResult = await self.discovery_service.search_features( hook_name=cmd.hook_name, - filters=cmd.filters, + filter_expr=cmd.filter_expr, + schema_id=cmd.schema_id, record_srn=record_srn, sort=cmd.sort, order=cmd.order, diff --git a/server/osa/domain/discovery/query/search_records.py b/server/osa/domain/discovery/query/search_records.py index eed8957..515d980 100644 --- a/server/osa/domain/discovery/query/search_records.py +++ b/server/osa/domain/discovery/query/search_records.py @@ -1,17 +1,22 @@ """SearchRecords query — search and filter published records.""" +from typing import Any + from osa.domain.discovery.model.value import ( - Filter, + FilterExpr, RecordSearchResult, SortOrder, ) from osa.domain.discovery.service.discovery import DiscoveryService from osa.domain.shared.authorization.gate import public +from osa.domain.shared.model.srn import ConventionSRN, SchemaId from osa.domain.shared.query import Query, QueryHandler, Result class SearchRecords(Query): - filters: list[Filter] = [] + filter_expr: FilterExpr | None = None + schema_id: SchemaId | None = None + convention_srn: ConventionSRN | None = None q: str | None = None sort: str = "published_at" order: SortOrder = SortOrder.DESC @@ -20,7 +25,7 @@ class SearchRecords(Query): class SearchRecordsResult(Result): - results: list[dict] + results: list[dict[str, Any]] cursor: str | None has_more: bool @@ -31,7 +36,9 @@ class SearchRecordsHandler(QueryHandler[SearchRecords, SearchRecordsResult]): async def run(self, cmd: SearchRecords) -> SearchRecordsResult: result: RecordSearchResult = await self.discovery_service.search_records( - filters=cmd.filters, + filter_expr=cmd.filter_expr, + schema_id=cmd.schema_id, + convention_srn=cmd.convention_srn, q=cmd.q, sort=cmd.sort, order=cmd.order, diff --git a/server/osa/domain/discovery/service/discovery.py b/server/osa/domain/discovery/service/discovery.py index 5d9bab0..cd8db52 100644 --- a/server/osa/domain/discovery/service/discovery.py +++ b/server/osa/domain/discovery/service/discovery.py @@ -1,15 +1,30 @@ -"""DiscoveryService — read-only business logic for record and feature search.""" +"""DiscoveryService — read-only business logic for record and feature search. + +Validates the compound ``FilterExpr`` tree (bounds, field resolution, operator +compatibility) before handing it to the read store for SQL compilation. 
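+
+Bounds come from ``Config`` and default to a tree depth of 10, 200 predicate
+leaves, and 10 distinct feature hooks; exceeding any bound raises
+``ValidationError`` before the read store is touched.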
+""" from __future__ import annotations import logging +from typing import Any +from osa.config import Config +from osa.domain.discovery.model.refs import ( + FeatureFieldRef, + MetadataFieldRef, +) from osa.domain.discovery.model.value import ( + JSON_TYPE_OPERATORS, VALID_OPERATORS, + And, FeatureCatalog, FeatureSearchResult, - Filter, + FilterExpr, FilterOperator, + Not, + Or, + Predicate, RecordSearchResult, SortOrder, decode_cursor, @@ -19,7 +34,7 @@ from osa.domain.discovery.port.read_store import DiscoveryReadStore from osa.domain.semantics.model.value import FieldType from osa.domain.shared.error import NotFoundError, ValidationError -from osa.domain.shared.model.srn import RecordSRN +from osa.domain.shared.model.srn import ConventionSRN, RecordSRN, SchemaId from osa.domain.shared.service import Service logger = logging.getLogger(__name__) @@ -30,72 +45,96 @@ class DiscoveryService(Service): read_store: DiscoveryReadStore field_reader: FieldDefinitionReader + config: Config async def search_records( self, - filters: list[Filter], + filter_expr: FilterExpr | None, + schema_id: SchemaId | None, + convention_srn: ConventionSRN | None, q: str | None, sort: str, order: SortOrder, cursor: str | None, limit: int, + *, + allow_compound: bool = True, ) -> RecordSearchResult: - """Validate inputs and delegate record search to the read store.""" + """Validate the filter tree and delegate record search to the read store. + + ``allow_compound`` is a staged flag — US1 delivers AND-only + Predicate + support; US2 flips this to allow OR/NOT. Callers should leave it True + once US2 lands. + """ if limit < 1 or limit > 100: raise ValidationError("limit must be between 1 and 100", field="limit") - field_map = await self.field_reader.get_all_field_types() + if sort != "published_at" and schema_id is None: + raise ValidationError( + f"Sorting by '{sort}' requires the request to pin a 'schema' " + "('@'). Plain listings must sort by 'published_at'.", + field="sort", + code="schema_required_for_metadata_sort", + ) - # Validate filter fields and operators - for f in filters: - if f.field not in field_map: - raise ValidationError( - f"Unknown field '{f.field}': not defined in any registered schema", - field=f.field, - ) - field_type = field_map[f.field] - valid_ops = VALID_OPERATORS[field_type] - if f.operator not in valid_ops: - raise ValidationError( - f"Operator '{f.operator}' is not valid for field '{f.field}' " - f"(type '{field_type}'). Valid: {sorted(valid_ops)}", - field=f.field, + if q and schema_id is None: + raise ValidationError( + "Free-text search ('q') requires the request to pin a 'schema' " + "('@'). Without a schema, the server cannot resolve " + "which metadata fields are text-indexed.", + field="q", + code="schema_required_for_free_text_search", + ) + + schema_field_map: dict[str, FieldType] = {} + if schema_id is not None: + schema_field_map = await self.field_reader.get_fields_for_schema(schema_id) + if not schema_field_map: + raise NotFoundError( + f"Schema not found: {schema_id.render()}. " + "Pin an '@' that matches a registered schema." 
) - # Validate sort field - if sort != "published_at" and sort not in field_map: + if filter_expr is not None: + self._validate_tree(filter_expr, allow_compound=allow_compound) + await self._validate_refs(filter_expr, schema_id, schema_field_map) + + # Sort field validation (against pinned schema) + if sort != "published_at" and sort not in schema_field_map: raise ValidationError( - f"Unknown sort field '{sort}': not defined in any registered schema", + f"Unknown sort field '{sort}': not defined in the pinned schema.", field="sort", + code="unknown_sort_field", ) - # Decode cursor - decoded_cursor = None + decoded_cursor: dict[str, Any] | None = None if cursor is not None: try: decoded_cursor = decode_cursor(cursor) except ValueError as exc: raise ValidationError(str(exc), field="cursor") from exc - # Identify text-searchable fields for free-text q text_fields = [ - name for name, ft in field_map.items() if ft in (FieldType.TEXT, FieldType.URL) + name for name, ft in schema_field_map.items() if ft in (FieldType.TEXT, FieldType.URL) ] if q and not text_fields: raise ValidationError( - "Free-text search is unavailable: no text or URL fields are registered", + "Free-text search is unavailable: the pinned schema defines no text or URL fields.", field="q", + code="no_text_fields_in_schema", ) results = await self.read_store.search_records( - filters=filters, + filter_expr=filter_expr, + schema_id=schema_id, + convention_srn=convention_srn, text_fields=text_fields, q=q, sort=sort, order=order, cursor=decoded_cursor, limit=limit + 1, - field_types=field_map, + field_types=schema_field_map, ) has_more = len(results) > limit @@ -116,68 +155,56 @@ async def search_records( ) async def get_feature_catalog(self) -> FeatureCatalog: - """Delegate feature catalog listing to the read store.""" entries = await self.read_store.get_feature_catalog() return FeatureCatalog(tables=entries) async def search_features( self, hook_name: str, - filters: list[Filter], + filter_expr: FilterExpr | None, + schema_id: SchemaId | None, record_srn: RecordSRN | None, sort: str, order: SortOrder, cursor: str | None, limit: int, + *, + allow_compound: bool = True, ) -> FeatureSearchResult: - """Validate inputs and delegate feature search to the read store.""" if limit < 1 or limit > 100: raise ValidationError("limit must be between 1 and 100", field="limit") - # Look up the feature table schema entry = await self.read_store.get_feature_table_schema(hook_name) if entry is None: raise NotFoundError(f"Feature table not found: {hook_name}") - # Build column type map from catalog schema col_map: dict[str, str] = {col.name: col.type for col in entry.columns} - # Also allow sort/filter on record_srn col_map["record_srn"] = "string" - # Map JSON types to FieldType equivalents for operator validation - json_type_to_ops: dict[str, set[FilterOperator]] = { - "string": {FilterOperator.EQ, FilterOperator.CONTAINS}, - "number": {FilterOperator.EQ, FilterOperator.GTE, FilterOperator.LTE}, - "integer": {FilterOperator.EQ, FilterOperator.GTE, FilterOperator.LTE}, - "boolean": {FilterOperator.EQ}, - "array": {FilterOperator.EQ}, - "object": {FilterOperator.EQ}, - } - - # Validate filters - for f in filters: - if f.field not in col_map: - raise ValidationError( - f"Unknown column '{f.field}' in feature table '{hook_name}'", - field=f.field, - ) - json_type = col_map[f.field] - valid_ops = json_type_to_ops.get(json_type, {FilterOperator.EQ}) - if f.operator not in valid_ops: - raise ValidationError( - f"Operator '{f.operator}' is not valid for 
column '{f.field}' " - f"(type '{json_type}'). Valid: {sorted(valid_ops)}", - field=f.field, + schema_field_map: dict[str, FieldType] = {} + if schema_id is not None: + schema_field_map = await self.field_reader.get_fields_for_schema(schema_id) + if not schema_field_map: + raise NotFoundError( + f"Schema not found: {schema_id.render()}. " + "Pin an '@' that matches a registered schema." ) - # Validate sort column + if filter_expr is not None: + self._validate_tree(filter_expr, allow_compound=allow_compound) + self._validate_feature_refs( + filter_expr, + this_hook=hook_name, + feature_col_map=col_map, + schema_field_map=schema_field_map, + ) + if sort != "id" and sort not in col_map: raise ValidationError( f"Unknown sort column '{sort}' in feature table '{hook_name}'", field="sort", ) - # Decode cursor try: decoded_cursor = decode_cursor(cursor) if cursor else None except ValueError as exc: @@ -185,7 +212,8 @@ async def search_features( rows = await self.read_store.search_features( hook_name=hook_name, - filters=filters, + filter_expr=filter_expr, + schema_id=schema_id, record_srn=record_srn, sort=sort, order=order, @@ -204,8 +232,192 @@ async def search_features( sort_val = last.data.get(sort) next_cursor = encode_cursor(sort_val, last.row_id) - return FeatureSearchResult( - rows=rows, - cursor=next_cursor, - has_more=has_more, - ) + return FeatureSearchResult(rows=rows, cursor=next_cursor, has_more=has_more) + + # ------------------------- internal helpers ------------------------- + + def _validate_tree(self, expr: FilterExpr, *, allow_compound: bool) -> None: + """Enforce tree bounds (depth, predicate count, joins) + compound gating.""" + depth = _tree_depth(expr) + predicates = list(_iter_predicates(expr)) + + if depth > self.config.discovery_max_filter_depth: + raise ValidationError( + f"Filter tree depth {depth} exceeds configured maximum " + f"{self.config.discovery_max_filter_depth} (OSA_DISCOVERY_MAX_FILTER_DEPTH).", + field="filter", + code="filter_depth_exceeded", + ) + if len(predicates) > self.config.discovery_max_predicates: + raise ValidationError( + f"Filter tree has {len(predicates)} predicate leaves, exceeds " + f"configured maximum {self.config.discovery_max_predicates} " + "(OSA_DISCOVERY_MAX_PREDICATES).", + field="filter", + code="filter_predicates_exceeded", + ) + + distinct_hooks: set[str] = set() + for p in predicates: + if isinstance(p.field, FeatureFieldRef): + distinct_hooks.add(p.field.hook) + if len(distinct_hooks) > self.config.discovery_max_cross_domain_joins: + raise ValidationError( + f"Filter tree joins {len(distinct_hooks)} distinct feature hooks, " + f"exceeds configured maximum " + f"{self.config.discovery_max_cross_domain_joins} " + "(OSA_DISCOVERY_MAX_CROSS_DOMAIN_JOINS).", + field="filter", + code="filter_joins_exceeded", + ) + + if not allow_compound: + for node in _iter_nodes(expr): + if isinstance(node, (Or, Not)): + raise ValidationError( + "Compound OR/NOT filters are not enabled in this build.", + field="filter", + code="compound_disabled", + ) + + async def _validate_refs( + self, + expr: FilterExpr, + schema_id: SchemaId | None, + field_map: dict[str, FieldType], + ) -> None: + """Resolve each predicate's field and check operator compatibility.""" + feature_catalog: dict[str, dict[str, str]] | None = None + for p in _iter_predicates(expr): + if isinstance(p.field, MetadataFieldRef): + if schema_id is None: + raise ValidationError( + f"Metadata predicate on {p.field.dotted()!r} requires " + "the request to pin a 'schema' ('@'). 
" + "Unscoped metadata filtering is not supported — the typed " + "metadata table is the only filter path.", + field=p.field.dotted(), + code="schema_required_for_metadata_query", + ) + field_name = p.field.field + if field_name not in field_map: + raise ValidationError( + f"Unknown metadata field '{field_name}' for the pinned schema.", + field=p.field.dotted(), + code="unknown_field", + ) + self._check_operator_for_field_type( + p, field_type=field_map[field_name], path=p.field.dotted() + ) + elif isinstance(p.field, FeatureFieldRef): + if feature_catalog is None: + feature_catalog = await self._load_feature_catalog() + cols = feature_catalog.get(p.field.hook) + if cols is None: + raise ValidationError( + f"Unknown feature hook '{p.field.hook}'.", + field=p.field.dotted(), + code="unknown_hook", + ) + if p.field.column not in cols: + raise ValidationError( + f"Unknown feature column '{p.field.column}' on hook '{p.field.hook}'.", + field=p.field.dotted(), + code="unknown_field", + ) + json_type = cols[p.field.column] + self._check_operator_for_json_type(p, json_type=json_type, path=p.field.dotted()) + + def _validate_feature_refs( + self, + expr: FilterExpr, + *, + this_hook: str, + feature_col_map: dict[str, str], + schema_field_map: dict[str, FieldType], + ) -> None: + """Variant of ref validation for feature search — local hook columns by default.""" + for p in _iter_predicates(expr): + if isinstance(p.field, MetadataFieldRef): + if p.field.field not in schema_field_map: + raise ValidationError( + f"Unknown metadata field '{p.field.field}' for the provided schema.", + field=p.field.dotted(), + code="unknown_field", + ) + self._check_operator_for_field_type( + p, field_type=schema_field_map[p.field.field], path=p.field.dotted() + ) + elif isinstance(p.field, FeatureFieldRef): + if p.field.hook != this_hook: + # Cross-hook joins handled by US3 — accepted here, resolved in adapter. + continue + if p.field.column not in feature_col_map: + raise ValidationError( + f"Unknown feature column '{p.field.column}' on hook '{this_hook}'.", + field=p.field.dotted(), + code="unknown_field", + ) + self._check_operator_for_json_type( + p, json_type=feature_col_map[p.field.column], path=p.field.dotted() + ) + + async def _load_feature_catalog(self) -> dict[str, dict[str, str]]: + """Build hook_name → {column_name → json_type} map from the catalog.""" + catalog = await self.read_store.get_feature_catalog() + return {entry.hook_name: {col.name: col.type for col in entry.columns} for entry in catalog} + + @staticmethod + def _check_operator_for_field_type( + predicate: Predicate, *, field_type: FieldType, path: str + ) -> None: + valid = VALID_OPERATORS.get(field_type, set()) + if predicate.op not in valid: + raise ValidationError( + f"Operator '{predicate.op}' is not valid for field '{path}' " + f"(type '{field_type}'). Valid: {sorted(valid)}.", + field=path, + code="operator_not_valid_for_type", + ) + + @staticmethod + def _check_operator_for_json_type(predicate: Predicate, *, json_type: str, path: str) -> None: + valid = JSON_TYPE_OPERATORS.get(json_type, {FilterOperator.EQ}) + if predicate.op not in valid: + raise ValidationError( + f"Operator '{predicate.op}' is not valid for column '{path}' " + f"(json type '{json_type}'). 
Valid: {sorted(valid)}.", + field=path, + code="operator_not_valid_for_type", + ) + + +def _tree_depth(expr: FilterExpr) -> int: + if isinstance(expr, Predicate): + return 1 + if isinstance(expr, Not): + return 1 + _tree_depth(expr.operand) + if isinstance(expr, (And, Or)): + return 1 + max(_tree_depth(op) for op in expr.operands) + return 1 + + +def _iter_predicates(expr: FilterExpr): + if isinstance(expr, Predicate): + yield expr + return + if isinstance(expr, Not): + yield from _iter_predicates(expr.operand) + return + if isinstance(expr, (And, Or)): + for op in expr.operands: + yield from _iter_predicates(op) + + +def _iter_nodes(expr: FilterExpr): + yield expr + if isinstance(expr, Not): + yield from _iter_nodes(expr.operand) + elif isinstance(expr, (And, Or)): + for op in expr.operands: + yield from _iter_nodes(op) diff --git a/server/osa/domain/discovery/util/di/provider.py b/server/osa/domain/discovery/util/di/provider.py index 325367f..9715b8c 100644 --- a/server/osa/domain/discovery/util/di/provider.py +++ b/server/osa/domain/discovery/util/di/provider.py @@ -2,6 +2,7 @@ from dishka import provide +from osa.config import Config from osa.domain.discovery.port.field_definition_reader import FieldDefinitionReader from osa.domain.discovery.port.read_store import DiscoveryReadStore from osa.domain.discovery.query.get_feature_catalog import GetFeatureCatalogHandler @@ -18,8 +19,13 @@ def get_discovery_service( self, read_store: DiscoveryReadStore, field_reader: FieldDefinitionReader, + config: Config, ) -> DiscoveryService: - return DiscoveryService(read_store=read_store, field_reader=field_reader) + return DiscoveryService( + read_store=read_store, + field_reader=field_reader, + config=config, + ) # Query Handlers search_records_handler = provide(SearchRecordsHandler, scope=Scope.UOW) diff --git a/server/osa/domain/feature/event/__init__.py b/server/osa/domain/feature/event/__init__.py index 32ca6a4..8112dda 100644 --- a/server/osa/domain/feature/event/__init__.py +++ b/server/osa/domain/feature/event/__init__.py @@ -1,5 +1,3 @@ """Feature domain events.""" -from osa.domain.feature.event.convention_ready import ConventionReady - -__all__ = ["ConventionReady"] +__all__: list[str] = [] diff --git a/server/osa/domain/feature/event/convention_ready.py b/server/osa/domain/feature/event/convention_ready.py deleted file mode 100644 index 42b627e..0000000 --- a/server/osa/domain/feature/event/convention_ready.py +++ /dev/null @@ -1,14 +0,0 @@ -"""ConventionReady event — emitted after feature tables are created for a convention.""" - -from osa.domain.shared.event import Event, EventId -from osa.domain.shared.model.srn import ConventionSRN - - -class ConventionReady(Event): - """Emitted when feature tables have been created for a convention. - - Downstream handlers react to this knowing that feature tables are ready. 
- """ - - id: EventId - convention_srn: ConventionSRN diff --git a/server/osa/domain/feature/handler/create_feature_tables.py b/server/osa/domain/feature/handler/create_feature_tables.py index 2dd815b..0ad16a4 100644 --- a/server/osa/domain/feature/handler/create_feature_tables.py +++ b/server/osa/domain/feature/handler/create_feature_tables.py @@ -1,27 +1,24 @@ """CreateFeatureTables — creates feature tables when a convention is registered.""" import logging -from uuid import uuid4 from osa.domain.deposition.event.convention_registered import ConventionRegistered -from osa.domain.feature.event.convention_ready import ConventionReady from osa.domain.feature.service.feature import FeatureService from osa.domain.shared.error import ConflictError -from osa.domain.shared.event import EventHandler, EventId -from osa.domain.shared.outbox import Outbox +from osa.domain.shared.event import EventHandler logger = logging.getLogger(__name__) class CreateFeatureTables(EventHandler[ConventionRegistered]): - """Creates feature tables for each hook and emits ConventionReady. + """Creates feature tables for each hook declared on a registered convention. - Part of the convention initialization chain: - ConventionRegistered → CreateFeatureTables → ConventionReady + Readiness is not signalled via a follow-on event — consumers check the + ``feature_tables`` + ``metadata_tables`` catalogs at read time instead + (research.md §11). """ feature_service: FeatureService - outbox: Outbox async def handle(self, event: ConventionRegistered) -> None: for hook in event.hooks: @@ -38,11 +35,3 @@ async def handle(self, event: ConventionRegistered) -> None: hook.name, event.convention_srn, ) - - await self.outbox.append( - ConventionReady( - id=EventId(uuid4()), - convention_srn=event.convention_srn, - ) - ) - logger.info("Convention ready: %s", event.convention_srn) diff --git a/server/osa/domain/metadata/__init__.py b/server/osa/domain/metadata/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/server/osa/domain/metadata/event/__init__.py b/server/osa/domain/metadata/event/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/server/osa/domain/metadata/handler/__init__.py b/server/osa/domain/metadata/handler/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/server/osa/domain/metadata/model/__init__.py b/server/osa/domain/metadata/model/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/server/osa/domain/metadata/model/value.py b/server/osa/domain/metadata/model/value.py new file mode 100644 index 0000000..d220ab1 --- /dev/null +++ b/server/osa/domain/metadata/model/value.py @@ -0,0 +1,16 @@ +"""Metadata domain value objects — MetadataSchema, slug helpers.""" + +from __future__ import annotations + +from osa.domain.shared.model.hook import ColumnDef +from osa.domain.shared.model.value import ValueObject + + +class MetadataSchema(ValueObject): + """Typed projection of a Schema into dynamic-column form. + + Mirrors :class:`FeatureSchema` — serialised into the catalog row's + ``metadata_schema`` JSONB column and rehydrated on subsequent reads. 
+ """ + + columns: list[ColumnDef] = [] diff --git a/server/osa/domain/metadata/port/__init__.py b/server/osa/domain/metadata/port/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/server/osa/domain/metadata/port/metadata_store.py b/server/osa/domain/metadata/port/metadata_store.py new file mode 100644 index 0000000..cec1d74 --- /dev/null +++ b/server/osa/domain/metadata/port/metadata_store.py @@ -0,0 +1,55 @@ +"""MetadataStore port — DDL + DML for typed per-schema metadata tables.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, Protocol + +if TYPE_CHECKING: + from osa.domain.semantics.model.value import FieldDefinition + from osa.domain.shared.model.srn import RecordSRN, SchemaId + + +class MetadataStore(Protocol): + """Port owned by the metadata domain. + + Implementations are responsible for: + - Creating the ``metadata._v`` table on first + registration for a ``(schema_id, major)`` pair. + - Additively ALTER ADD COLUMN when the schema bumps (minor/patch) with + new optional fields. + - Appending version lineage into the catalog's ``schema_versions`` list. + - Idempotent UPSERT of a row keyed on ``record_srn``. + """ + + async def ensure_table( + self, + schema_id: "SchemaId", + fields: "list[FieldDefinition]", + ) -> None: + """Create or additively evolve the typed metadata table for a schema. + + The PG table slug is derived from ``schema_id.id.root`` — the schema's + human-readable slug is the single source of truth for the storage name. + """ + ... + + async def insert( + self, + schema_id: "SchemaId", + record_srn: "RecordSRN", + values: dict[str, Any], + ) -> None: + """Upsert a record's typed metadata row into the schema's table.""" + ... + + async def insert_many( + self, + schema_id: "SchemaId", + rows: "list[tuple[RecordSRN, dict[str, Any]]]", + ) -> None: + """Bulk upsert typed metadata rows — one multi-row SQL statement. + + All rows must belong to the same schema; callers group by schema_id + before calling. Empty ``rows`` is a no-op. + """ + ... 
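For orientation, the statement an adapter could emit for ``insert_many`` looks roughly like the sketch below. This is a sketch only, not code from this change: the ``ON CONFLICT (record_srn) DO UPDATE`` shape follows the port's idempotent-upsert-keyed-on-``record_srn`` contract, while the ``metadata."<slug>_v<major>"`` table name, the helper name, and its signature are assumptions made for illustration.

    # Sketch only: illustrates the multi-row UPSERT contract of insert_many.
    # The table-name convention and the helper itself are assumptions, not
    # the adapter's actual implementation.
    from __future__ import annotations


    def sketch_upsert_sql(pg_table: str, columns: list[str], row_count: int) -> str:
        """Build a parameterised multi-row UPSERT keyed on record_srn."""
        all_cols = ["record_srn", *columns]
        col_list = ", ".join(all_cols)
        values = ", ".join(
            "(" + ", ".join(f":{c}_{i}" for c in all_cols) + ")" for i in range(row_count)
        )
        updates = ", ".join(f"{c} = EXCLUDED.{c}" for c in columns)
        return (
            f'INSERT INTO metadata."{pg_table}" ({col_list}) '
            f"VALUES {values} "
            f"ON CONFLICT (record_srn) DO UPDATE SET {updates}"
        )


    # e.g. sketch_upsert_sql("pdb_structure_v1", ["resolution", "method"], row_count=2)

Grouping rows by schema before calling keeps each generated statement single-table, which is why the port requires all rows passed to ``insert_many`` to share one ``schema_id``.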
diff --git a/server/osa/domain/metadata/service/__init__.py b/server/osa/domain/metadata/service/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/server/osa/domain/metadata/service/metadata.py b/server/osa/domain/metadata/service/metadata.py new file mode 100644 index 0000000..8bff8fa --- /dev/null +++ b/server/osa/domain/metadata/service/metadata.py @@ -0,0 +1,38 @@ +"""MetadataService — thin delegator over the MetadataStore port.""" + +from __future__ import annotations + +from typing import Any + +from osa.domain.metadata.port.metadata_store import MetadataStore +from osa.domain.semantics.model.value import FieldDefinition +from osa.domain.shared.model.srn import RecordSRN, SchemaId +from osa.domain.shared.service import Service + + +class MetadataService(Service): + """Creates/evolves typed metadata tables and inserts record metadata.""" + + metadata_store: MetadataStore + + async def ensure_table( + self, + schema_id: SchemaId, + fields: list[FieldDefinition], + ) -> None: + await self.metadata_store.ensure_table(schema_id, fields) + + async def insert( + self, + schema_id: SchemaId, + record_srn: RecordSRN, + values: dict[str, Any], + ) -> None: + await self.metadata_store.insert(schema_id, record_srn, values) + + async def insert_many( + self, + schema_id: SchemaId, + rows: list[tuple[RecordSRN, dict[str, Any]]], + ) -> None: + await self.metadata_store.insert_many(schema_id, rows) diff --git a/server/osa/domain/metadata/util/__init__.py b/server/osa/domain/metadata/util/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/server/osa/domain/metadata/util/di/__init__.py b/server/osa/domain/metadata/util/di/__init__.py new file mode 100644 index 0000000..1013d4d --- /dev/null +++ b/server/osa/domain/metadata/util/di/__init__.py @@ -0,0 +1,3 @@ +from osa.domain.metadata.util.di.provider import MetadataProvider + +__all__ = ["MetadataProvider"] diff --git a/server/osa/domain/metadata/util/di/provider.py b/server/osa/domain/metadata/util/di/provider.py new file mode 100644 index 0000000..379f23b --- /dev/null +++ b/server/osa/domain/metadata/util/di/provider.py @@ -0,0 +1,11 @@ +"""DI provider for the metadata bounded context.""" + +from dishka import provide + +from osa.domain.metadata.service.metadata import MetadataService +from osa.util.di.base import Provider +from osa.util.di.scope import Scope + + +class MetadataProvider(Provider): + service = provide(MetadataService, scope=Scope.UOW) diff --git a/server/osa/domain/record/event/record_published.py b/server/osa/domain/record/event/record_published.py index 2c6dc62..6cb1637 100644 --- a/server/osa/domain/record/event/record_published.py +++ b/server/osa/domain/record/event/record_published.py @@ -4,20 +4,20 @@ from osa.domain.shared.event import Event, EventId from osa.domain.shared.model.source import RecordSource -from osa.domain.shared.model.srn import ConventionSRN, RecordSRN +from osa.domain.shared.model.srn import ConventionSRN, RecordSRN, SchemaId class RecordPublished(Event): """Emitted when a record is published and ready for indexing. - Enriched with source, convention_srn, and expected_features so downstream - consumers (feature insertion, indexing) can operate without querying - record/convention repositories. + Carries ``schema_id`` so downstream consumers (metadata insertion, + indexing) operate in terms of short-form identity rather than full URNs. 
""" id: EventId record_srn: RecordSRN source: RecordSource convention_srn: ConventionSRN + schema_id: SchemaId metadata: dict[str, Any] expected_features: list[str] = [] diff --git a/server/osa/domain/record/model/aggregate.py b/server/osa/domain/record/model/aggregate.py index 69e5575..8b2f491 100644 --- a/server/osa/domain/record/model/aggregate.py +++ b/server/osa/domain/record/model/aggregate.py @@ -3,9 +3,11 @@ from datetime import datetime from typing import Any +from pydantic import Field + from osa.domain.shared.model.aggregate import Aggregate from osa.domain.shared.model.source import RecordSource -from osa.domain.shared.model.srn import ConventionSRN, RecordSRN +from osa.domain.shared.model.srn import ConventionSRN, RecordSRN, SchemaId class Record(Aggregate): @@ -14,5 +16,6 @@ class Record(Aggregate): srn: RecordSRN source: RecordSource convention_srn: ConventionSRN + schema_id: SchemaId = Field(frozen=True) metadata: dict[str, Any] published_at: datetime diff --git a/server/osa/domain/record/service/record.py b/server/osa/domain/record/service/record.py index e2409ce..e84512b 100644 --- a/server/osa/domain/record/service/record.py +++ b/server/osa/domain/record/service/record.py @@ -7,6 +7,8 @@ from typing import TYPE_CHECKING, Any from uuid import uuid4 +from osa.domain.deposition.port.convention_repository import ConventionRepository +from osa.domain.metadata.service.metadata import MetadataService from osa.domain.record.event.record_published import RecordPublished from osa.domain.record.model.aggregate import Record from osa.domain.record.model.draft import RecordDraft @@ -14,10 +16,12 @@ from osa.domain.shared.error import NotFoundError from osa.domain.shared.event import EventId from osa.domain.shared.model.srn import ( + ConventionSRN, Domain, LocalId, RecordSRN, RecordVersion, + SchemaId, ) from osa.domain.shared.outbox import Outbox from osa.domain.shared.service import Service @@ -32,6 +36,8 @@ class RecordService(Service): """Creates and persists Record aggregates from any source.""" record_repo: RecordRepository + convention_repo: ConventionRepository + metadata_service: MetadataService outbox: Outbox node_domain: Domain feature_reader: FeatureReader @@ -49,6 +55,13 @@ async def get(self, srn: RecordSRN) -> Record: raise NotFoundError(f"Record not found: {srn}") return record + async def _resolve_schema_id(self, convention_srn: ConventionSRN) -> SchemaId: + """Resolve a convention to its schema id at publication time.""" + convention = await self.convention_repo.get(convention_srn) + if convention is None: + raise NotFoundError(f"Convention not found: {convention_srn}") + return convention.schema_id + async def bulk_publish(self, drafts: list[RecordDraft]) -> list[Record]: """Bulk-publish records from an ingest batch. @@ -59,8 +72,15 @@ async def bulk_publish(self, drafts: list[RecordDraft]) -> list[Record]: if not drafts: return [] + # All drafts in a batch target the same convention (caller contract); + # resolve schema_id once. 
+ schema_id_by_conv: dict[str, SchemaId] = {} + records: list[Record] = [] for draft in drafts: + key = str(draft.convention_srn) + if key not in schema_id_by_conv: + schema_id_by_conv[key] = await self._resolve_schema_id(draft.convention_srn) record_srn = RecordSRN( domain=self.node_domain, id=LocalId(str(uuid4())), @@ -71,18 +91,34 @@ async def bulk_publish(self, drafts: list[RecordDraft]) -> list[Record]: srn=record_srn, source=draft.source, convention_srn=draft.convention_srn, + schema_id=schema_id_by_conv[key], metadata=draft.metadata, published_at=datetime.now(UTC), ) ) published = await self.record_repo.save_many(records) + + # Dual-write typed metadata projection in the same transaction. + # Group by schema_id — each schema has its own typed table. Use the + # rendered string as the dict key because SchemaId holds unhashable + # RootModel fields (LocalId, Semver). + by_schema: dict[str, tuple[SchemaId, list[tuple[RecordSRN, dict[str, Any]]]]] = {} + for r in published: + key = r.schema_id.render() + entry = by_schema.setdefault(key, (r.schema_id, [])) + entry[1].append((r.srn, r.metadata)) + for schema_id, typed_rows in by_schema.values(): + await self.metadata_service.insert_many(schema_id, typed_rows) + return published async def publish_record(self, draft: RecordDraft) -> Record: """Create and persist a Record from a draft.""" logger.info(f"Creating record from {draft.source.type} source: {draft.source.id}") + schema_id = await self._resolve_schema_id(draft.convention_srn) + record_srn = RecordSRN( domain=self.node_domain, id=LocalId(str(uuid4())), @@ -93,6 +129,7 @@ async def publish_record(self, draft: RecordDraft) -> Record: srn=record_srn, source=draft.source, convention_srn=draft.convention_srn, + schema_id=schema_id, metadata=draft.metadata, published_at=datetime.now(UTC), ) @@ -100,11 +137,19 @@ async def publish_record(self, draft: RecordDraft) -> Record: await self.record_repo.save(record) logger.info(f"Record persisted: {record_srn}") + # Dual-write typed metadata projection in the same transaction. 
+ await self.metadata_service.insert( + schema_id=schema_id, + record_srn=record_srn, + values=draft.metadata, + ) + published = RecordPublished( id=EventId(uuid4()), record_srn=record_srn, source=draft.source, convention_srn=draft.convention_srn, + schema_id=schema_id, metadata=draft.metadata, expected_features=draft.expected_features, ) diff --git a/server/osa/domain/semantics/command/create_schema.py b/server/osa/domain/semantics/command/create_schema.py index 27c4202..ff6fcf2 100644 --- a/server/osa/domain/semantics/command/create_schema.py +++ b/server/osa/domain/semantics/command/create_schema.py @@ -6,17 +6,18 @@ from osa.domain.semantics.service.schema import SchemaService from osa.domain.shared.authorization.gate import at_least from osa.domain.shared.command import Command, CommandHandler, Result -from osa.domain.shared.model.srn import SchemaSRN +from osa.domain.shared.model.srn import SchemaId, SchemaIdentifier class CreateSchema(Command): + id: SchemaIdentifier title: str version: str fields: list[FieldDefinition] class SchemaCreated(Result): - srn: SchemaSRN + id: SchemaId title: str field_count: int created_at: datetime @@ -29,12 +30,13 @@ class CreateSchemaHandler(CommandHandler[CreateSchema, SchemaCreated]): async def run(self, cmd: CreateSchema) -> SchemaCreated: schema = await self.schema_service.create_schema( + id=cmd.id, title=cmd.title, version=cmd.version, fields=cmd.fields, ) return SchemaCreated( - srn=schema.srn, + id=schema.id, title=schema.title, field_count=len(schema.fields), created_at=schema.created_at, diff --git a/server/osa/domain/semantics/model/schema.py b/server/osa/domain/semantics/model/schema.py index 1188dbf..20af362 100644 --- a/server/osa/domain/semantics/model/schema.py +++ b/server/osa/domain/semantics/model/schema.py @@ -3,13 +3,13 @@ from osa.domain.semantics.model.value import FieldDefinition from osa.domain.shared.error import ValidationError from osa.domain.shared.model.aggregate import Aggregate -from osa.domain.shared.model.srn import SchemaSRN +from osa.domain.shared.model.srn import SchemaId class Schema(Aggregate): """An immutable, versioned definition of metadata structure.""" - srn: SchemaSRN + id: SchemaId title: str fields: list[FieldDefinition] created_at: datetime diff --git a/server/osa/domain/semantics/port/schema_repository.py b/server/osa/domain/semantics/port/schema_repository.py index b6849a8..1f49067 100644 --- a/server/osa/domain/semantics/port/schema_repository.py +++ b/server/osa/domain/semantics/port/schema_repository.py @@ -1,7 +1,7 @@ from abc import abstractmethod from typing import TYPE_CHECKING, List, Protocol -from osa.domain.shared.model.srn import SchemaSRN +from osa.domain.shared.model.srn import SchemaId from osa.domain.shared.port import Port if TYPE_CHECKING: @@ -13,7 +13,7 @@ class SchemaRepository(Port, Protocol): async def save(self, schema: "Schema") -> None: ... @abstractmethod - async def get(self, srn: SchemaSRN) -> "Schema | None": ... + async def get(self, schema_id: SchemaId) -> "Schema | None": ... @abstractmethod async def list( @@ -21,4 +21,4 @@ async def list( ) -> "List[Schema]": ... @abstractmethod - async def exists(self, srn: SchemaSRN) -> bool: ... + async def exists(self, schema_id: SchemaId) -> bool: ... 
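To make the reworked ``SchemaRepository`` identity concrete, a minimal in-memory double might key its storage on the ``(id, version)`` pair, mirroring how the Postgres reader matches the ``schema_id.id.root`` and ``schema_id.version.root`` values against catalog columns. The class below is illustrative only and is not part of this change; ``Any`` stands in for the domain types.

    # Illustrative test double for the reworked port; not part of this change.
    from __future__ import annotations

    from typing import Any


    class InMemorySchemaRepository:
        """Keys storage on the (id, version) pair instead of a full SRN string."""

        def __init__(self) -> None:
            self._by_key: dict[tuple[str, str], Any] = {}

        @staticmethod
        def _key(schema_id: Any) -> tuple[str, str]:
            # SchemaId.id and SchemaId.version are RootModels; .root is the raw str.
            return (schema_id.id.root, schema_id.version.root)

        async def save(self, schema: Any) -> None:
            self._by_key[self._key(schema.id)] = schema

        async def get(self, schema_id: Any) -> Any | None:
            return self._by_key.get(self._key(schema_id))

        async def exists(self, schema_id: Any) -> bool:
            return self._key(schema_id) in self._by_key

        async def list(self, *args: Any, **kwargs: Any) -> list[Any]:
            return list(self._by_key.values())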
diff --git a/server/osa/domain/semantics/query/get_schema.py b/server/osa/domain/semantics/query/get_schema.py index 3c7afd8..c52fb07 100644 --- a/server/osa/domain/semantics/query/get_schema.py +++ b/server/osa/domain/semantics/query/get_schema.py @@ -3,16 +3,16 @@ from osa.domain.semantics.model.value import FieldDefinition from osa.domain.semantics.service.schema import SchemaService from osa.domain.shared.authorization.gate import public -from osa.domain.shared.model.srn import SchemaSRN +from osa.domain.shared.model.srn import SchemaId from osa.domain.shared.query import Query, QueryHandler, Result class GetSchema(Query): - srn: SchemaSRN + schema_id: SchemaId class SchemaDetail(Result): - srn: SchemaSRN + id: SchemaId title: str fields: list[FieldDefinition] created_at: datetime @@ -23,9 +23,9 @@ class GetSchemaHandler(QueryHandler[GetSchema, SchemaDetail]): schema_service: SchemaService async def run(self, cmd: GetSchema) -> SchemaDetail: - schema = await self.schema_service.get_schema(cmd.srn) + schema = await self.schema_service.get_schema(cmd.schema_id) return SchemaDetail( - srn=schema.srn, + id=schema.id, title=schema.title, fields=schema.fields, created_at=schema.created_at, diff --git a/server/osa/domain/semantics/query/list_schemas.py b/server/osa/domain/semantics/query/list_schemas.py index b13ba2e..1fe8410 100644 --- a/server/osa/domain/semantics/query/list_schemas.py +++ b/server/osa/domain/semantics/query/list_schemas.py @@ -4,7 +4,7 @@ from osa.domain.semantics.service.schema import SchemaService from osa.domain.shared.authorization.gate import public -from osa.domain.shared.model.srn import SchemaSRN +from osa.domain.shared.model.srn import SchemaId from osa.domain.shared.query import Query, QueryHandler, Result @@ -13,7 +13,7 @@ class ListSchemas(Query): class SchemaSummary(BaseModel): - srn: SchemaSRN + id: SchemaId title: str field_count: int created_at: datetime @@ -32,7 +32,7 @@ async def run(self, cmd: ListSchemas) -> SchemaList: return SchemaList( items=[ SchemaSummary( - srn=s.srn, + id=s.id, title=s.title, field_count=len(s.fields), created_at=s.created_at, diff --git a/server/osa/domain/semantics/service/schema.py b/server/osa/domain/semantics/service/schema.py index 3127488..01cb341 100644 --- a/server/osa/domain/semantics/service/schema.py +++ b/server/osa/domain/semantics/service/schema.py @@ -1,12 +1,17 @@ from datetime import UTC, datetime -from uuid import uuid4 from osa.domain.semantics.model.schema import Schema from osa.domain.semantics.model.value import FieldDefinition, FieldType, TermConstraints from osa.domain.semantics.port.ontology_repository import OntologyRepository from osa.domain.semantics.port.schema_repository import SchemaRepository -from osa.domain.shared.error import NotFoundError, ValidationError -from osa.domain.shared.model.srn import Domain, LocalId, SchemaSRN, Semver +from osa.domain.shared.error import ConflictError, NotFoundError, ValidationError +from osa.domain.shared.model.srn import ( + Domain, + LocalId, + SchemaId, + SchemaIdentifier, + Semver, +) from osa.domain.shared.service import Service @@ -17,6 +22,7 @@ class SchemaService(Service): async def create_schema( self, + id: SchemaIdentifier, title: str, version: str, fields: list[FieldDefinition], @@ -35,13 +41,18 @@ async def create_schema( f"(referenced by field '{field.name}')" ) - srn = SchemaSRN( - domain=self.node_domain, - id=LocalId(str(uuid4())[:20]), + schema_id = SchemaId( + id=LocalId(id.root), version=Semver.from_string(version), ) + existing = await 
self.schema_repo.get(schema_id) + if existing is not None: + raise ConflictError( + f"Schema already exists: {schema_id.render()}", + code="schema_already_exists", + ) schema = Schema( - srn=srn, + id=schema_id, title=title, fields=fields, created_at=datetime.now(UTC), @@ -49,10 +60,10 @@ async def create_schema( await self.schema_repo.save(schema) return schema - async def get_schema(self, srn: SchemaSRN) -> Schema: - schema = await self.schema_repo.get(srn) + async def get_schema(self, schema_id: SchemaId) -> Schema: + schema = await self.schema_repo.get(schema_id) if schema is None: - raise NotFoundError(f"Schema not found: {srn}") + raise NotFoundError(f"Schema not found: {schema_id}") return schema async def list_schemas( diff --git a/server/osa/domain/shared/error.py b/server/osa/domain/shared/error.py index a26c0d6..f03951b 100644 --- a/server/osa/domain/shared/error.py +++ b/server/osa/domain/shared/error.py @@ -34,8 +34,13 @@ class NotFoundError(DomainError): class ValidationError(DomainError): """Input validation failed.""" - def __init__(self, message: str, field: str | None = None) -> None: - super().__init__(message, code="VALIDATION_ERROR") + def __init__( + self, + message: str, + field: str | None = None, + code: str | None = None, + ) -> None: + super().__init__(message, code=code or "VALIDATION_ERROR") self.field = field diff --git a/server/osa/domain/shared/model/hook.py b/server/osa/domain/shared/model/hook.py index 346ba34..3691dbc 100644 --- a/server/osa/domain/shared/model/hook.py +++ b/server/osa/domain/shared/model/hook.py @@ -16,6 +16,14 @@ # Safe for use as PG identifiers, file path components, and env var values. PgIdentifier = Annotated[str, Field(pattern=r"^[a-z][a-z0-9_]{0,62}$")] +# Hook names compose into PG identifiers alongside fixed prefixes/suffixes — +# notably the per-hook FK constraint ``fk_features_{name}_record_srn`` (23 +# chars of overhead). PG's identifier limit is 63 chars, so cap hook names at +# 40 to keep every derived identifier inside the limit without surprise +# truncation. Column names use plain ``PgIdentifier`` because they don't get +# composed into longer names. +HookName = Annotated[str, Field(pattern=r"^[a-z][a-z0-9_]{0,39}$")] + _MEMORY_RE = re.compile(r"^(\d+(?:\.\d+)?)(g|m|k)?i?$") _GIB = 1024 * 1024 * 1024 @@ -57,7 +65,7 @@ def _format_memory(byte_count: int) -> str: class ColumnDef(ValueObject): - """Definition of a single column in a feature table.""" + """Definition of a single column in a feature or metadata table.""" name: PgIdentifier json_type: Literal["string", "number", "integer", "boolean", "array", "object"] @@ -115,7 +123,7 @@ class TableFeatureSpec(FeatureSpec): class HookDefinition(ValueObject): """Complete specification for a hook: how it runs + what it produces.""" - name: PgIdentifier + name: HookName runtime: Annotated[OciConfig, Field(discriminator="type")] feature: Annotated[TableFeatureSpec, Field(discriminator="kind")] diff --git a/server/osa/domain/shared/model/srn.py b/server/osa/domain/shared/model/srn.py index 5da69aa..13f97db 100644 --- a/server/osa/domain/shared/model/srn.py +++ b/server/osa/domain/shared/model/srn.py @@ -51,7 +51,30 @@ class LocalId(RootModel[str]): def _validate(cls, v: str) -> str: v = v.strip().lower() if not cls._re.match(v): - raise ValueError("invalid LocalId (20–64 chars, [a-z0-9-])") + raise ValueError("invalid LocalId (3–64 chars, [a-z0-9-])") + return v + + +class SchemaIdentifier(RootModel[str]): + """Human-readable schema slug. 
Narrower than :class:`LocalId`:
+
+    - must start with a letter (so it can drive a PG table name without
+      quoting the leading character)
+    - 3–64 chars total, ``[a-z0-9-]``
+
+    Validated strictly (no case-folding / whitespace-stripping) — a typo like
+    ``"PDB-Structure"`` should surface loudly rather than silently normalise.
+    """
+
+    _re: ClassVar[re.Pattern] = re.compile(r"^[a-z][a-z0-9\-]{2,63}$")
+
+    @field_validator("root")
+    @classmethod
+    def _validate(cls, v: str) -> str:
+        if not cls._re.match(v):
+            raise ValueError(
+                "invalid schema id: must be 3–64 chars of [a-z0-9-] and start with a letter"
+            )
         return v
 
 
@@ -280,3 +303,55 @@ class SnapshotSRN(SRN):
 class EventSRN(SRN):
     type: ResourceType = Field(default=ResourceType.evt, frozen=True)
     version: None = None
+
+
+# ---------- Schema identity (short form — internal primitive) ----------
+
+
+class SchemaId(ValueObject):
+    """Short-form schema identity. The internal primitive for all non-
+    federation code paths.
+
+    A schema is unambiguously identified by ``(id, version)`` within a single
+    OSA node — the publishing domain and resource-type segments of the full
+    :class:`SchemaSRN` URN carry no information at the internal layer (the
+    domain is always the node's own; the type is always ``schema``).
+
+    Use :class:`SchemaSRN` only at federation edges (exports, snapshot
+    manifests, inter-node references) where the publishing node's domain
+    becomes meaningful.
+
+    Wire form: ``"<id>@<version>"`` (e.g., ``"pdb-structure@1.0.0"``).
+    """
+
+    id: LocalId
+    version: Semver
+
+    @property
+    def major(self) -> int:
+        """Major version component — the shared typed-table key."""
+        return int(self.version.root.split(".")[0])
+
+    def render(self) -> str:
+        return f"{self.id.root}@{self.version.root}"
+
+    def __str__(self) -> str:
+        return self.render()
+
+    @classmethod
+    def parse(cls, value: str) -> "SchemaId":
+        """Parse wire form ``"<id>@<version>"``.
+
+        Raises ``ValueError`` on malformed input.
+        """
+        if not isinstance(value, str) or "@" not in value:
+            raise ValueError(f"SchemaId must be '<id>@<version>', got {value!r}")
+        id_part, version_part = value.split("@", 1)
+        return cls(id=LocalId(id_part), version=Semver.from_string(version_part))
+
+    @classmethod
+    def from_srn(cls, srn: "SchemaSRN") -> "SchemaId":
+        return cls(id=srn.id, version=srn.version)
+
+    def to_srn(self, domain: Domain) -> "SchemaSRN":
+        return SchemaSRN(domain=domain, id=self.id, version=self.version)
diff --git a/server/osa/infrastructure/event/di.py b/server/osa/infrastructure/event/di.py
index fb665c4..3b05d40 100644
--- a/server/osa/infrastructure/event/di.py
+++ b/server/osa/infrastructure/event/di.py
@@ -37,6 +37,8 @@
     CreateFeatureTables,
     InsertRecordFeatures,
     InsertBatchFeatures,
+    # Metadata projection is now synchronous (dual-write inside RecordService /
+    # ConventionService) — no event handlers required for it.
# Ingest handlers RunIngester, RunHooks, diff --git a/server/osa/infrastructure/persistence/adapter/discovery.py b/server/osa/infrastructure/persistence/adapter/discovery.py index 23af2bc..be9d067 100644 --- a/server/osa/infrastructure/persistence/adapter/discovery.py +++ b/server/osa/infrastructure/persistence/adapter/discovery.py @@ -3,6 +3,8 @@ from __future__ import annotations import logging +from collections.abc import Callable +from datetime import date, datetime from typing import Any from sqlalchemy import ( @@ -11,36 +13,48 @@ String, and_, cast, + false, func, literal, + not_, or_, select, true, union_all, ) -from sqlalchemy.dialects.postgresql import JSONB from sqlalchemy.ext.asyncio import AsyncSession +from osa.domain.discovery.model.refs import FeatureFieldRef, MetadataFieldRef from osa.domain.discovery.model.value import ( + And, ColumnInfo, FeatureCatalogEntry, FeatureRow, - Filter, + FilterExpr, FilterOperator, + Not, + Or, + Predicate, RecordSummary, SortOrder, ) from osa.domain.semantics.model.value import FieldType from osa.domain.shared.error import ValidationError -from osa.domain.shared.model.srn import RecordSRN +from osa.domain.shared.model.hook import ColumnDef +from osa.domain.shared.model.srn import ConventionSRN, RecordSRN, SchemaId from osa.infrastructure.persistence.feature_table import ( FeatureSchema, build_feature_table, data_columns, ) from osa.infrastructure.persistence.keyset import KeysetPage, SortKey +from osa.infrastructure.persistence.metadata_table import ( + MetadataSchema, + build_metadata_table, +) from osa.infrastructure.persistence.tables import ( feature_tables_table, + metadata_tables_table, records_table, schemas_table, ) @@ -53,19 +67,63 @@ def _escape_like(value: str) -> str: return value.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_") -def _to_column_info(schema: FeatureSchema) -> list[ColumnInfo]: - """Map typed FeatureSchema columns to API-facing ColumnInfo list.""" - return [ColumnInfo(name=c.name, type=c.json_type, required=c.required) for c in schema.columns] +# Cursor-value coercers — cursor payloads round-trip through base64 JSON as +# plain strings/numbers, but keyset predicates compare against typed columns. +# Without this, ``published_at < 'iso-string'::VARCHAR`` fails on Postgres. 
+ +CursorCoercer = Callable[[Any], Any] + + +def _coerce_identity(value: Any) -> Any: + return value + + +def _coerce_datetime(value: Any) -> Any: + if isinstance(value, str): + return datetime.fromisoformat(value) + return value + + +def _coerce_date(value: Any) -> Any: + if isinstance(value, str): + return date.fromisoformat(value) + return value + + +def _coerce_float(value: Any) -> Any: + return None if value is None else float(value) + + +def _coerce_int(value: Any) -> Any: + return None if value is None else int(value) + + +def _coercer_for_column(col_def: ColumnDef) -> CursorCoercer: + """Pick a coercer matching the Postgres type chosen by ``column_mapper``.""" + if col_def.json_type == "number": + return _coerce_float + if col_def.json_type == "integer": + return _coerce_int + if col_def.json_type == "string": + if col_def.format == "date-time": + return _coerce_datetime + if col_def.format == "date": + return _coerce_date + return _coerce_identity + + +def _to_column_info(columns: list[Any]) -> list[ColumnInfo]: + return [ColumnInfo(name=c.name, type=c.json_type, required=c.required) for c in columns] class PostgresFieldDefinitionReader: - """Builds a global field_name -> FieldType map from all registered schemas.""" + """Builds field name → FieldType maps from registered schemas.""" def __init__(self, session: AsyncSession) -> None: self.session = session async def get_all_field_types(self) -> dict[str, FieldType]: - stmt = select(schemas_table.c.srn, schemas_table.c.fields) + stmt = select(schemas_table.c.fields) result = await self.session.execute(stmt) rows = result.mappings().all() @@ -84,16 +142,29 @@ async def get_all_field_types(self) -> dict[str, FieldType]: return field_map + async def get_fields_for_schema(self, schema_id: SchemaId) -> dict[str, FieldType]: + stmt = select(schemas_table.c.fields).where( + schemas_table.c.id == schema_id.id.root, + schemas_table.c.version == schema_id.version.root, + ) + result = await self.session.execute(stmt) + row = result.mappings().first() + if row is None: + return {} + return {f["name"]: FieldType(f["type"]) for f in row["fields"]} + class PostgresDiscoveryReadStore: - """Direct SQL queries against records and feature tables for discovery.""" + """Compiles FilterExpr trees into SQLAlchemy queries over records / metadata / features.""" def __init__(self, session: AsyncSession) -> None: self.session = session async def search_records( self, - filters: list[Filter], + filter_expr: FilterExpr | None, + schema_id: SchemaId | None, + convention_srn: ConventionSRN | None, text_fields: list[str], q: str | None, sort: str, @@ -102,34 +173,65 @@ async def search_records( limit: int, field_types: dict[str, FieldType] | None = None, ) -> list[RecordSummary]: - """Build and execute a dynamic SQL query for record search.""" t = records_table + ft_map = field_types or {} + + metadata_table = None + metadata_schema: MetadataSchema | None = None + if schema_id is not None: + catalog = await self._metadata_catalog_for(schema_id) + if catalog is not None: + metadata_schema = MetadataSchema.model_validate(catalog["metadata_schema"]) + metadata_table = build_metadata_table(catalog["pg_table"], metadata_schema) + + feature_joins = await self._collect_feature_joins(filter_expr) + conditions: list[Any] = [] - ft = field_types or {} - # Build filter conditions - for f in filters: - conditions.append(self._record_filter_clause(f, ft.get(f.field))) + if convention_srn is not None: + conditions.append(t.c.convention_srn == str(convention_srn)) + + if 
filter_expr is not None: + conditions.append( + self._compile_filter_for_records( + filter_expr, + records_t=t, + metadata_t=metadata_table, + metadata_schema=metadata_schema, + feature_joins=feature_joins, + ) + ) - # Free-text search across text fields - if q and text_fields: + if q and text_fields and metadata_table is not None and metadata_schema is not None: pattern = f"%{_escape_like(q)}%" + text_col_names = {c.name for c in metadata_schema.columns if c.json_type == "string"} text_clauses = [ - t.c.metadata[field].astext.ilike(pattern, escape="\\") for field in text_fields + cast(metadata_table.c[name], String).ilike(pattern, escape="\\") + for name in text_fields + if name in text_col_names ] - conditions.append(or_(*text_clauses)) + if text_clauses: + conditions.append(or_(*text_clauses)) - # Determine sort expression (cast to match field type for correct ordering) + # Sort expression + matching cursor-value coercer if sort == "published_at": sort_expr = t.c.published_at - elif ft.get(sort) == FieldType.NUMBER: - sort_expr = cast(t.c.metadata[sort].astext, Float) - elif ft.get(sort) == FieldType.DATE: - sort_expr = cast(t.c.metadata[sort].astext, Date) + coerce_cursor: CursorCoercer = _coerce_datetime + elif metadata_table is not None and sort in metadata_table.c: + col = metadata_table.c[sort] + if ft_map.get(sort) == FieldType.NUMBER: + sort_expr = cast(col, Float) + coerce_cursor = _coerce_float + elif ft_map.get(sort) == FieldType.DATE: + sort_expr = cast(col, Date) + coerce_cursor = _coerce_date + else: + sort_expr = col + coerce_cursor = _coerce_identity else: - sort_expr = t.c.metadata[sort].astext + sort_expr = t.c.published_at + coerce_cursor = _coerce_datetime - # Keyset pagination with correct NULL handling is_desc = order == SortOrder.DESC page = KeysetPage( [ @@ -138,31 +240,55 @@ async def search_records( ] ) order_clauses = page.order_by() - if cursor is not None: - conditions.append(page.after((cursor["s"], cursor["id"]))) + sort_value = coerce_cursor(cursor["s"]) + conditions.append(page.after((sort_value, cursor["id"]))) where_clause = and_(*conditions) if conditions else true() - stmt = ( - select(t.c.srn, t.c.published_at, t.c.metadata) - .where(where_clause) - .order_by(*order_clauses) - .limit(limit) - ) + if metadata_table is not None and metadata_schema is not None: + select_cols = [t.c.srn, t.c.published_at] + [ + metadata_table.c[c.name].label(c.name) for c in metadata_schema.columns + ] + stmt = select(*select_cols).select_from( + t.join(metadata_table, metadata_table.c.record_srn == t.c.srn) + ) + else: + # No schema pinned — project the canonical JSONB metadata column. + # Typed tables are a query-optimized projection; JSONB remains the + # authoritative source for presentation (and for cross-schema + # listings where no single typed table applies). 
+ stmt = select(t.c.srn, t.c.published_at, t.c.metadata) + + for hook, ft in feature_joins.items(): + stmt = stmt.join(ft, ft.c.record_srn == t.c.srn, isouter=True) + + stmt = stmt.where(where_clause).order_by(*order_clauses).limit(limit) result = await self.session.execute(stmt) - return [ - RecordSummary( - srn=RecordSRN.parse(row["srn"]), - published_at=row["published_at"], - metadata=row["metadata"], - ) - for row in result.mappings() - ] + summaries: list[RecordSummary] = [] + if metadata_table is not None and metadata_schema is not None: + for row in result.mappings(): + meta = {c.name: row[c.name] for c in metadata_schema.columns if c.name in row} + summaries.append( + RecordSummary( + srn=RecordSRN.parse(row["srn"]), + published_at=row["published_at"], + metadata=meta, + ) + ) + else: + for row in result.mappings(): + summaries.append( + RecordSummary( + srn=RecordSRN.parse(row["srn"]), + published_at=row["published_at"], + metadata=row.get("metadata") or {}, + ) + ) + return summaries async def get_feature_catalog(self) -> list[FeatureCatalogEntry]: - """List all feature tables with column schemas and record counts.""" stmt = select( feature_tables_table.c.hook_name, feature_tables_table.c.pg_table, @@ -174,16 +300,14 @@ async def get_feature_catalog(self) -> list[FeatureCatalogEntry]: if not catalog_rows: return [] - # Parse schemas at the boundary parsed = [ - (row["hook_name"], FeatureSchema.model_validate(row["feature_schema"]), row["pg_table"]) + (row["hook_name"], FeatureSchema.model_validate(row["feature_schema"])) for row in catalog_rows ] - # Fetch all record counts in a single UNION ALL query (avoid N+1) count_parts = [] - for hook_name, schema, pg_table in parsed: - ft = build_feature_table(pg_table, schema) + for hook_name, schema in parsed: + ft = build_feature_table(hook_name, schema) count_parts.append( select( literal(hook_name).label("hook_name"), @@ -196,14 +320,13 @@ async def get_feature_catalog(self) -> list[FeatureCatalogEntry]: return [ FeatureCatalogEntry( hook_name=hook_name, - columns=_to_column_info(schema), + columns=_to_column_info(schema.columns), record_count=counts_by_hook.get(hook_name, 0), ) - for hook_name, schema, _pg_table in parsed + for hook_name, schema in parsed ] async def get_feature_table_schema(self, hook_name: str) -> FeatureCatalogEntry | None: - """Look up a single feature table's schema by hook name.""" stmt = select( feature_tables_table.c.hook_name, feature_tables_table.c.feature_schema, @@ -216,62 +339,74 @@ async def get_feature_table_schema(self, hook_name: str) -> FeatureCatalogEntry schema = FeatureSchema.model_validate(row["feature_schema"]) return FeatureCatalogEntry( hook_name=row["hook_name"], - columns=_to_column_info(schema), + columns=_to_column_info(schema.columns), record_count=0, ) async def search_features( self, hook_name: str, - filters: list[Filter], + filter_expr: FilterExpr | None, + schema_id: SchemaId | None, record_srn: RecordSRN | None, sort: str, order: SortOrder, cursor: dict[str, Any] | None, limit: int, ) -> list[FeatureRow]: - """Build and execute a dynamic SQL query for feature row search.""" - # Look up pg_table and feature_schema from catalog pg_table_stmt = select( - feature_tables_table.c.pg_table, feature_tables_table.c.feature_schema, ).where(feature_tables_table.c.hook_name == hook_name) pg_result = await self.session.execute(pg_table_stmt) pg_row = pg_result.mappings().first() if pg_row is None: return [] - pg_table: str = pg_row["pg_table"] schema = 
FeatureSchema.model_validate(pg_row["feature_schema"]) - ft = build_feature_table(pg_table, schema) + ft = build_feature_table(hook_name, schema) + + metadata_table = None + metadata_schema: MetadataSchema | None = None + if schema_id is not None: + catalog = await self._metadata_catalog_for(schema_id) + if catalog is not None: + metadata_schema = MetadataSchema.model_validate(catalog["metadata_schema"]) + metadata_table = build_metadata_table(catalog["pg_table"], metadata_schema) + + feature_joins: dict[str, Any] = {} + if filter_expr is not None: + extra = await self._collect_feature_joins(filter_expr) + for hook, tbl in extra.items(): + if hook != hook_name: + feature_joins[hook] = tbl conditions: list[Any] = [] - # Record SRN filter if record_srn is not None: conditions.append(ft.c.record_srn == str(record_srn)) - # Column filters — all columns are known from schema - for f in filters: - col = ft.c[f.field] - if f.operator == FilterOperator.EQ: - conditions.append(col == f.value) - elif f.operator == FilterOperator.CONTAINS: - conditions.append( - cast(col, String).ilike(f"%{_escape_like(str(f.value))}%", escape="\\") + if filter_expr is not None: + conditions.append( + self._compile_filter_for_features( + filter_expr, + this_hook=hook_name, + this_ft=ft, + metadata_t=metadata_table, + metadata_schema=metadata_schema, + feature_joins=feature_joins, ) - elif f.operator == FilterOperator.GTE: - conditions.append(col >= f.value) - elif f.operator == FilterOperator.LTE: - conditions.append(col <= f.value) + ) - # Sort expression if sort == "id": sort_expr = ft.c.id + coerce_cursor: CursorCoercer = _coerce_int else: sort_expr = ft.c[sort] + col_def = next((c for c in schema.columns if c.name == sort), None) + coerce_cursor = ( + _coercer_for_column(col_def) if col_def is not None else _coerce_identity + ) - # Keyset pagination with correct NULL handling is_desc = order == SortOrder.DESC page = KeysetPage( [ @@ -280,17 +415,24 @@ async def search_features( ] ) order_clauses = page.order_by() - if cursor is not None: - conditions.append(page.after((cursor["s"], cursor["id"]))) + sort_value = coerce_cursor(cursor["s"]) + conditions.append(page.after((sort_value, cursor["id"]))) where_clause = and_(*conditions) if conditions else true() + stmt = select(ft.c.id, ft.c.record_srn, *data_columns(ft)) + select_from = ft + if metadata_table is not None: + select_from = select_from.join( + metadata_table, metadata_table.c.record_srn == ft.c.record_srn, isouter=True + ) + for hook, other_ft in feature_joins.items(): + select_from = select_from.join( + other_ft, other_ft.c.record_srn == ft.c.record_srn, isouter=True + ) stmt = ( - select(ft.c.id, ft.c.record_srn, *data_columns(ft)) - .where(where_clause) - .order_by(*order_clauses) - .limit(limit) + stmt.select_from(select_from).where(where_clause).order_by(*order_clauses).limit(limit) ) result = await self.session.execute(stmt) @@ -303,31 +445,257 @@ async def search_features( return feature_rows - @staticmethod - def _record_filter_clause(f: Filter, field_type: FieldType | None = None) -> Any: - """Build a SQL clause for a single record metadata filter.""" - t = records_table - if f.operator == FilterOperator.EQ: - # Use JSONB @> containment (GIN-indexed) - return t.c.metadata.op("@>")(cast(func.json_build_object(f.field, f.value), JSONB)) - elif f.operator == FilterOperator.CONTAINS: - return t.c.metadata[f.field].astext.ilike( - f"%{_escape_like(str(f.value))}%", escape="\\" + # ---------------- compilation helpers ---------------- + + async def 
_metadata_catalog_for(self, schema_id: SchemaId) -> dict[str, Any] | None: + """Look up the metadata table catalog row for a SchemaId.""" + stmt = select(metadata_tables_table).where( + metadata_tables_table.c.schema_id == schema_id.id.root, + metadata_tables_table.c.schema_major == schema_id.major, + ) + result = await self.session.execute(stmt) + row = result.mappings().first() + return dict(row) if row is not None else None + + async def _collect_feature_joins(self, filter_expr: FilterExpr | None) -> dict[str, Any]: + """Build {hook_name: SQLA Table} for every distinct feature ref in the tree.""" + if filter_expr is None: + return {} + hooks: set[str] = set() + for p in _iter_predicates(filter_expr): + if isinstance(p.field, FeatureFieldRef): + hooks.add(p.field.hook) + if not hooks: + return {} + stmt = select( + feature_tables_table.c.hook_name, + feature_tables_table.c.feature_schema, + ).where(feature_tables_table.c.hook_name.in_(hooks)) + result = await self.session.execute(stmt) + joins: dict[str, Any] = {} + for row in result.mappings(): + schema = FeatureSchema.model_validate(row["feature_schema"]) + joins[row["hook_name"]] = build_feature_table(row["hook_name"], schema) + missing = hooks - joins.keys() + if missing: + raise ValidationError( + f"Unknown feature hook(s): {sorted(missing)}", + field="filter", + code="unknown_hook", ) - elif f.operator in (FilterOperator.GTE, FilterOperator.LTE): - # Use typed casts: numeric for NUMBER, date for DATE, string fallback - if field_type == FieldType.NUMBER: - col_expr = cast(t.c.metadata[f.field].astext, Float) - val = float(f.value) - elif field_type == FieldType.DATE: - col_expr = cast(t.c.metadata[f.field].astext, Date) - val = str(f.value) - else: - col_expr = cast(t.c.metadata[f.field].astext, String) - val = str(f.value) - if f.operator == FilterOperator.GTE: - return col_expr >= val + return joins + + def _compile_filter_for_records( + self, + expr: FilterExpr, + *, + records_t: Any, + metadata_t: Any, + metadata_schema: MetadataSchema | None, + feature_joins: dict[str, Any], + ) -> Any: + if isinstance(expr, Predicate): + return self._compile_predicate( + expr, + metadata_t=metadata_t, + metadata_schema=metadata_schema, + feature_joins=feature_joins, + ) + if isinstance(expr, And): + return and_( + *[ + self._compile_filter_for_records( + op, + records_t=records_t, + metadata_t=metadata_t, + metadata_schema=metadata_schema, + feature_joins=feature_joins, + ) + for op in expr.operands + ] + ) + if isinstance(expr, Or): + return or_( + *[ + self._compile_filter_for_records( + op, + records_t=records_t, + metadata_t=metadata_t, + metadata_schema=metadata_schema, + feature_joins=feature_joins, + ) + for op in expr.operands + ] + ) + if isinstance(expr, Not): + inner = self._compile_filter_for_records( + expr.operand, + records_t=records_t, + metadata_t=metadata_t, + metadata_schema=metadata_schema, + feature_joins=feature_joins, + ) + # Coalesce NULL → FALSE before negating so records with NULL + # feature/metadata values (including rows missing from outer- + # joined feature tables) survive a NOT predicate. Without this, + # ``NOT (score = 5)`` reads NULL for records with no score and + # three-valued logic silently drops them. 
+ return not_(func.coalesce(inner, false())) + raise ValidationError(f"Unsupported filter node: {type(expr).__name__}") + + def _compile_filter_for_features( + self, + expr: FilterExpr, + *, + this_hook: str, + this_ft: Any, + metadata_t: Any, + metadata_schema: MetadataSchema | None, + feature_joins: dict[str, Any], + ) -> Any: + if isinstance(expr, Predicate): + if isinstance(expr.field, MetadataFieldRef): + if metadata_t is None: + raise ValidationError( + f"Metadata ref {expr.field.dotted()!r} requires schema_id to be set.", + field=expr.field.dotted(), + code="metadata_ref_requires_schema", + ) + col = metadata_t.c[expr.field.field] + return _apply_scalar_op(col, expr.op, expr.value) + if not isinstance(expr.field, FeatureFieldRef): + raise TypeError(f"Unexpected field ref type: {type(expr.field).__name__}") + if expr.field.hook == this_hook: + col = this_ft.c[expr.field.column] else: - return col_expr <= val - else: - raise ValueError(f"Unknown operator: {f.operator}") # pragma: no cover + tbl = feature_joins.get(expr.field.hook) + if tbl is None: + raise ValidationError( + f"Unknown feature hook '{expr.field.hook}'.", + field=expr.field.dotted(), + code="unknown_hook", + ) + col = tbl.c[expr.field.column] + return _apply_scalar_op(col, expr.op, expr.value) + if isinstance(expr, And): + return and_( + *[ + self._compile_filter_for_features( + op, + this_hook=this_hook, + this_ft=this_ft, + metadata_t=metadata_t, + metadata_schema=metadata_schema, + feature_joins=feature_joins, + ) + for op in expr.operands + ] + ) + if isinstance(expr, Or): + return or_( + *[ + self._compile_filter_for_features( + op, + this_hook=this_hook, + this_ft=this_ft, + metadata_t=metadata_t, + metadata_schema=metadata_schema, + feature_joins=feature_joins, + ) + for op in expr.operands + ] + ) + if isinstance(expr, Not): + inner = self._compile_filter_for_features( + expr.operand, + this_hook=this_hook, + this_ft=this_ft, + metadata_t=metadata_t, + metadata_schema=metadata_schema, + feature_joins=feature_joins, + ) + # See ``_compile_filter_for_records`` — NULL → FALSE coalesce so + # NOT over outer-joined feature / optional metadata columns + # includes records with missing values. + return not_(func.coalesce(inner, false())) + raise ValidationError(f"Unsupported filter node: {type(expr).__name__}") + + def _compile_predicate( + self, + predicate: Predicate, + *, + metadata_t: Any, + metadata_schema: MetadataSchema | None, + feature_joins: dict[str, Any], + ) -> Any: + if isinstance(predicate.field, MetadataFieldRef): + if metadata_t is None or metadata_schema is None: + raise ValidationError( + f"Metadata predicate on {predicate.field.dotted()!r} requires " + "the request to pin a 'schema' ('@'). 
" + "Unscoped metadata filtering is not supported — the typed table " + "is the only filter path.", + field=predicate.field.dotted(), + code="schema_required_for_metadata_query", + ) + col = metadata_t.c[predicate.field.field] + return _apply_scalar_op(col, predicate.op, predicate.value) + + if not isinstance(predicate.field, FeatureFieldRef): + raise TypeError(f"Unexpected field ref type: {type(predicate.field).__name__}") + tbl = feature_joins.get(predicate.field.hook) + if tbl is None: + raise ValidationError( + f"Unknown feature hook '{predicate.field.hook}'.", + field=predicate.field.dotted(), + code="unknown_hook", + ) + col = tbl.c[predicate.field.column] + return _apply_scalar_op(col, predicate.op, predicate.value) + + +def _apply_scalar_op(col: Any, op: FilterOperator, value: Any) -> Any: + if op == FilterOperator.EQ: + return col == value + if op == FilterOperator.NEQ: + # Feature tables are outer-joined, so a missing feature row makes + # ``col`` NULL. Plain ``col != value`` yields NULL (falsy) and + # silently excludes those records from the result. Users reading + # ``!= X`` expect "anything except X, including missing", so treat + # NULL as non-equal explicitly. + return or_(col != value, col.is_(None)) + if op == FilterOperator.GT: + return col > value + if op == FilterOperator.GTE: + return col >= value + if op == FilterOperator.LT: + return col < value + if op == FilterOperator.LTE: + return col <= value + if op == FilterOperator.IN: + if not isinstance(value, list): + raise ValidationError( + "Operator 'in' requires a list value.", + field=col.key, + code="invalid_value_for_op", + ) + return col.in_(value) + if op == FilterOperator.CONTAINS: + return cast(col, String).ilike(f"%{_escape_like(str(value))}%", escape="\\") + if op == FilterOperator.IS_NULL: + return col.is_(None) + raise ValidationError( + f"Unsupported operator: {op}", field="filter", code="unsupported_operator" + ) + + +def _iter_predicates(expr: FilterExpr): + if isinstance(expr, Predicate): + yield expr + return + if isinstance(expr, Not): + yield from _iter_predicates(expr.operand) + return + if isinstance(expr, (And, Or)): + for op in expr.operands: + yield from _iter_predicates(op) diff --git a/server/osa/infrastructure/persistence/adapter/readers.py b/server/osa/infrastructure/persistence/adapter/readers.py index 36d066f..a871bf0 100644 --- a/server/osa/infrastructure/persistence/adapter/readers.py +++ b/server/osa/infrastructure/persistence/adapter/readers.py @@ -12,7 +12,7 @@ from osa.domain.semantics.model.ontology import Ontology, Term from osa.domain.semantics.model.schema import Schema from osa.domain.semantics.model.value import FieldDefinition -from osa.domain.shared.model.srn import OntologySRN, SchemaSRN +from osa.domain.shared.model.srn import LocalId, OntologySRN, SchemaId, Semver from osa.infrastructure.persistence.tables import ( ontologies_table, ontology_terms_table, @@ -20,12 +20,18 @@ ) +def _where_schema(schema_id: SchemaId): + return (schemas_table.c.id == schema_id.id.root) & ( + schemas_table.c.version == schema_id.version.root + ) + + class SchemaReaderAdapter(SchemaReader): def __init__(self, session: AsyncSession) -> None: self.session = session - async def get_schema(self, srn: SchemaSRN) -> Schema | None: - stmt = select(schemas_table).where(schemas_table.c.srn == str(srn)) + async def get_schema(self, schema_id: SchemaId) -> Schema | None: + stmt = select(schemas_table).where(_where_schema(schema_id)) result = await self.session.execute(stmt) row = result.mappings().first() 
        if not row:
@@ -33,14 +39,17 @@ async def get_schema(self, srn: SchemaSRN) -> Schema | None:
         row_dict = dict(row)
         fields = [FieldDefinition.model_validate(f) for f in row_dict["fields"]]
         return Schema(
-            srn=SchemaSRN.parse(row_dict["srn"]),
+            id=SchemaId(
+                id=LocalId(row_dict["id"]),
+                version=Semver.from_string(row_dict["version"]),
+            ),
             title=row_dict["title"],
             fields=fields,
             created_at=row_dict["created_at"],
         )
 
-    async def schema_exists(self, srn: SchemaSRN) -> bool:
-        stmt = select(schemas_table.c.srn).where(schemas_table.c.srn == str(srn))
+    async def schema_exists(self, schema_id: SchemaId) -> bool:
+        stmt = select(schemas_table.c.id).where(_where_schema(schema_id))
         result = await self.session.execute(stmt)
         return result.first() is not None
diff --git a/server/osa/infrastructure/persistence/api_naming.py b/server/osa/infrastructure/persistence/api_naming.py
new file mode 100644
index 0000000..644f525
--- /dev/null
+++ b/server/osa/infrastructure/persistence/api_naming.py
@@ -0,0 +1,46 @@
+"""API-to-storage naming translation.
+
+The API surface and the PG storage layout coincidentally share names today —
+feature references in the discovery wire format say ``features.<hook>.<column>``
+and that maps cleanly onto ``features."<hook>".<column>`` in PostgreSQL. The
+metadata tables in the ``metadata`` PG schema likewise mirror the API's
+``metadata.<field>`` prefix.
+
+This module is the seam between the API and the storage layer. Callers route
+through these functions so that if the API naming ever needs to diverge from
+the PG layout (API rename; storage consolidation; federation-driven rename),
+the translation lives here rather than being sprinkled through adapters and
+stores.
+
+All functions are identity implementations today. The point is to *mark the
+boundary* so it is crossable later, not to make the names different now.
+"""
+
+from __future__ import annotations
+
+
+def feature_pg_schema() -> str:
+    """PG schema name holding dynamic feature tables.
+
+    Mirrors the API's ``features.*`` prefix today.
+    """
+    return "features"
+
+
+def feature_pg_table(api_feature_name: str) -> str:
+    """PG table name for a feature referenced by its API name.
+
+    The ``<hook>`` segment of the API path ``features.<hook>.<column>`` maps
+    to this PG table name. Identity today — the API and PG names are
+    intentionally aligned for readability. Introduce a real mapping here if
+    the two ever diverge.
+    """
+    return api_feature_name
+
+
+def metadata_pg_schema() -> str:
+    """PG schema name holding dynamic per-schema metadata tables.
+
+    Mirrors the API's ``metadata.*`` prefix today.
+ """ + return "metadata" diff --git a/server/osa/infrastructure/persistence/column_mapper.py b/server/osa/infrastructure/persistence/column_mapper.py index 69c7dc9..a9b463c 100644 --- a/server/osa/infrastructure/persistence/column_mapper.py +++ b/server/osa/infrastructure/persistence/column_mapper.py @@ -27,7 +27,6 @@ def map_column(col_def: ColumnDef) -> sa.Column: type_factory = _TYPE_MAP.get(key) if type_factory is None: - # Fall back to base type without format type_factory = _TYPE_MAP.get((col_def.json_type, None), sa.Text) sa_type = type_factory() diff --git a/server/osa/infrastructure/persistence/di.py b/server/osa/infrastructure/persistence/di.py index 569945f..12873e2 100644 --- a/server/osa/infrastructure/persistence/di.py +++ b/server/osa/infrastructure/persistence/di.py @@ -12,6 +12,7 @@ from osa.domain.deposition.port.repository import DepositionRepository from osa.domain.deposition.port.schema_reader import SchemaReader from osa.domain.deposition.port.storage import FileStoragePort +from osa.domain.metadata.service.metadata import MetadataService from osa.domain.record.port.feature_reader import FeatureReader from osa.domain.record.port.repository import RecordRepository from osa.domain.record.query.get_record import GetRecordHandler @@ -60,6 +61,8 @@ PostgresSemanticsSchemaRepository, ) from osa.infrastructure.persistence.feature_store import PostgresFeatureStore +from osa.infrastructure.persistence.metadata_store import PostgresMetadataStore +from osa.domain.metadata.port.metadata_store import MetadataStore from osa.infrastructure.persistence.repository.validation import ( PostgresValidationRunRepository, ) @@ -102,6 +105,11 @@ async def get_session( def get_feature_store(self, engine: AsyncEngine, session: AsyncSession) -> FeatureStore: return PostgresFeatureStore(engine=engine, session=session) + # Metadata store + @provide(scope=Scope.UOW) + def get_metadata_store(self, engine: AsyncEngine, session: AsyncSession) -> MetadataStore: + return PostgresMetadataStore(engine=engine, session=session) + # Semantics repositories ontology_repo = provide( PostgresOntologyRepository, scope=Scope.UOW, provides=OntologyRepository @@ -146,6 +154,8 @@ def get_feature_storage(self, file_storage: FileStoragePort) -> FeatureStoragePo def get_record_service( self, record_repo: RecordRepository, + convention_repo: ConventionRepository, + metadata_service: MetadataService, outbox: Outbox, config: Config, feature_reader: FeatureReader, @@ -156,6 +166,8 @@ def get_record_service( """ return RecordService( record_repo=record_repo, + convention_repo=convention_repo, + metadata_service=metadata_service, outbox=outbox, node_domain=Domain(config.domain), feature_reader=feature_reader, diff --git a/server/osa/infrastructure/persistence/feature_store.py b/server/osa/infrastructure/persistence/feature_store.py index b43c729..9aa7e0f 100644 --- a/server/osa/infrastructure/persistence/feature_store.py +++ b/server/osa/infrastructure/persistence/feature_store.py @@ -12,8 +12,8 @@ from osa.domain.feature.port.feature_store import FeatureStore from osa.domain.shared.error import ConflictError, ValidationError from osa.domain.shared.model.hook import ColumnDef +from osa.infrastructure.persistence.api_naming import feature_pg_schema, feature_pg_table from osa.infrastructure.persistence.feature_table import ( - FEATURES_SCHEMA, FeatureSchema, build_feature_table, ) @@ -48,7 +48,7 @@ async def create_table(self, hook_name: str, columns: list[ColumnDef]) -> None: async with self._engine.begin() as conn: # Ensure 
the features schema exists - await conn.execute(text(f'CREATE SCHEMA IF NOT EXISTS "{FEATURES_SCHEMA}"')) + await conn.execute(text(f'CREATE SCHEMA IF NOT EXISTS "{feature_pg_schema()}"')) # Check for existing table in catalog — duplicate is a hard error existing = await conn.execute( @@ -63,12 +63,12 @@ async def create_table(self, hook_name: str, columns: list[ColumnDef]) -> None: schema = FeatureSchema(columns=columns) table = build_feature_table(hook_name, schema) - # Create table + # Create table (FK to records.srn is declared inline on the column) await conn.run_sync(table.metadata.create_all, checkfirst=False) await conn.execute( feature_tables_table.insert().values( hook_name=hook_name, - pg_table=hook_name, + pg_table=feature_pg_table(hook_name), feature_schema=schema.model_dump(), schema_version=1, created_at=datetime.now(UTC), @@ -99,11 +99,13 @@ async def insert_features( # Bulk insert in chunks of 1000 chunk_size = 1000 total = 0 + pg_schema = feature_pg_schema() + pg_table = feature_pg_table(hook_name) async with self._engine.begin() as conn: # Reflect the actual table to get correct column types for casts - metadata = sa.MetaData(schema=FEATURES_SCHEMA) - await conn.run_sync(metadata.reflect, only=[hook_name]) - table = metadata.tables[f"{FEATURES_SCHEMA}.{hook_name}"] + metadata = sa.MetaData(schema=pg_schema) + await conn.run_sync(metadata.reflect, only=[pg_table]) + table = metadata.tables[f"{pg_schema}.{pg_table}"] for i in range(0, len(enriched_rows), chunk_size): chunk = enriched_rows[i : i + chunk_size] diff --git a/server/osa/infrastructure/persistence/feature_table.py b/server/osa/infrastructure/persistence/feature_table.py index 48eb214..eff08c8 100644 --- a/server/osa/infrastructure/persistence/feature_table.py +++ b/server/osa/infrastructure/persistence/feature_table.py @@ -6,9 +6,13 @@ from osa.domain.shared.model.hook import ColumnDef from osa.domain.shared.model.value import ValueObject +from osa.infrastructure.persistence.api_naming import feature_pg_schema, feature_pg_table from osa.infrastructure.persistence.column_mapper import map_column +from osa.infrastructure.persistence.tables import records_table -FEATURES_SCHEMA = "features" +# Back-compat re-export for callers that import the constant directly. +# Prefer ``feature_pg_schema()`` in new code. +FEATURES_SCHEMA = feature_pg_schema() AUTO_COLUMN_NAMES = frozenset({"id", "record_srn", "created_at"}) @@ -22,24 +26,35 @@ class FeatureSchema(ValueObject): columns: list[ColumnDef] = [] -def build_feature_table(pg_table: str, schema: FeatureSchema) -> sa.Table: +def build_feature_table(api_feature_name: str, schema: FeatureSchema) -> sa.Table: """Build a SQLAlchemy ``Table`` for a dynamic feature table. + *api_feature_name* is the ```` segment from the API's + ``features..`` path. The PG table name is resolved through + :func:`feature_pg_table` — identity today but kept as a seam. + Returns a ``Table`` with auto columns (``id``, ``record_srn``, ``created_at``) - plus data columns derived from *schema* via :func:`map_column`, in the - ``features`` PG schema. + plus data columns derived from *schema*, in the features PG schema. - Each call creates a disposable ``MetaData`` — these Tables are used for - query building only, not for DDL lifecycle management. 
+ ``record_srn`` carries an ``ON DELETE CASCADE`` FK to ``records.srn`` — the + FK target is the ``Column`` object itself (not a string reference), so + SQLAlchemy resolves it without requiring ``records`` to live in the same + disposable ``MetaData`` as the dynamic table. """ data_columns = [map_column(col_def) for col_def in schema.columns] metadata = sa.MetaData() return sa.Table( - pg_table, + feature_pg_table(api_feature_name), metadata, sa.Column("id", sa.BigInteger, primary_key=True, autoincrement=True), - sa.Column("record_srn", sa.Text, nullable=False, index=True), + sa.Column( + "record_srn", + sa.Text, + sa.ForeignKey(records_table.c.srn, ondelete="CASCADE"), + nullable=False, + index=True, + ), sa.Column( "created_at", sa.DateTime(timezone=True), @@ -47,7 +62,7 @@ def build_feature_table(pg_table: str, schema: FeatureSchema) -> sa.Table: server_default=sa.func.now(), ), *data_columns, - schema=FEATURES_SCHEMA, + schema=feature_pg_schema(), ) diff --git a/server/osa/infrastructure/persistence/mappers/record.py b/server/osa/infrastructure/persistence/mappers/record.py index 97ad7db..a916d75 100644 --- a/server/osa/infrastructure/persistence/mappers/record.py +++ b/server/osa/infrastructure/persistence/mappers/record.py @@ -1,4 +1,11 @@ -"""Record mapper - converts between domain and persistence.""" +"""Record mapper - converts between domain and persistence. + +Feature 076 adds ``schema_id`` + ``schema_version`` columns so a Record's +typed linkage is first-class. ``metadata`` remains the canonical JSONB store; +the typed ``metadata._v`` table is a discovery-optimized +projection written synchronously alongside ``records`` in the same UoW +transaction by ``RecordService.publish_record`` / ``bulk_publish``. +""" from datetime import datetime from typing import Any @@ -7,7 +14,7 @@ from osa.domain.record.model.aggregate import Record from osa.domain.shared.model.source import RecordSource -from osa.domain.shared.model.srn import ConventionSRN, RecordSRN +from osa.domain.shared.model.srn import ConventionSRN, LocalId, RecordSRN, SchemaId, Semver _source_adapter = TypeAdapter(RecordSource) @@ -24,7 +31,11 @@ def row_to_record(row: dict[str, Any]) -> Record: srn=RecordSRN.parse(row["srn"]), source=source, convention_srn=ConventionSRN.parse(row["convention_srn"]), - metadata=row.get("metadata", {}), + schema_id=SchemaId( + id=LocalId(row["schema_id"]), + version=Semver.from_string(row["schema_version"]), + ), + metadata=row.get("metadata") or {}, published_at=published_at, ) @@ -34,6 +45,8 @@ def record_to_dict(record: Record) -> dict[str, Any]: return { "srn": str(record.srn), "convention_srn": str(record.convention_srn), + "schema_id": record.schema_id.id.root, + "schema_version": record.schema_id.version.root, "source": _source_adapter.dump_python(record.source, mode="json"), "metadata": record.metadata, "published_at": record.published_at, diff --git a/server/osa/infrastructure/persistence/metadata_store.py b/server/osa/infrastructure/persistence/metadata_store.py new file mode 100644 index 0000000..b7996d1 --- /dev/null +++ b/server/osa/infrastructure/persistence/metadata_store.py @@ -0,0 +1,378 @@ +"""PostgreSQL implementation of MetadataStore. + +Schema-keyed DDL lifecycle: one metadata table per ``(schema_id, major)`` +pair. The catalog row in ``public.metadata_tables`` is updated in lock-step +with ALTER ADD COLUMN operations so reads can reconstruct the dynamic table +shape without reflection. 
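A rough sketch of the naming and read-back flow, using the helpers defined in this module (``bio-sample`` is an illustrative schema id; ``catalog_row`` stands for a mapping read from ``public.metadata_tables``)::

    schema_id = SchemaId.parse("bio-sample@1.0.0")
    slug = schema_slug(schema_id.id.root)        # "bio_sample"
    pg_table = f"{slug}_v{schema_id.major}"      # "bio_sample_v1"

    # Reads rebuild the dynamic Table shape from the catalog row, not reflection:
    schema = MetadataSchema.model_validate(catalog_row["metadata_schema"])
    table = build_metadata_table(pg_table, schema)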
+""" + +from __future__ import annotations + +import re +from datetime import UTC, date, datetime +from typing import Any, Literal, Sequence + +import sqlalchemy as sa +from sqlalchemy import select, text +from sqlalchemy.dialects.postgresql import insert +from sqlalchemy.ext.asyncio import AsyncEngine, AsyncSession + +from osa.domain.metadata.port.metadata_store import MetadataStore +from osa.domain.semantics.model.value import FieldDefinition, FieldType +from osa.domain.shared.error import ValidationError +from osa.domain.shared.model.hook import ColumnDef +from osa.domain.shared.model.srn import RecordSRN, SchemaId +from osa.infrastructure.persistence.api_naming import metadata_pg_schema +from osa.infrastructure.persistence.column_mapper import map_column +from osa.infrastructure.persistence.metadata_table import ( + MetadataSchema, + build_metadata_table, + check_pg_table_name, + schema_slug, +) +from osa.infrastructure.persistence.tables import metadata_tables_table + + +_JsonType = Literal["string", "number", "integer", "boolean", "array", "object"] + +# Defense-in-depth: validate any string interpolated into a raw DDL statement. +# ``ColumnDef.name`` is declared as ``PgIdentifier`` at the Pydantic layer but +# we re-check here because a) catalog rows round-trip through JSON and a bad +# actor with write access to metadata_tables could smuggle a malicious name +# through, and b) this function's contract should not rely on upstream +# validators that might be refactored away. +_PG_IDENT_RE = re.compile(r"^[a-z][a-z0-9_]{0,62}$") + + +def _safe_ident(name: str) -> str: + if not _PG_IDENT_RE.match(name): + raise ValidationError(f"Refusing to interpolate unsafe PG identifier {name!r} into DDL") + return name + + +_JSON_TYPE_MAP: dict[FieldType, tuple[_JsonType | None, str | None]] = { + FieldType.TEXT: ("string", None), + FieldType.URL: ("string", None), + FieldType.TERM: ("string", None), + FieldType.DATE: ("string", "date"), + FieldType.NUMBER: ("number", None), + FieldType.BOOLEAN: ("boolean", None), +} + + +def _field_to_column(field: FieldDefinition) -> ColumnDef: + """Translate a FieldDefinition into a ColumnDef for the metadata table.""" + json_type, fmt = _JSON_TYPE_MAP.get(field.type, (None, None)) + if json_type is None: + raise ValidationError( + f"Field {field.name!r} has unrepresentable type {field.type!r}. " + "Add a column-mapper entry for this FieldType before using it in a schema.", + field=field.name, + ) + return ColumnDef( + name=field.name, + json_type=json_type, + format=fmt, + required=field.required, + ) + + +class PostgresMetadataStore(MetadataStore): + """DDL + DML for per-schema typed metadata tables.""" + + def __init__(self, engine: AsyncEngine, session: AsyncSession) -> None: + self._engine = engine + self._session = session + + async def ensure_table( + self, + schema_id: SchemaId, + fields: list[FieldDefinition], + ) -> None: + id_str = schema_id.id.root + major = schema_id.major + try: + slug = schema_slug(id_str) + except ValueError as exc: + raise ValidationError(str(exc), field="schema_id") from exc + pg_table = f"{slug}_v{major}" + try: + check_pg_table_name(pg_table) + except ValueError as exc: + raise ValidationError(str(exc), field="schema_id") from exc + + columns = [_field_to_column(f) for f in fields] + metadata_schema = MetadataSchema(columns=columns) + + async with self._engine.begin() as conn: + # Note: the ``metadata`` PG schema is created by migration + # ``076_add_metadata_schema_and_catalog`` and is a precondition + # for this store. 
We don't run ``CREATE SCHEMA IF NOT EXISTS`` + # here because it races on ``pg_namespace`` under concurrency, + # and the migration makes it unnecessary. + + # Serialise concurrent ensure_table() calls for the same + # (schema_id, major) pair. Without this lock, two conventions + # registering simultaneously both pass the "does it exist?" + # check and race on CREATE TABLE, causing the loser to fail + # with DuplicateTable. The advisory lock is released at + # transaction commit. + await conn.execute( + text("SELECT pg_advisory_xact_lock(hashtextextended(:key, 0))"), + {"key": f"{id_str}@v{major}"}, + ) + + existing = ( + ( + await conn.execute( + select(metadata_tables_table).where( + metadata_tables_table.c.schema_id == id_str, + metadata_tables_table.c.schema_major == major, + ) + ) + ) + .mappings() + .first() + ) + + if existing is None: + table = build_metadata_table(pg_table, metadata_schema) + await conn.run_sync(table.metadata.create_all, checkfirst=False) + now = datetime.now(UTC) + await conn.execute( + metadata_tables_table.insert().values( + schema_id=id_str, + schema_slug=slug, + schema_major=major, + schema_versions=[schema_id.render()], + pg_table=pg_table, + metadata_schema=metadata_schema.model_dump(), + created_at=now, + updated_at=now, + ) + ) + return + + # Table exists — possibly evolve. + stored_schema = MetadataSchema.model_validate(existing["metadata_schema"]) + stored_versions: list[str] = list(existing["schema_versions"]) + pg_table = existing["pg_table"] + + _validate_additive(stored_schema.columns, columns) + + new_columns = [ + c for c in columns if c.name not in {s.name for s in stored_schema.columns} + ] + rendered = schema_id.render() + if not new_columns: + if rendered not in stored_versions: + stored_versions.append(rendered) + await conn.execute( + metadata_tables_table.update() + .where(metadata_tables_table.c.id == existing["id"]) + .values( + schema_versions=stored_versions, + updated_at=datetime.now(UTC), + ) + ) + return + + # Apply ALTER ADD COLUMN for each new column + for col_def in new_columns: + await conn.execute(text(_alter_add_column_stmt(pg_table, col_def))) + + merged_columns = stored_schema.columns + new_columns + if rendered not in stored_versions: + stored_versions.append(rendered) + await conn.execute( + metadata_tables_table.update() + .where(metadata_tables_table.c.id == existing["id"]) + .values( + metadata_schema=MetadataSchema(columns=merged_columns).model_dump(), + schema_versions=stored_versions, + updated_at=datetime.now(UTC), + ) + ) + + async def insert( + self, + schema_id: SchemaId, + record_srn: RecordSRN, + values: dict[str, Any], + ) -> None: + await self.insert_many(schema_id, [(record_srn, values)]) + + async def insert_many( + self, + schema_id: SchemaId, + rows: list[tuple[RecordSRN, dict[str, Any]]], + ) -> None: + if not rows: + return + + id_str = schema_id.id.root + major = schema_id.major + + catalog_row = ( + ( + await self._session.execute( + select(metadata_tables_table).where( + metadata_tables_table.c.schema_id == id_str, + metadata_tables_table.c.schema_major == major, + ) + ) + ) + .mappings() + .first() + ) + + if catalog_row is None: + raise ValidationError( + f"No metadata table registered for schema {schema_id.render()} " + f"(id={id_str}, major={major}). 
" + "Ensure the convention has been registered first.", + field="schema_id", + ) + + schema = MetadataSchema.model_validate(catalog_row["metadata_schema"]) + pg_table = catalog_row["pg_table"] + table = build_metadata_table(pg_table, schema) + + col_by_name = {c.name: c for c in schema.columns} + known_names = set(col_by_name.keys()) + + payloads: list[dict[str, Any]] = [] + for record_srn, values in rows: + payload: dict[str, Any] = {} + for k, v in values.items(): + col = col_by_name.get(k) + if col is None: + continue + payload[k] = _coerce_value(col, v, record_srn=str(record_srn)) + payload["record_srn"] = str(record_srn) + payloads.append(payload) + + # Uniform column set across all rows — asyncpg multi-row insert requires it. + # Fill missing columns with None so every payload has the same keys. + all_keys: set[str] = {"record_srn"} | known_names + for p in payloads: + for k in all_keys: + p.setdefault(k, None) + + stmt = insert(table).values(payloads) + update_cols = {c: stmt.excluded[c] for c in all_keys if c != "record_srn"} + if update_cols: + stmt = stmt.on_conflict_do_update( + index_elements=[table.c.record_srn], + set_=update_cols, + ) + else: + stmt = stmt.on_conflict_do_nothing(index_elements=[table.c.record_srn]) + await self._session.execute(stmt) + await self._session.flush() + + +def _validate_additive(existing: Sequence[ColumnDef], incoming: Sequence[ColumnDef]) -> None: + """Raise ValidationError if the incoming column set is not additive.""" + by_name = {c.name: c for c in existing} + for col in incoming: + if col.name not in by_name: + if col.required: + raise ValidationError( + f"Non-additive evolution: new field {col.name!r} is required. " + "New fields in minor/patch bumps must be optional.", + field=col.name, + ) + continue + prev = by_name[col.name] + if prev.json_type != col.json_type or prev.format != col.format: + raise ValidationError( + f"Non-additive evolution: field {col.name!r} changed type " + f"({prev.json_type}/{prev.format} → {col.json_type}/{col.format}).", + field=col.name, + ) + if prev.required is False and col.required is True: + raise ValidationError( + f"Non-additive evolution: field {col.name!r} tightened to required.", + field=col.name, + ) + incoming_names = {c.name for c in incoming} + for prev_name in by_name.keys(): + if prev_name not in incoming_names: + raise ValidationError( + f"Non-additive evolution: field {prev_name!r} was removed.", + field=prev_name, + ) + + +def _alter_add_column_stmt(pg_table: str, col_def: ColumnDef) -> str: + """SQL string to ALTER TABLE ADD COLUMN for a single column definition. + + Both ``pg_table`` and ``col_def.name`` are interpolated into raw SQL, so + they are strictly validated against the PG identifier regex first — any + attempt to smuggle a ``"`` through would otherwise break the quoting and + inject arbitrary DDL. + """ + sql_type = _column_type_sql(map_column(col_def).type) + null_sql = "" if not col_def.required else " NOT NULL" + safe_table = _safe_ident(pg_table) + safe_col = _safe_ident(col_def.name) + return ( + f'ALTER TABLE "{metadata_pg_schema()}"."{safe_table}" ' + f'ADD COLUMN IF NOT EXISTS "{safe_col}" {sql_type}{null_sql}' + ) + + +def _coerce_value(col: ColumnDef, value: Any, *, record_srn: str | None = None) -> Any: + """Coerce a JSONB-read value to match its typed PG column. + + ``records.metadata`` is JSONB, so date/datetime fields come back as ISO + strings. asyncpg won't auto-parse those for DATE / TIMESTAMP columns — + we parse here based on the declared column format. 
+ + Malformed ISO strings are re-raised as ``ValidationError`` so the API + surfaces them as 400 with field context, not a bare 500. + """ + if value is None: + return None + if col.json_type == "string" and col.format == "date": + if isinstance(value, date): + return value + try: + return date.fromisoformat(value) + except (TypeError, ValueError) as exc: + raise ValidationError( + f"Field {col.name!r} expects an ISO-8601 date, got {value!r}" + + (f" (record {record_srn})" if record_srn else ""), + field=col.name, + ) from exc + if col.json_type == "string" and col.format == "date-time": + if isinstance(value, datetime): + return value + try: + return datetime.fromisoformat(value) + except (TypeError, ValueError) as exc: + raise ValidationError( + f"Field {col.name!r} expects an ISO-8601 date-time, got {value!r}" + + (f" (record {record_srn})" if record_srn else ""), + field=col.name, + ) from exc + return value + + +def _column_type_sql(sa_type: Any) -> str: + if isinstance(sa_type, sa.Text): + return "text" + if isinstance(sa_type, sa.DateTime): + return "timestamp with time zone" if sa_type.timezone else "timestamp" + if isinstance(sa_type, sa.Date): + return "date" + if isinstance(sa_type, sa.Uuid): + return "uuid" + if isinstance(sa_type, sa.Float): + return "double precision" + if isinstance(sa_type, sa.BigInteger): + return "bigint" + if isinstance(sa_type, sa.Boolean): + return "boolean" + return "jsonb" diff --git a/server/osa/infrastructure/persistence/metadata_table.py b/server/osa/infrastructure/persistence/metadata_table.py new file mode 100644 index 0000000..5ae0695 --- /dev/null +++ b/server/osa/infrastructure/persistence/metadata_table.py @@ -0,0 +1,117 @@ +"""Shared helpers for building dynamic metadata Table objects. + +Mirrors :mod:`osa.infrastructure.persistence.feature_table` — metadata tables +are schema-keyed typed stores living in the ``metadata`` PG schema, with a +catalog row in ``public.metadata_tables`` per (schema_identity, major) pair. +""" + +from __future__ import annotations + +import re + +import sqlalchemy as sa + +from osa.domain.shared.model.hook import ColumnDef +from osa.domain.shared.model.value import ValueObject +from osa.infrastructure.persistence.api_naming import metadata_pg_schema +from osa.infrastructure.persistence.column_mapper import map_column +from osa.infrastructure.persistence.tables import records_table + +# Back-compat re-export for callers that import the constant directly. +# Prefer ``metadata_pg_schema()`` in new code. +METADATA_SCHEMA = metadata_pg_schema() + +AUTO_COLUMN_NAMES = frozenset({"id", "record_srn", "created_at"}) + +# PG identifier limit under default ``NAMEDATALEN`` (64). Identifiers over +# this are silently truncated by PG, which would cause catalog/table name +# drift — surface the limit as a hard check instead. +PG_IDENT_MAX_LEN = 63 + +# Upper bound for a derived slug — matches :class:`SchemaIdentifier` (3-64). +# The final table name is ``f"{slug}_v{major}"``; that total length is +# checked separately by :func:`check_pg_table_name` at the boundary where +# ``major`` is known. +_SLUG_RE = re.compile(r"^[a-z][a-z0-9_]{2,63}$") + + +class MetadataSchema(ValueObject): + """Typed representation of the ``metadata_tables.metadata_schema`` JSON column.""" + + columns: list[ColumnDef] = [] + + +def schema_slug(title: str) -> str: + """Derive a pg-safe slug from a Schema title. 
+ + Lowercases, replaces runs of non-alphanumerics with a single underscore, + strips leading/trailing underscores, then validates against + ``^[a-z][a-z0-9_]{2,63}$`` (3-64 chars, matching + :class:`SchemaIdentifier`). Raises ``ValueError`` if the derived slug is + empty or cannot be validated. + + Callers that combine the slug with a suffix (e.g. ``_v{major}``) must + separately check the combined length against :data:`PG_IDENT_MAX_LEN`. + """ + normalised = re.sub(r"[^a-z0-9]+", "_", title.strip().lower()).strip("_") + if not normalised or not _SLUG_RE.match(normalised): + raise ValueError( + f"Cannot derive a valid metadata table slug from title {title!r}. " + "Expected a string that maps to ^[a-z][a-z0-9_]{2,63}$." + ) + return normalised + + +def check_pg_table_name(pg_table: str) -> None: + """Raise ``ValueError`` if *pg_table* exceeds the PG identifier limit. + + Without this, PG silently truncates long identifiers at 63 chars, which + would desynchronise the catalog (``metadata_tables.pg_table``) from the + actual table name. + """ + if len(pg_table) > PG_IDENT_MAX_LEN: + raise ValueError( + f"Derived PG table name {pg_table!r} is {len(pg_table)} chars, " + f"exceeds PG's {PG_IDENT_MAX_LEN}-char identifier limit. " + "Use a shorter schema id." + ) + + +def build_metadata_table(pg_table: str, schema: MetadataSchema) -> sa.Table: + """Build a SQLAlchemy ``Table`` for a dynamic metadata table. + + Adds auto columns (``id``, ``record_srn``, ``created_at``) plus data columns + derived from *schema*. ``record_srn`` is ``UNIQUE`` (exactly one metadata + row per record) and carries an ``ON DELETE CASCADE`` FK to ``records.srn``. + The FK target is the ``Column`` object itself, so SQLAlchemy resolves it + without requiring ``records`` to live in the same disposable ``MetaData`` + as the dynamic table. 
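A minimal usage sketch (``species`` is an illustrative column, not a built-in)::

    schema = MetadataSchema(
        columns=[ColumnDef(name="species", json_type="string", format=None, required=True)]
    )
    table = build_metadata_table("bio_sample_v1", schema)
    # table.c holds id, record_srn (UNIQUE, FK -> records.srn), created_at,
    # plus the data columns derived from *schema*.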
+ """ + data_columns = [map_column(col_def) for col_def in schema.columns] + + metadata_obj = sa.MetaData() + return sa.Table( + pg_table, + metadata_obj, + sa.Column("id", sa.BigInteger, primary_key=True, autoincrement=True), + sa.Column( + "record_srn", + sa.Text, + sa.ForeignKey(records_table.c.srn, ondelete="CASCADE"), + nullable=False, + unique=True, + ), + sa.Column( + "created_at", + sa.DateTime(timezone=True), + nullable=False, + server_default=sa.func.now(), + ), + *data_columns, + schema=metadata_pg_schema(), + ) + + +def data_columns(table: sa.Table) -> list[sa.Column]: + """Return only the user-defined data columns, excluding auto columns.""" + return [c for c in table.columns if c.key not in AUTO_COLUMN_NAMES] diff --git a/server/osa/infrastructure/persistence/repository/convention.py b/server/osa/infrastructure/persistence/repository/convention.py index 626f4cc..c2cd050 100644 --- a/server/osa/infrastructure/persistence/repository/convention.py +++ b/server/osa/infrastructure/persistence/repository/convention.py @@ -8,7 +8,7 @@ from osa.domain.deposition.port.convention_repository import ConventionRepository from osa.domain.shared.model.hook import HookDefinition from osa.domain.shared.model.source import IngesterDefinition -from osa.domain.shared.model.srn import ConventionSRN, SchemaSRN +from osa.domain.shared.model.srn import ConventionSRN, LocalId, SchemaId, Semver from osa.infrastructure.persistence.tables import conventions_table @@ -17,7 +17,8 @@ def _convention_to_row(convention: Convention) -> dict[str, Any]: "srn": str(convention.srn), "title": convention.title, "description": convention.description, - "schema_srn": str(convention.schema_srn), + "schema_id": convention.schema_id.id.root, + "schema_version": convention.schema_id.version.root, "file_requirements": convention.file_requirements.model_dump(), "hooks": [h.model_dump() for h in convention.hooks], "source": convention.ingester.model_dump() if convention.ingester else None, @@ -31,7 +32,10 @@ def _row_to_convention(row: dict[str, Any]) -> Convention: srn=ConventionSRN.parse(row["srn"]), title=row["title"], description=row.get("description"), - schema_srn=SchemaSRN.parse(row["schema_srn"]), + schema_id=SchemaId( + id=LocalId(row["schema_id"]), + version=Semver.from_string(row["schema_version"]), + ), file_requirements=FileRequirements.model_validate(row["file_requirements"]), hooks=[HookDefinition.model_validate(h) for h in (row.get("hooks") or [])], ingester=IngesterDefinition.model_validate(source_data) if source_data else None, diff --git a/server/osa/infrastructure/persistence/repository/schema.py b/server/osa/infrastructure/persistence/repository/schema.py index 135fc10..42d911a 100644 --- a/server/osa/infrastructure/persistence/repository/schema.py +++ b/server/osa/infrastructure/persistence/repository/schema.py @@ -1,18 +1,19 @@ from typing import Any, List -from sqlalchemy import insert, select +from sqlalchemy import and_, insert, select from sqlalchemy.ext.asyncio import AsyncSession from osa.domain.semantics.model.schema import Schema from osa.domain.semantics.model.value import FieldDefinition from osa.domain.semantics.port.schema_repository import SchemaRepository -from osa.domain.shared.model.srn import SchemaSRN +from osa.domain.shared.model.srn import LocalId, SchemaId, Semver from osa.infrastructure.persistence.tables import schemas_table def _schema_to_row(schema: Schema) -> dict[str, Any]: return { - "srn": str(schema.srn), + "id": schema.id.id.root, + "version": schema.id.version.root, 
"title": schema.title, "fields": [f.model_dump(mode="json") for f in schema.fields], "created_at": schema.created_at, @@ -22,13 +23,20 @@ def _schema_to_row(schema: Schema) -> dict[str, Any]: def _row_to_schema(row: dict[str, Any]) -> Schema: fields = [FieldDefinition.model_validate(f) for f in row["fields"]] return Schema( - srn=SchemaSRN.parse(row["srn"]), + id=SchemaId(id=LocalId(row["id"]), version=Semver.from_string(row["version"])), title=row["title"], fields=fields, created_at=row["created_at"], ) +def _where_schema_id(schema_id: SchemaId) -> Any: + return and_( + schemas_table.c.id == schema_id.id.root, + schemas_table.c.version == schema_id.version.root, + ) + + class PostgresSemanticsSchemaRepository(SchemaRepository): def __init__(self, session: AsyncSession) -> None: self.session = session @@ -38,8 +46,8 @@ async def save(self, schema: Schema) -> None: await self.session.execute(insert(schemas_table).values(**row)) await self.session.flush() - async def get(self, srn: SchemaSRN) -> Schema | None: - stmt = select(schemas_table).where(schemas_table.c.srn == str(srn)) + async def get(self, schema_id: SchemaId) -> Schema | None: + stmt = select(schemas_table).where(_where_schema_id(schema_id)) result = await self.session.execute(stmt) row = result.mappings().first() return _row_to_schema(dict(row)) if row else None @@ -54,7 +62,7 @@ async def list(self, *, limit: int | None = None, offset: int | None = None) -> result = await self.session.execute(stmt) return [_row_to_schema(dict(r)) for r in result.mappings().all()] - async def exists(self, srn: SchemaSRN) -> bool: - stmt = select(schemas_table.c.srn).where(schemas_table.c.srn == str(srn)) + async def exists(self, schema_id: SchemaId) -> bool: + stmt = select(schemas_table.c.id).where(_where_schema_id(schema_id)) result = await self.session.execute(stmt) return result.first() is not None diff --git a/server/osa/infrastructure/persistence/tables.py b/server/osa/infrastructure/persistence/tables.py index 315f97c..b8b8d22 100644 --- a/server/osa/infrastructure/persistence/tables.py +++ b/server/osa/infrastructure/persistence/tables.py @@ -66,12 +66,15 @@ metadata, Column("srn", String, primary_key=True), Column("convention_srn", Text, nullable=False), + Column("schema_id", Text, nullable=False), + Column("schema_version", Text, nullable=False), Column("source", JSONB, nullable=False), Column("metadata", JSONB, nullable=False), Column("published_at", DateTime(timezone=True), nullable=False), ) Index("idx_records_convention_srn", records_table.c.convention_srn) +Index("idx_records_schema_id", records_table.c.schema_id) Index( "uq_records_source", records_table.c.source["type"].as_string(), @@ -255,12 +258,15 @@ schemas_table = Table( "schemas", metadata, - Column("srn", String, primary_key=True), # Versioned SRN string + Column("id", String, primary_key=True, nullable=False), + Column("version", String, primary_key=True, nullable=False), Column("title", String(255), nullable=False), Column("fields", JSON, nullable=False), # List of FieldDefinition dicts Column("created_at", DateTime(timezone=True), nullable=False), ) +Index("idx_schemas_id", schemas_table.c.id) + # ============================================================================ # CONVENTIONS TABLE (Deposition) @@ -268,10 +274,11 @@ conventions_table = Table( "conventions", metadata, - Column("srn", String, primary_key=True), # Versioned SRN string + Column("srn", String, primary_key=True), # Convention SRN stays as-is (published artifact) Column("title", String(255), 
nullable=False), Column("description", Text, nullable=True), - Column("schema_srn", String, nullable=False), # Reference to schemas.srn + Column("schema_id", String, nullable=False), + Column("schema_version", String, nullable=False), Column("file_requirements", JSON, nullable=False), # FileRequirements as dict Column("hooks", JSON, nullable=False, default=[]), # List of HookDefinition dicts Column("source", JSON, nullable=True), # IngesterDefinition as dict @@ -295,6 +302,26 @@ ) +# ============================================================================ +# METADATA TABLES CATALOG (Typed Metadata — feature 076) +# ============================================================================ +metadata_tables_table = Table( + "metadata_tables", + metadata, + Column("id", Integer, primary_key=True, autoincrement=True), + Column("schema_id", Text, nullable=False), + Column("schema_slug", Text, nullable=False), + Column("schema_major", Integer, nullable=False), + Column("schema_versions", JSONB, nullable=False), + Column("pg_table", Text, nullable=False), + Column("metadata_schema", JSONB, nullable=False), + Column("created_at", DateTime(timezone=True), nullable=False), + Column("updated_at", DateTime(timezone=True), nullable=False), + UniqueConstraint("schema_id", "schema_major", name="uq_metadata_tables_id_major"), + UniqueConstraint("pg_table", name="uq_metadata_tables_pg_table"), +) + + # ============================================================================ # ROLE ASSIGNMENTS TABLE (Authorization) # ============================================================================ diff --git a/server/tests/integration/conftest.py b/server/tests/integration/conftest.py index a784e7a..2ee9a02 100644 --- a/server/tests/integration/conftest.py +++ b/server/tests/integration/conftest.py @@ -1,6 +1,9 @@ """Fixtures for PostgreSQL integration tests.""" +import json import os +from datetime import UTC, datetime +from typing import Any import pytest import pytest_asyncio @@ -21,6 +24,45 @@ def _get_pg_url() -> str: return url +async def seed_record( + engine: AsyncEngine, + *, + srn: str, + convention_srn: str = "urn:osa:localhost:conv:test@1.0.0", + schema_id: str = "test", + schema_version: str = "1.0.0", + source: dict[str, Any] | None = None, + metadata: dict[str, Any] | None = None, + published_at: datetime | None = None, +) -> None: + """Insert a records row directly so typed-table FK inserts succeed. + + Keeps tests independent of the full publish event chain when they only + need a persisted Record to anchor metadata/feature rows against. 
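Typical calls from a test (the SRN and schema values are examples; the defaults above apply when omitted)::

    await seed_record(pg_engine, srn="urn:osa:localhost:rec:rec-001@1")
    await seed_record(
        pg_engine,
        srn="urn:osa:localhost:rec:abc@1",
        schema_id="bio-sample",
        schema_version="1.0.0",
    )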
+ """ + src = source or {"type": "deposition", "id": f"dep-{srn.split(':')[-1]}"} + async with engine.begin() as conn: + await conn.execute( + text( + """ + INSERT INTO records (srn, convention_srn, schema_id, schema_version, + source, metadata, published_at) + VALUES (:srn, :conv, :schema_id, :schema_version, + CAST(:source AS JSONB), CAST(:meta AS JSONB), :published_at) + """ + ), + { + "srn": srn, + "conv": convention_srn, + "schema_id": schema_id, + "schema_version": schema_version, + "source": json.dumps(src), + "meta": json.dumps(metadata or {}), + "published_at": published_at or datetime.now(UTC), + }, + ) + + @pytest_asyncio.fixture async def pg_engine(): """Per-test async engine pointing at osa_test.""" @@ -39,16 +81,26 @@ async def pg_session(pg_engine: AsyncEngine): yield session await session.rollback() - # Truncate all tables after each test + # Truncate static tables + drop the two schemas that hold runtime-created + # dynamic tables (features., metadata._v). Without the + # drop, a dynamic table created by test A survives TRUNCATE and collides + # when test B tries to ensure/create it again. async with pg_engine.begin() as conn: await conn.execute( text( "TRUNCATE TABLE depositions, conventions, schemas, ontologies, " "ontology_terms, events, deliveries, records, validation_runs, " - "feature_tables, users, identities, refresh_tokens, " + "feature_tables, metadata_tables, users, identities, refresh_tokens, " "role_assignments CASCADE" ) ) + await conn.execute(text('DROP SCHEMA IF EXISTS "features" CASCADE')) + await conn.execute(text('DROP SCHEMA IF EXISTS "metadata" CASCADE')) + # Re-create empty ``metadata`` schema. Production relies on the + # migration having created it; tests need to restore that + # invariant after the DROP above. 
+ await conn.execute(text('CREATE SCHEMA "metadata"')) + await conn.execute(text('CREATE SCHEMA "features"')) # Re-seed system user after truncate await ensure_system_user(pg_engine) diff --git a/server/tests/integration/persistence/test_convention_repo.py b/server/tests/integration/persistence/test_convention_repo.py index 632eb79..0e5e818 100644 --- a/server/tests/integration/persistence/test_convention_repo.py +++ b/server/tests/integration/persistence/test_convention_repo.py @@ -20,7 +20,7 @@ IngesterScheduleConfig, InitialRunConfig, ) -from osa.domain.shared.model.srn import ConventionSRN, SchemaSRN +from osa.domain.shared.model.srn import ConventionSRN, SchemaId from osa.infrastructure.persistence.repository.convention import ( PostgresConventionRepository, ) @@ -30,7 +30,7 @@ def _make_convention( *, srn: str = "urn:osa:localhost:conv:test-convention-001@1.0.0", title: str = "Test Convention", - schema_srn: str = "urn:osa:localhost:schema:test-schema-001@1.0.0", + schema_id: str = "test-schema-001@1.0.0", hooks: list[HookDefinition] | None = None, ingester: IngesterDefinition | None = None, ) -> Convention: @@ -38,7 +38,7 @@ def _make_convention( srn=ConventionSRN.parse(srn), title=title, description="A test convention for integration tests", - schema_srn=SchemaSRN.parse(schema_srn), + schema_id=SchemaId.parse(schema_id), file_requirements=FileRequirements( accepted_types=[".csv", ".h5ad"], min_count=1, @@ -99,7 +99,7 @@ async def test_save_and_get(self, pg_session: AsyncSession): assert str(got.srn) == str(conv.srn) assert got.title == conv.title assert got.description == conv.description - assert str(got.schema_srn) == str(conv.schema_srn) + assert str(got.schema_id) == str(conv.schema_id) assert got.file_requirements == conv.file_requirements assert len(got.hooks) == 1 assert got.hooks[0].runtime.image == hook.runtime.image diff --git a/server/tests/integration/persistence/test_discovery_pagination.py b/server/tests/integration/persistence/test_discovery_pagination.py new file mode 100644 index 0000000..49b2fb8 --- /dev/null +++ b/server/tests/integration/persistence/test_discovery_pagination.py @@ -0,0 +1,80 @@ +"""Integration tests for discovery keyset pagination against real Postgres. + +Regression coverage for a production bug where paginating past page 1 raised +``operator does not exist: timestamp with time zone < character varying`` +because the cursor's sort value round-tripped through JSON as a plain string +and was bound as ``VARCHAR`` against the typed ``records.published_at`` column. 
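One common way to express the "page 2" predicate for a ``published_at DESC`` scan, shown only to make the type mismatch concrete (illustrative, not necessarily this store's exact construction; ``cursor_published_at`` and ``cursor_srn`` are hypothetical locals)::

    # from sqlalchemy import and_, or_
    or_(
        records_table.c.published_at < cursor_published_at,
        and_(
            records_table.c.published_at == cursor_published_at,
            records_table.c.srn < cursor_srn,
        ),
    )

The comparison only type-checks when ``cursor_published_at`` is bound as a datetime (timestamptz); binding the cursor's raw ISO string is what produced the error above.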
+""" + +from __future__ import annotations + +from datetime import UTC, datetime + +import pytest +from sqlalchemy.ext.asyncio import AsyncSession + +from osa.domain.discovery.model.value import SortOrder, decode_cursor, encode_cursor +from osa.infrastructure.persistence.adapter.discovery import PostgresDiscoveryReadStore +from osa.infrastructure.persistence.tables import records_table + + +async def _insert_record(session: AsyncSession, srn: str, published_at: datetime) -> None: + await session.execute( + records_table.insert().values( + srn=srn, + convention_srn="urn:osa:localhost:conv:test@1.0.0", + schema_id="test", + schema_version="1.0.0", + source={"type": "test", "id": srn}, + metadata={}, + published_at=published_at, + ) + ) + await session.commit() + + +@pytest.mark.asyncio +class TestDiscoveryPaginationPublishedAt: + async def test_second_page_with_published_at_cursor(self, pg_session: AsyncSession) -> None: + """Fetching page 2 with a cursor must not trip the timestamptz/varchar + mismatch — the bug manifested only on requests that supplied a cursor.""" + store = PostgresDiscoveryReadStore(pg_session) + base = datetime(2026, 4, 7, 9, 0, 0, tzinfo=UTC) + records = [(f"urn:osa:localhost:rec:page-{i}@1", base.replace(second=i)) for i in range(3)] + for srn, ts in records: + await _insert_record(pg_session, srn, ts) + + first_page = await store.search_records( + filter_expr=None, + schema_id=None, + convention_srn=None, + text_fields=[], + q=None, + sort="published_at", + order=SortOrder.DESC, + cursor=None, + limit=2, + ) + assert len(first_page) == 2 + + # Encode + decode the cursor the same way the service does — this is the + # round-trip that previously produced a VARCHAR bind. + last = first_page[-1] + cursor_str = encode_cursor(last.published_at.isoformat(), str(last.srn)) + decoded = decode_cursor(cursor_str) + + second_page = await store.search_records( + filter_expr=None, + schema_id=None, + convention_srn=None, + text_fields=[], + q=None, + sort="published_at", + order=SortOrder.DESC, + cursor=decoded, + limit=2, + ) + + assert len(second_page) == 1 + returned = {str(r.srn) for r in first_page} | {str(r.srn) for r in second_page} + assert returned == {srn for srn, _ in records} diff --git a/server/tests/integration/persistence/test_feature_store.py b/server/tests/integration/persistence/test_feature_store.py index 0c81bc8..a05b28e 100644 --- a/server/tests/integration/persistence/test_feature_store.py +++ b/server/tests/integration/persistence/test_feature_store.py @@ -111,16 +111,21 @@ async def test_create_table_registers_in_catalog( @pytest.mark.asyncio class TestFeatureStoreInsert: async def test_insert_features(self, pg_engine: AsyncEngine, pg_session: AsyncSession): + from tests.integration.conftest import seed_record + store = PostgresFeatureStore(pg_engine, pg_session) hook = _make_hook(name="insert_hook") await store.create_table("insert_hook", hook.feature.columns) + record_srn = "urn:osa:localhost:rec:rec-001@1" + await seed_record(pg_engine, srn=record_srn) + rows = [ {"score": 0.95, "label": "good"}, {"score": 0.42, "label": "poor"}, {"score": 0.78, "label": None}, ] - count = await store.insert_features("insert_hook", "urn:osa:localhost:rec:rec-001@1", rows) + count = await store.insert_features("insert_hook", record_srn, rows) assert count == 3 # Verify data is in the table @@ -171,6 +176,11 @@ async def test_jsonb_column_for_array_and_object( assert col_types["metadata"] == "jsonb" assert col_types["count"] == "bigint" + from tests.integration.conftest 
import seed_record + + record_srn = "urn:osa:localhost:rec:rec-jsonb@1" + await seed_record(pg_engine, srn=record_srn) + # Insert data with JSONB values rows = [ { @@ -179,5 +189,5 @@ async def test_jsonb_column_for_array_and_object( "count": 42, } ] - count = await store.insert_features("jsonb_hook", "urn:osa:localhost:rec:rec-jsonb@1", rows) + count = await store.insert_features("jsonb_hook", record_srn, rows) assert count == 1 diff --git a/server/tests/integration/persistence/test_metadata_store.py b/server/tests/integration/persistence/test_metadata_store.py new file mode 100644 index 0000000..d416213 --- /dev/null +++ b/server/tests/integration/persistence/test_metadata_store.py @@ -0,0 +1,507 @@ +"""Integration tests for PostgresMetadataStore — DDL, UPSERT, FK cascade, additive evolution.""" + +import pytest +from sqlalchemy import text +from sqlalchemy.ext.asyncio import AsyncEngine, AsyncSession + +from osa.domain.semantics.model.value import Cardinality, FieldDefinition, FieldType +from osa.domain.shared.error import ValidationError +from osa.domain.shared.model.srn import RecordSRN, SchemaId +from osa.infrastructure.persistence.metadata_store import PostgresMetadataStore +from osa.infrastructure.persistence.metadata_table import METADATA_SCHEMA + +from tests.integration.conftest import seed_record + +SCHEMA_ID = "bio-sample" +SCHEMA_V1 = SchemaId.parse(f"{SCHEMA_ID}@1.0.0") +SCHEMA_V11 = SchemaId.parse(f"{SCHEMA_ID}@1.1.0") +SCHEMA_V2 = SchemaId.parse(f"{SCHEMA_ID}@2.0.0") + + +def _fields_v1() -> list[FieldDefinition]: + return [ + FieldDefinition( + name="species", + type=FieldType.TEXT, + required=True, + cardinality=Cardinality.EXACTLY_ONE, + ), + FieldDefinition( + name="resolution", + type=FieldType.NUMBER, + required=False, + cardinality=Cardinality.EXACTLY_ONE, + ), + ] + + +def _fields_v11_additive() -> list[FieldDefinition]: + return _fields_v1() + [ + FieldDefinition( + name="collection_site", + type=FieldType.TEXT, + required=False, + cardinality=Cardinality.EXACTLY_ONE, + ), + ] + + +def _fields_rename() -> list[FieldDefinition]: + # 'species' renamed to 'organism' — not additive. 
+ return [ + FieldDefinition( + name="organism", + type=FieldType.TEXT, + required=True, + cardinality=Cardinality.EXACTLY_ONE, + ), + FieldDefinition( + name="resolution", + type=FieldType.NUMBER, + required=False, + cardinality=Cardinality.EXACTLY_ONE, + ), + ] + + +async def _table_exists(engine: AsyncEngine, pg_table: str) -> bool: + async with engine.begin() as conn: + result = await conn.execute( + text( + "SELECT EXISTS (SELECT 1 FROM information_schema.tables " + "WHERE table_schema = :s AND table_name = :t)" + ), + {"s": METADATA_SCHEMA, "t": pg_table}, + ) + return bool(result.scalar()) + + +async def _column_names(engine: AsyncEngine, pg_table: str) -> list[str]: + async with engine.begin() as conn: + result = await conn.execute( + text( + "SELECT column_name FROM information_schema.columns " + "WHERE table_schema = :s AND table_name = :t " + "ORDER BY ordinal_position" + ), + {"s": METADATA_SCHEMA, "t": pg_table}, + ) + return [row[0] for row in result.fetchall()] + + +@pytest.mark.asyncio +class TestEnsureTable: + async def test_creates_table_and_catalog_row( + self, pg_engine: AsyncEngine, pg_session: AsyncSession + ): + store = PostgresMetadataStore(pg_engine, pg_session) + await store.ensure_table(SCHEMA_V1, _fields_v1()) + + assert await _table_exists(pg_engine, "bio_sample_v1") + cols = await _column_names(pg_engine, "bio_sample_v1") + for expected in ("id", "record_srn", "created_at", "species", "resolution"): + assert expected in cols + + async with pg_engine.begin() as conn: + row = ( + await conn.execute( + text( + "SELECT schema_id, schema_major, pg_table, schema_versions " + "FROM metadata_tables WHERE schema_id = :id" + ), + {"id": SCHEMA_ID}, + ) + ).first() + assert row is not None + assert row[0] == SCHEMA_ID + assert row[1] == 1 + assert row[2] == "bio_sample_v1" + assert str(SCHEMA_V1) in row[3] + + async def test_idempotent_on_same_version( + self, pg_engine: AsyncEngine, pg_session: AsyncSession + ): + store = PostgresMetadataStore(pg_engine, pg_session) + await store.ensure_table(SCHEMA_V1, _fields_v1()) + # Second call with same SRN should not raise and should not duplicate catalog rows. + await store.ensure_table(SCHEMA_V1, _fields_v1()) + + async with pg_engine.begin() as conn: + count = ( + await conn.execute( + text("SELECT COUNT(*) FROM metadata_tables WHERE schema_id = :id"), + {"id": SCHEMA_ID}, + ) + ).scalar() + assert count == 1 + + async def test_foreign_key_cascade_on_record_srn( + self, pg_engine: AsyncEngine, pg_session: AsyncSession + ): + store = PostgresMetadataStore(pg_engine, pg_session) + await store.ensure_table(SCHEMA_V1, _fields_v1()) + + async with pg_engine.begin() as conn: + constraint = ( + await conn.execute( + text( + "SELECT confdeltype FROM pg_constraint " + "WHERE conrelid = 'metadata.bio_sample_v1'::regclass " + "AND contype = 'f'" + ) + ) + ).scalar() + # 'c' = CASCADE in pg_constraint.confdeltype. asyncpg returns the + # Postgres "char" type as bytes; normalize for comparison. + if isinstance(constraint, bytes): + constraint = constraint.decode() + assert constraint == "c" + + +@pytest.mark.asyncio +class TestDdlInjectionGuard: + """Defense-in-depth: raw DDL interpolation must refuse bad identifiers.""" + + async def test_ddl_injection_in_field_name_rejected( + self, pg_engine: AsyncEngine, pg_session: AsyncSession + ): + """A field name with a quote or injection payload must never reach + the ALTER TABLE SQL. 
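For example, interpolating the unvalidated payload ``species"; DROP TABLE records; --`` into the quoted column slot of ``_alter_add_column_stmt`` would produce roughly (``t`` is a placeholder table name)::

    ALTER TABLE "metadata"."t" ADD COLUMN IF NOT EXISTS "species"; DROP TABLE records; --" text

i.e. the embedded quote terminates the identifier early and injects arbitrary trailing SQL.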
``_safe_ident`` rejects at the DDL boundary.""" + from osa.infrastructure.persistence.metadata_store import _safe_ident + + with pytest.raises(ValidationError, match="unsafe PG identifier"): + _safe_ident('species"; DROP TABLE records; --') + with pytest.raises(ValidationError, match="unsafe PG identifier"): + _safe_ident("has-hyphen") + with pytest.raises(ValidationError, match="unsafe PG identifier"): + _safe_ident("1starts_with_digit") + with pytest.raises(ValidationError, match="unsafe PG identifier"): + _safe_ident("") + # Valid identifiers pass through. + assert _safe_ident("species") == "species" + assert _safe_ident("bio_sample_v1") == "bio_sample_v1" + + +@pytest.mark.asyncio +class TestConcurrentEnsureTable: + """TOCTOU defense: two ensure_table calls for the same schema must not + both try to CREATE TABLE. The advisory lock serialises them; one wins, + the other sees the catalog row and no-ops.""" + + async def test_concurrent_ensure_table_does_not_raise( + self, pg_engine: AsyncEngine, pg_session: AsyncSession + ): + import asyncio + + store_a = PostgresMetadataStore(pg_engine, pg_session) + store_b = PostgresMetadataStore(pg_engine, pg_session) + # Run both concurrently. Without the advisory lock, the second + # would either race on SELECT and raise DuplicateTable on CREATE, + # or raise on the catalog INSERT unique violation. + await asyncio.gather( + store_a.ensure_table(SCHEMA_V1, _fields_v1()), + store_b.ensure_table(SCHEMA_V1, _fields_v1()), + ) + async with pg_engine.begin() as conn: + count = ( + await conn.execute( + text("SELECT COUNT(*) FROM metadata_tables WHERE schema_id = :id"), + {"id": SCHEMA_ID}, + ) + ).scalar() + assert count == 1 + + +@pytest.mark.asyncio +class TestInsert: + async def test_insert_typed_row(self, pg_engine: AsyncEngine, pg_session: AsyncSession): + store = PostgresMetadataStore(pg_engine, pg_session) + await store.ensure_table(SCHEMA_V1, _fields_v1()) + + record_srn = RecordSRN.parse("urn:osa:localhost:rec:abc@1") + await seed_record( + pg_engine, + srn=str(record_srn), + schema_id=SCHEMA_V1.id.root, + schema_version=SCHEMA_V1.version.root, + ) + + await store.insert( + SCHEMA_V1, + record_srn, + {"species": "Homo sapiens", "resolution": 3.5}, + ) + await pg_session.commit() + + async with pg_engine.begin() as conn: + row = ( + await conn.execute( + text( + f"SELECT record_srn, species, resolution " + f'FROM "{METADATA_SCHEMA}"."bio_sample_v1"' + ) + ) + ).first() + assert row is not None + assert row[0] == str(record_srn) + assert row[1] == "Homo sapiens" + assert row[2] == 3.5 + + async def test_insert_is_idempotent_on_duplicate_delivery( + self, pg_engine: AsyncEngine, pg_session: AsyncSession + ): + store = PostgresMetadataStore(pg_engine, pg_session) + await store.ensure_table(SCHEMA_V1, _fields_v1()) + + record_srn = RecordSRN.parse("urn:osa:localhost:rec:dup@1") + await seed_record( + pg_engine, + srn=str(record_srn), + schema_id=SCHEMA_V1.id.root, + schema_version=SCHEMA_V1.version.root, + ) + + await store.insert(SCHEMA_V1, record_srn, {"species": "Mus musculus", "resolution": 1.0}) + await store.insert(SCHEMA_V1, record_srn, {"species": "Mus musculus", "resolution": 1.0}) + await pg_session.commit() + + async with pg_engine.begin() as conn: + count = ( + await conn.execute( + text(f'SELECT COUNT(*) FROM "{METADATA_SCHEMA}"."bio_sample_v1"') + ) + ).scalar() + assert count == 1 + + async def test_insert_many_bulk_upserts_rows( + self, pg_engine: AsyncEngine, pg_session: AsyncSession + ): + store = PostgresMetadataStore(pg_engine, 
pg_session) + await store.ensure_table(SCHEMA_V1, _fields_v1()) + + record_srns = [RecordSRN.parse(f"urn:osa:localhost:rec:bulk-{i}@1") for i in range(5)] + for srn in record_srns: + await seed_record( + pg_engine, + srn=str(srn), + schema_id=SCHEMA_V1.id.root, + schema_version=SCHEMA_V1.version.root, + ) + + rows = [ + (srn, {"species": f"species-{i}", "resolution": float(i)}) + for i, srn in enumerate(record_srns) + ] + await store.insert_many(SCHEMA_V1, rows) + await pg_session.commit() + + async with pg_engine.begin() as conn: + count = ( + await conn.execute( + text(f'SELECT COUNT(*) FROM "{METADATA_SCHEMA}"."bio_sample_v1"') + ) + ).scalar() + assert count == 5 + + async def test_insert_many_empty_rows_noop( + self, pg_engine: AsyncEngine, pg_session: AsyncSession + ): + store = PostgresMetadataStore(pg_engine, pg_session) + await store.ensure_table(SCHEMA_V1, _fields_v1()) + # Must not raise, must not hit the DB + await store.insert_many(SCHEMA_V1, []) + + async def test_insert_many_coerces_dates_per_row( + self, pg_engine: AsyncEngine, pg_session: AsyncSession + ): + """Every row in a batch gets type coercion applied independently.""" + from datetime import date + + fields = [ + FieldDefinition( + name="species", + type=FieldType.TEXT, + required=True, + cardinality=Cardinality.EXACTLY_ONE, + ), + FieldDefinition( + name="collected_on", + type=FieldType.DATE, + required=True, + cardinality=Cardinality.EXACTLY_ONE, + ), + ] + dated_schema = SchemaId.parse("dated-bulk@1.0.0") + store = PostgresMetadataStore(pg_engine, pg_session) + await store.ensure_table(dated_schema, fields) + + srns = [RecordSRN.parse(f"urn:osa:localhost:rec:dated-bulk-{i}@1") for i in range(3)] + for srn in srns: + await seed_record( + pg_engine, + srn=str(srn), + schema_id=dated_schema.id.root, + schema_version=dated_schema.version.root, + ) + + rows = [ + (srns[0], {"species": "A", "collected_on": "2026-01-01"}), + (srns[1], {"species": "B", "collected_on": "2026-02-02"}), + (srns[2], {"species": "C", "collected_on": "2026-03-03"}), + ] + await store.insert_many(dated_schema, rows) + await pg_session.commit() + + async with pg_engine.begin() as conn: + result = ( + await conn.execute( + text( + f"SELECT species, collected_on " + f'FROM "{METADATA_SCHEMA}"."dated_bulk_v1" ORDER BY species' + ) + ) + ).all() + assert [(r[0], r[1]) for r in result] == [ + ("A", date(2026, 1, 1)), + ("B", date(2026, 2, 2)), + ("C", date(2026, 3, 3)), + ] + + async def test_insert_coerces_iso_date_string_to_date_column( + self, pg_engine: AsyncEngine, pg_session: AsyncSession + ): + """JSONB-stored metadata hands date/datetime values back as ISO strings; + asyncpg won't auto-parse those to DATE / TIMESTAMP. The store must + coerce them based on the declared column format.""" + fields = [ + FieldDefinition( + name="species", + type=FieldType.TEXT, + required=True, + cardinality=Cardinality.EXACTLY_ONE, + ), + FieldDefinition( + name="collected_on", + type=FieldType.DATE, + required=True, + cardinality=Cardinality.EXACTLY_ONE, + ), + ] + dated_schema = SchemaId.parse("dated-sample@1.0.0") + store = PostgresMetadataStore(pg_engine, pg_session) + await store.ensure_table(dated_schema, fields) + + record_srn = RecordSRN.parse("urn:osa:localhost:rec:dated@1") + await seed_record( + pg_engine, + srn=str(record_srn), + schema_id=dated_schema.id.root, + schema_version=dated_schema.version.root, + ) + + # Value as it would arrive from records.metadata JSONB — a string, + # not a datetime.date. Must not raise. 
+ await store.insert( + dated_schema, + record_srn, + {"species": "Homo sapiens", "collected_on": "2026-04-17"}, + ) + await pg_session.commit() + + async with pg_engine.begin() as conn: + row = ( + await conn.execute( + text(f'SELECT collected_on FROM "{METADATA_SCHEMA}"."dated_sample_v1"') + ) + ).first() + from datetime import date + + assert row is not None + assert row[0] == date(2026, 4, 17) + + async def test_cascade_delete_removes_metadata_row( + self, pg_engine: AsyncEngine, pg_session: AsyncSession + ): + store = PostgresMetadataStore(pg_engine, pg_session) + await store.ensure_table(SCHEMA_V1, _fields_v1()) + + record_srn = RecordSRN.parse("urn:osa:localhost:rec:cascade@1") + await seed_record( + pg_engine, + srn=str(record_srn), + schema_id=SCHEMA_V1.id.root, + schema_version=SCHEMA_V1.version.root, + ) + await store.insert(SCHEMA_V1, record_srn, {"species": "Cascade", "resolution": 0.1}) + await pg_session.commit() + + async with pg_engine.begin() as conn: + await conn.execute( + text("DELETE FROM records WHERE srn = :srn"), {"srn": str(record_srn)} + ) + + async with pg_engine.begin() as conn: + count = ( + await conn.execute( + text(f'SELECT COUNT(*) FROM "{METADATA_SCHEMA}"."bio_sample_v1"') + ) + ).scalar() + assert count == 0 + + +@pytest.mark.asyncio +class TestAdditiveEvolution: + async def test_add_column_on_minor_bump(self, pg_engine: AsyncEngine, pg_session: AsyncSession): + store = PostgresMetadataStore(pg_engine, pg_session) + await store.ensure_table(SCHEMA_V1, _fields_v1()) + cols_before = await _column_names(pg_engine, "bio_sample_v1") + assert "collection_site" not in cols_before + + await store.ensure_table(SCHEMA_V11, _fields_v11_additive()) + cols_after = await _column_names(pg_engine, "bio_sample_v1") + assert "collection_site" in cols_after + + async def test_catalog_lineage_appended(self, pg_engine: AsyncEngine, pg_session: AsyncSession): + store = PostgresMetadataStore(pg_engine, pg_session) + await store.ensure_table(SCHEMA_V1, _fields_v1()) + await store.ensure_table(SCHEMA_V11, _fields_v11_additive()) + + async with pg_engine.begin() as conn: + versions = ( + await conn.execute( + text( + "SELECT schema_versions FROM metadata_tables " + "WHERE schema_id = :id AND schema_major = 1" + ), + {"id": SCHEMA_ID}, + ) + ).scalar() + assert str(SCHEMA_V1) in versions + assert str(SCHEMA_V11) in versions + + +@pytest.mark.asyncio +class TestNonAdditiveRejection: + async def test_rename_raises(self, pg_engine: AsyncEngine, pg_session: AsyncSession): + store = PostgresMetadataStore(pg_engine, pg_session) + await store.ensure_table(SCHEMA_V1, _fields_v1()) + + with pytest.raises(ValidationError, match="Non-additive"): + await store.ensure_table(SCHEMA_V11, _fields_rename()) + + async def test_required_new_field_raises( + self, pg_engine: AsyncEngine, pg_session: AsyncSession + ): + store = PostgresMetadataStore(pg_engine, pg_session) + await store.ensure_table(SCHEMA_V1, _fields_v1()) + + bad = _fields_v1() + [ + FieldDefinition( + name="must_have", + type=FieldType.TEXT, + required=True, + cardinality=Cardinality.EXACTLY_ONE, + ) + ] + with pytest.raises(ValidationError, match="required"): + await store.ensure_table(SCHEMA_V11, bad) diff --git a/server/tests/integration/test_bulk_publish_dual_write.py b/server/tests/integration/test_bulk_publish_dual_write.py new file mode 100644 index 0000000..ab11f35 --- /dev/null +++ b/server/tests/integration/test_bulk_publish_dual_write.py @@ -0,0 +1,206 @@ +"""Integration tests for dual-write of records + typed metadata. 
+ +``RecordService.bulk_publish`` and ``RecordService.publish_record`` now write +both the canonical ``records`` row and the typed ``metadata._v`` +row atomically in one transaction. These tests verify: + +- Both rows land on a successful publish. +- A malformed metadata value rolls back the whole transaction — no partial + state where ``records`` has a row but the typed table doesn't. +- ``ConventionService.create_convention`` creates the typed table inline + (no event-handler race window). +""" + +from __future__ import annotations + +from unittest.mock import AsyncMock +from uuid import uuid4 + +import pytest +from sqlalchemy import text +from sqlalchemy.ext.asyncio import AsyncEngine, AsyncSession + +from osa.domain.deposition.model.value import FileRequirements +from osa.domain.deposition.service.convention import ConventionService +from osa.domain.metadata.service.metadata import MetadataService +from osa.domain.record.model.draft import RecordDraft +from osa.domain.record.service import RecordService +from osa.domain.semantics.model.value import Cardinality, FieldDefinition, FieldType +from osa.domain.semantics.service.schema import SchemaService +from osa.domain.shared.model.source import DepositionSource +from osa.domain.shared.model.srn import ConventionSRN, Domain, SchemaIdentifier +from osa.infrastructure.persistence.metadata_store import PostgresMetadataStore +from osa.infrastructure.persistence.repository.convention import PostgresConventionRepository +from osa.infrastructure.persistence.repository.ontology import PostgresOntologyRepository +from osa.infrastructure.persistence.repository.record import PostgresRecordRepository +from osa.infrastructure.persistence.repository.schema import PostgresSemanticsSchemaRepository + + +def _fields() -> list[FieldDefinition]: + return [ + FieldDefinition( + name="species", + type=FieldType.TEXT, + required=True, + cardinality=Cardinality.EXACTLY_ONE, + ), + FieldDefinition( + name="resolution", + type=FieldType.NUMBER, + required=False, + cardinality=Cardinality.EXACTLY_ONE, + ), + ] + + +async def _register_convention( + pg_engine: AsyncEngine, + pg_session: AsyncSession, + slug: str = "dual-write-sample", +) -> ConventionService: + metadata_store = PostgresMetadataStore(pg_engine, pg_session) + metadata_service = MetadataService(metadata_store=metadata_store) + schema_service = SchemaService( + schema_repo=PostgresSemanticsSchemaRepository(pg_session), + ontology_repo=PostgresOntologyRepository(pg_session), + node_domain=Domain("localhost"), + ) + convention_service = ConventionService( + convention_repo=PostgresConventionRepository(pg_session), + schema_service=schema_service, + metadata_service=metadata_service, + outbox=AsyncMock(), + node_domain=Domain("localhost"), + ) + await convention_service.create_convention( + id=SchemaIdentifier(slug), + title="Dual Write Sample", + version="1.0.0", + schema=_fields(), + file_requirements=FileRequirements(accepted_types=[], max_count=0, max_file_size=0), + ) + await pg_session.commit() + return convention_service + + +@pytest.mark.asyncio +class TestConventionCreatesTypedTableInline: + async def test_typed_table_exists_immediately_after_create_convention( + self, pg_engine: AsyncEngine, pg_session: AsyncSession + ): + """No event-handler race window — the table exists in the same txn.""" + await _register_convention(pg_engine, pg_session, slug="inline-create") + + async with pg_engine.begin() as conn: + exists = ( + await conn.execute( + text( + "SELECT EXISTS (SELECT 1 FROM 
information_schema.tables " + "WHERE table_schema = 'metadata' AND table_name = 'inline_create_v1')" + ) + ) + ).scalar() + assert exists is True + + +@pytest.mark.asyncio +class TestBulkPublishDualWrite: + async def test_bulk_publish_writes_both_tables( + self, pg_engine: AsyncEngine, pg_session: AsyncSession + ): + await _register_convention(pg_engine, pg_session, slug="bulk-dual") + + # Fetch the convention SRN to attach drafts to. + async with pg_engine.begin() as conn: + conv_srn_str = ( + await conn.execute(text("SELECT srn FROM conventions LIMIT 1")) + ).scalar() + assert conv_srn_str is not None + + record_service = RecordService( + record_repo=PostgresRecordRepository(pg_session), + convention_repo=PostgresConventionRepository(pg_session), + metadata_service=MetadataService( + metadata_store=PostgresMetadataStore(pg_engine, pg_session), + ), + outbox=AsyncMock(), + node_domain=Domain("localhost"), + feature_reader=AsyncMock(), + ) + + drafts = [ + RecordDraft( + source=DepositionSource(id=f"dep-{uuid4()}"), + metadata={"species": "Homo sapiens", "resolution": 2.0 + i * 0.1}, + convention_srn=ConventionSRN.parse(conv_srn_str), + ) + for i in range(3) + ] + + published = await record_service.bulk_publish(drafts) + await pg_session.commit() + + assert len(published) == 3 + + async with pg_engine.begin() as conn: + records_count = ( + await conn.execute( + text("SELECT COUNT(*) FROM records WHERE schema_id = 'bulk-dual'") + ) + ).scalar() + typed_count = ( + await conn.execute(text('SELECT COUNT(*) FROM "metadata"."bulk_dual_v1"')) + ).scalar() + assert records_count == 3 + assert typed_count == 3 + + async def test_malformed_metadata_rolls_back_everything( + self, pg_engine: AsyncEngine, pg_session: AsyncSession + ): + """A type error in the typed write must fail the whole transaction — + no orphan row left in ``records``.""" + await _register_convention(pg_engine, pg_session, slug="rollback-sample") + + async with pg_engine.begin() as conn: + conv_srn_str = ( + await conn.execute( + text("SELECT srn FROM conventions WHERE schema_id = 'rollback-sample'") + ) + ).scalar() + + record_service = RecordService( + record_repo=PostgresRecordRepository(pg_session), + convention_repo=PostgresConventionRepository(pg_session), + metadata_service=MetadataService( + metadata_store=PostgresMetadataStore(pg_engine, pg_session), + ), + outbox=AsyncMock(), + node_domain=Domain("localhost"), + feature_reader=AsyncMock(), + ) + + # 'resolution' expects a NUMBER; pass a non-coercible string. 
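+        # Why this is expected to fail (an assumption about the typed table, not
+        # asserted here): 'resolution' maps to a numeric column, so binding the
+        # string "not-a-number" should make the typed INSERT raise (asyncpg
+        # DataError or a coercion error in the store). Because the records INSERT
+        # and the typed INSERT share one transaction, that error must undo both.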
+ drafts = [ + RecordDraft( + source=DepositionSource(id=f"dep-{uuid4()}"), + metadata={"species": "A", "resolution": "not-a-number"}, + convention_srn=ConventionSRN.parse(conv_srn_str), + ) + ] + + with pytest.raises(Exception): # noqa: BLE001 — asyncpg DataError or similar + await record_service.bulk_publish(drafts) + await pg_session.commit() + await pg_session.rollback() + + async with pg_engine.begin() as conn: + records_count = ( + await conn.execute( + text("SELECT COUNT(*) FROM records WHERE schema_id = 'rollback-sample'") + ) + ).scalar() + typed_count = ( + await conn.execute(text('SELECT COUNT(*) FROM "metadata"."rollback_sample_v1"')) + ).scalar() + assert records_count == 0 + assert typed_count == 0 diff --git a/server/tests/integration/test_discovery_compound_postgres.py b/server/tests/integration/test_discovery_compound_postgres.py new file mode 100644 index 0000000..f752c0f --- /dev/null +++ b/server/tests/integration/test_discovery_compound_postgres.py @@ -0,0 +1,162 @@ +"""Integration tests for compound OR/NOT discovery filters against real PG.""" + +import pytest +from sqlalchemy.ext.asyncio import AsyncEngine, AsyncSession + +from osa.domain.discovery.model.refs import MetadataFieldRef +from osa.domain.discovery.model.value import ( + And, + FilterOperator, + Not, + Or, + Predicate, + SortOrder, +) +from osa.domain.semantics.model.value import Cardinality, FieldDefinition, FieldType +from osa.domain.shared.model.srn import RecordSRN, SchemaId +from osa.infrastructure.persistence.adapter.discovery import PostgresDiscoveryReadStore +from osa.infrastructure.persistence.metadata_store import PostgresMetadataStore + +from tests.integration.conftest import seed_record + +SCHEMA_V1 = SchemaId.parse("bio-sample@1.0.0") +FIELD_TYPES = {"species": FieldType.TEXT, "resolution": FieldType.NUMBER} + + +def _fields() -> list[FieldDefinition]: + return [ + FieldDefinition( + name="species", + type=FieldType.TEXT, + required=True, + cardinality=Cardinality.EXACTLY_ONE, + ), + FieldDefinition( + name="resolution", + type=FieldType.NUMBER, + required=False, + cardinality=Cardinality.EXACTLY_ONE, + ), + ] + + +@pytest.fixture +async def seeded_store(pg_engine: AsyncEngine, pg_session: AsyncSession) -> PostgresMetadataStore: + from datetime import UTC, datetime + + from osa.domain.semantics.model.schema import Schema + from osa.infrastructure.persistence.repository.schema import ( + PostgresSemanticsSchemaRepository, + ) + + store = PostgresMetadataStore(pg_engine, pg_session) + await store.ensure_table(SCHEMA_V1, _fields()) + + repo = PostgresSemanticsSchemaRepository(pg_session) + await repo.save( + Schema(id=SCHEMA_V1, title="bio_sample", fields=_fields(), created_at=datetime.now(UTC)) + ) + + rows = [ + ("rec-a1", "Homo sapiens", 3.5), + ("rec-b1", "Homo sapiens", 1.0), + ("rec-c1", "Mus musculus", 3.5), + ("rec-d1", "Drosophila", 0.5), + ] + for rid, sp, res in rows: + srn = RecordSRN.parse(f"urn:osa:localhost:rec:{rid}@1") + await seed_record( + pg_engine, + srn=str(srn), + schema_id=SCHEMA_V1.id.root, + schema_version=SCHEMA_V1.version.root, + ) + await store.insert(SCHEMA_V1, srn, {"species": sp, "resolution": res}) + + await pg_session.commit() + return store + + +def _pred(field: str, op: FilterOperator, value: object) -> Predicate: + return Predicate(field=MetadataFieldRef(field=field), op=op, value=value) + + +@pytest.mark.asyncio +class TestCompound: + async def test_or_tree(self, pg_engine: AsyncEngine, pg_session: AsyncSession, seeded_store): + read_store = 
PostgresDiscoveryReadStore(pg_session) + # species = Homo sapiens OR resolution < 1.0 + tree = Or( + operands=[ + _pred("species", FilterOperator.EQ, "Homo sapiens"), + _pred("resolution", FilterOperator.LT, 1.0), + ] + ) + results = await read_store.search_records( + filter_expr=tree, + schema_id=SCHEMA_V1, + convention_srn=None, + text_fields=[], + q=None, + sort="published_at", + order=SortOrder.DESC, + cursor=None, + limit=10, + field_types=FIELD_TYPES, + ) + srns = {str(r.srn) for r in results} + # a, b (species match) + d (resolution 0.5) — not c + assert srns == { + "urn:osa:localhost:rec:rec-a1@1", + "urn:osa:localhost:rec:rec-b1@1", + "urn:osa:localhost:rec:rec-d1@1", + } + + async def test_not_tree(self, pg_engine: AsyncEngine, pg_session: AsyncSession, seeded_store): + read_store = PostgresDiscoveryReadStore(pg_session) + tree = Not(operand=_pred("species", FilterOperator.EQ, "Homo sapiens")) + results = await read_store.search_records( + filter_expr=tree, + schema_id=SCHEMA_V1, + convention_srn=None, + text_fields=[], + q=None, + sort="published_at", + order=SortOrder.DESC, + cursor=None, + limit=10, + field_types=FIELD_TYPES, + ) + srns = {str(r.srn) for r in results} + assert srns == {"urn:osa:localhost:rec:rec-c1@1", "urn:osa:localhost:rec:rec-d1@1"} + + async def test_nested_and_or( + self, pg_engine: AsyncEngine, pg_session: AsyncSession, seeded_store + ): + read_store = PostgresDiscoveryReadStore(pg_session) + # resolution >= 3.0 AND (species = Homo sapiens OR species = Mus musculus) + tree = And( + operands=[ + _pred("resolution", FilterOperator.GTE, 3.0), + Or( + operands=[ + _pred("species", FilterOperator.EQ, "Homo sapiens"), + _pred("species", FilterOperator.EQ, "Mus musculus"), + ] + ), + ] + ) + results = await read_store.search_records( + filter_expr=tree, + schema_id=SCHEMA_V1, + convention_srn=None, + text_fields=[], + q=None, + sort="published_at", + order=SortOrder.DESC, + cursor=None, + limit=10, + field_types=FIELD_TYPES, + ) + srns = {str(r.srn) for r in results} + assert srns == {"urn:osa:localhost:rec:rec-a1@1", "urn:osa:localhost:rec:rec-c1@1"} diff --git a/server/tests/integration/test_discovery_cross_join_postgres.py b/server/tests/integration/test_discovery_cross_join_postgres.py new file mode 100644 index 0000000..c2ed9ec --- /dev/null +++ b/server/tests/integration/test_discovery_cross_join_postgres.py @@ -0,0 +1,254 @@ +"""Integration tests for cross-domain JOINs between records ⋈ features in discovery.""" + +import pytest +from sqlalchemy.ext.asyncio import AsyncEngine, AsyncSession + +from osa.domain.discovery.model.refs import FeatureFieldRef, MetadataFieldRef +from osa.domain.discovery.model.value import And, FilterOperator, Not, Predicate, SortOrder +from osa.domain.semantics.model.value import Cardinality, FieldDefinition, FieldType +from osa.domain.shared.error import ValidationError +from osa.domain.shared.model.hook import ColumnDef +from osa.domain.shared.model.srn import RecordSRN, SchemaId +from osa.infrastructure.persistence.adapter.discovery import PostgresDiscoveryReadStore +from osa.infrastructure.persistence.feature_store import PostgresFeatureStore +from osa.infrastructure.persistence.metadata_store import PostgresMetadataStore + +from tests.integration.conftest import seed_record + +SCHEMA_V1 = SchemaId.parse("bio-sample@1.0.0") +FIELD_TYPES = {"species": FieldType.TEXT} + + +def _metadata_fields() -> list[FieldDefinition]: + return [ + FieldDefinition( + name="species", + type=FieldType.TEXT, + required=True, + 
cardinality=Cardinality.EXACTLY_ONE, + ), + ] + + +def _feature_columns() -> list[ColumnDef]: + return [ + ColumnDef(name="confidence", json_type="number", required=True), + ] + + +@pytest.fixture +async def seeded_both(pg_engine: AsyncEngine, pg_session: AsyncSession): + from datetime import UTC, datetime + + from osa.domain.semantics.model.schema import Schema + from osa.infrastructure.persistence.repository.schema import ( + PostgresSemanticsSchemaRepository, + ) + + mstore = PostgresMetadataStore(pg_engine, pg_session) + await mstore.ensure_table(SCHEMA_V1, _metadata_fields()) + + fstore = PostgresFeatureStore(pg_engine, pg_session) + await fstore.create_table("cell_classifier", _feature_columns()) + + repo = PostgresSemanticsSchemaRepository(pg_session) + await repo.save( + Schema( + id=SCHEMA_V1, + title="bio_sample", + fields=_metadata_fields(), + created_at=datetime.now(UTC), + ) + ) + + # r1: Homo sapiens + confidence 0.95 + # r2: Homo sapiens + confidence 0.5 + # r3: Mus musculus + confidence 0.95 + for rid, sp, conf in [ + ("rec-r1", "Homo sapiens", 0.95), + ("rec-r2", "Homo sapiens", 0.5), + ("rec-r3", "Mus musculus", 0.95), + ]: + srn = RecordSRN.parse(f"urn:osa:localhost:rec:{rid}@1") + await seed_record( + pg_engine, + srn=str(srn), + schema_id=SCHEMA_V1.id.root, + schema_version=SCHEMA_V1.version.root, + ) + await mstore.insert(SCHEMA_V1, srn, {"species": sp}) + await fstore.insert_features("cell_classifier", str(srn), [{"confidence": conf}]) + + await pg_session.commit() + return mstore, fstore + + +@pytest.mark.asyncio +class TestCrossDomainJoin: + async def test_joined_intersection( + self, pg_engine: AsyncEngine, pg_session: AsyncSession, seeded_both + ): + read_store = PostgresDiscoveryReadStore(pg_session) + tree = And( + operands=[ + Predicate( + field=MetadataFieldRef(field="species"), + op=FilterOperator.EQ, + value="Homo sapiens", + ), + Predicate( + field=FeatureFieldRef(hook="cell_classifier", column="confidence"), + op=FilterOperator.GT, + value=0.9, + ), + ] + ) + results = await read_store.search_records( + filter_expr=tree, + schema_id=SCHEMA_V1, + convention_srn=None, + text_fields=[], + q=None, + sort="published_at", + order=SortOrder.DESC, + cursor=None, + limit=10, + field_types=FIELD_TYPES, + ) + srns = {str(r.srn) for r in results} + assert srns == {"urn:osa:localhost:rec:rec-r1@1"} + + async def test_unknown_hook_raises( + self, pg_engine: AsyncEngine, pg_session: AsyncSession, seeded_both + ): + read_store = PostgresDiscoveryReadStore(pg_session) + tree = Predicate( + field=FeatureFieldRef(hook="does_not_exist", column="anything"), + op=FilterOperator.EQ, + value=1, + ) + with pytest.raises(ValidationError, match="Unknown feature hook"): + await read_store.search_records( + filter_expr=tree, + schema_id=SCHEMA_V1, + convention_srn=None, + text_fields=[], + q=None, + sort="published_at", + order=SortOrder.DESC, + cursor=None, + limit=10, + field_types=FIELD_TYPES, + ) + + +@pytest.fixture +async def seeded_with_missing_feature_row(pg_engine: AsyncEngine, pg_session: AsyncSession): + """Seed a record with a metadata row but NO feature row, so the outer + join produces NULL feature columns for that record.""" + from datetime import UTC, datetime + + from osa.domain.semantics.model.schema import Schema + from osa.infrastructure.persistence.repository.schema import ( + PostgresSemanticsSchemaRepository, + ) + + mstore = PostgresMetadataStore(pg_engine, pg_session) + await mstore.ensure_table(SCHEMA_V1, _metadata_fields()) + + fstore = 
PostgresFeatureStore(pg_engine, pg_session) + await fstore.create_table("cell_classifier", _feature_columns()) + + repo = PostgresSemanticsSchemaRepository(pg_session) + await repo.save( + Schema( + id=SCHEMA_V1, + title="bio_sample", + fields=_metadata_fields(), + created_at=datetime.now(UTC), + ) + ) + + # rec-has-feature: has a feature row with confidence 0.95. + # rec-no-feature: no feature row at all (outer join will produce NULLs). + for rid, sp in [("rec-has-feature", "Homo sapiens"), ("rec-no-feature", "Mus musculus")]: + srn = RecordSRN.parse(f"urn:osa:localhost:rec:{rid}@1") + await seed_record( + pg_engine, + srn=str(srn), + schema_id=SCHEMA_V1.id.root, + schema_version=SCHEMA_V1.version.root, + ) + await mstore.insert(SCHEMA_V1, srn, {"species": sp}) + + has_feature_srn = "urn:osa:localhost:rec:rec-has-feature@1" + await fstore.insert_features("cell_classifier", has_feature_srn, [{"confidence": 0.95}]) + + await pg_session.commit() + + +@pytest.mark.asyncio +class TestOuterJoinNullHandling: + """Records without a feature row must not be silently dropped from NEQ/NOT + predicates on feature columns — the outer join produces NULL, and naive + SQL three-valued logic would exclude them.""" + + async def test_neq_on_feature_column_includes_missing_rows( + self, + pg_engine: AsyncEngine, + pg_session: AsyncSession, + seeded_with_missing_feature_row, + ): + read_store = PostgresDiscoveryReadStore(pg_session) + tree = Predicate( + field=FeatureFieldRef(hook="cell_classifier", column="confidence"), + op=FilterOperator.NEQ, + value=0.95, + ) + results = await read_store.search_records( + filter_expr=tree, + schema_id=SCHEMA_V1, + convention_srn=None, + text_fields=[], + q=None, + sort="published_at", + order=SortOrder.DESC, + cursor=None, + limit=10, + field_types=FIELD_TYPES, + ) + srns = {str(r.srn) for r in results} + # rec-no-feature has no feature row → confidence is NULL → "!= 0.95" + # should include it. rec-has-feature has confidence 0.95 → excluded. + assert srns == {"urn:osa:localhost:rec:rec-no-feature@1"} + + async def test_not_on_feature_column_includes_missing_rows( + self, + pg_engine: AsyncEngine, + pg_session: AsyncSession, + seeded_with_missing_feature_row, + ): + read_store = PostgresDiscoveryReadStore(pg_session) + tree = Not( + operand=Predicate( + field=FeatureFieldRef(hook="cell_classifier", column="confidence"), + op=FilterOperator.EQ, + value=0.95, + ) + ) + results = await read_store.search_records( + filter_expr=tree, + schema_id=SCHEMA_V1, + convention_srn=None, + text_fields=[], + q=None, + sort="published_at", + order=SortOrder.DESC, + cursor=None, + limit=10, + field_types=FIELD_TYPES, + ) + srns = {str(r.srn) for r in results} + # Same invariant as NEQ: NOT(confidence = 0.95) must surface the + # record with a missing feature row. 
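+        # One way the adapter can satisfy this invariant (an assumption about the
+        # compiled SQL, not asserted by this test): emit NULL-aware comparisons,
+        # e.g. features."cell_classifier"."confidence" IS DISTINCT FROM 0.95,
+        # instead of a bare "!=" / NOT(...), so LEFT JOIN rows whose feature
+        # columns are NULL still satisfy the predicate.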
+ assert srns == {"urn:osa:localhost:rec:rec-no-feature@1"} diff --git a/server/tests/integration/test_discovery_records_typed_and.py b/server/tests/integration/test_discovery_records_typed_and.py new file mode 100644 index 0000000..b9487c5 --- /dev/null +++ b/server/tests/integration/test_discovery_records_typed_and.py @@ -0,0 +1,284 @@ +"""Integration tests for /discovery/records with typed-table AND filters.""" + +import pytest +from sqlalchemy.ext.asyncio import AsyncEngine, AsyncSession + +from osa.domain.discovery.model.refs import MetadataFieldRef +from osa.domain.discovery.model.value import And, FilterOperator, Predicate, SortOrder +from osa.domain.semantics.model.value import Cardinality, FieldDefinition, FieldType +from osa.domain.shared.model.srn import RecordSRN, SchemaId +from osa.infrastructure.persistence.adapter.discovery import ( + PostgresDiscoveryReadStore, + PostgresFieldDefinitionReader, +) +from osa.infrastructure.persistence.metadata_store import PostgresMetadataStore + +from tests.integration.conftest import seed_record + +SCHEMA_V1 = SchemaId.parse("bio-sample@1.0.0") + + +def _fields() -> list[FieldDefinition]: + return [ + FieldDefinition( + name="species", + type=FieldType.TEXT, + required=True, + cardinality=Cardinality.EXACTLY_ONE, + ), + FieldDefinition( + name="resolution", + type=FieldType.NUMBER, + required=False, + cardinality=Cardinality.EXACTLY_ONE, + ), + FieldDefinition( + name="method", + type=FieldType.TEXT, + required=False, + cardinality=Cardinality.EXACTLY_ONE, + ), + ] + + +async def _seed_schema_row(session: AsyncSession) -> None: + """Seed the `schemas` row so the discovery field reader can resolve types.""" + from datetime import UTC, datetime + + from osa.domain.semantics.model.schema import Schema + from osa.infrastructure.persistence.repository.schema import ( + PostgresSemanticsSchemaRepository, + ) + + repo = PostgresSemanticsSchemaRepository(session) + await repo.save( + Schema(id=SCHEMA_V1, title="bio_sample", fields=_fields(), created_at=datetime.now(UTC)) + ) + + +async def _publish( + engine: AsyncEngine, + session: AsyncSession, + store: PostgresMetadataStore, + record_srn: RecordSRN, + species: str, + resolution: float, + method: str, +) -> None: + await seed_record( + engine, + srn=str(record_srn), + schema_id=SCHEMA_V1.id.root, + schema_version=SCHEMA_V1.version.root, + metadata={"species": species, "resolution": resolution, "method": method}, + ) + await store.insert( + SCHEMA_V1, + record_srn, + {"species": species, "resolution": resolution, "method": method}, + ) + + +@pytest.mark.asyncio +class TestDiscoveryTypedAnd: + async def test_and_filter_returns_matching_records( + self, pg_engine: AsyncEngine, pg_session: AsyncSession + ): + store = PostgresMetadataStore(pg_engine, pg_session) + await store.ensure_table(SCHEMA_V1, _fields()) + await _seed_schema_row(pg_session) + + rows = [ + ("rec-r1", "Homo sapiens", 3.5, "cryo-EM"), + ("rec-r2", "Homo sapiens", 1.8, "X-ray"), + ("rec-r3", "Mus musculus", 3.0, "cryo-EM"), + ] + for rid, sp, res, meth in rows: + await _publish( + pg_engine, + pg_session, + store, + RecordSRN.parse(f"urn:osa:localhost:rec:{rid}@1"), + sp, + res, + meth, + ) + await pg_session.commit() + + read_store = PostgresDiscoveryReadStore(pg_session) + tree = And( + operands=[ + Predicate( + field=MetadataFieldRef(field="species"), + op=FilterOperator.EQ, + value="Homo sapiens", + ), + Predicate( + field=MetadataFieldRef(field="resolution"), + op=FilterOperator.GTE, + value=2.0, + ), + ] + ) + + results = 
await read_store.search_records( + filter_expr=tree, + schema_id=SCHEMA_V1, + convention_srn=None, + text_fields=[], + q=None, + sort="published_at", + order=SortOrder.DESC, + cursor=None, + limit=10, + field_types={ + "species": FieldType.TEXT, + "resolution": FieldType.NUMBER, + "method": FieldType.TEXT, + }, + ) + + srns = {str(r.srn) for r in results} + assert srns == {"urn:osa:localhost:rec:rec-r1@1"} + + async def test_scalar_op_succeeds_on_unindexed_column( + self, pg_engine: AsyncEngine, pg_session: AsyncSession + ): + """FR-020: scalar ops must NOT be rejected for lack of index.""" + store = PostgresMetadataStore(pg_engine, pg_session) + await store.ensure_table(SCHEMA_V1, _fields()) + await _seed_schema_row(pg_session) + + await _publish( + pg_engine, + pg_session, + store, + RecordSRN.parse("urn:osa:localhost:rec:rec-ra@1"), + "Homo sapiens", + 3.5, + "cryo-EM", + ) + await pg_session.commit() + + read_store = PostgresDiscoveryReadStore(pg_session) + results = await read_store.search_records( + filter_expr=Predicate( + field=MetadataFieldRef(field="method"), + op=FilterOperator.CONTAINS, + value="cryo", + ), + schema_id=SCHEMA_V1, + convention_srn=None, + text_fields=[], + q=None, + sort="published_at", + order=SortOrder.DESC, + cursor=None, + limit=10, + field_types={ + "species": FieldType.TEXT, + "resolution": FieldType.NUMBER, + "method": FieldType.TEXT, + }, + ) + assert len(results) == 1 + + +@pytest.mark.asyncio +class TestUnscopedListing: + """Plain listings without a filter return canonical JSONB metadata. + Metadata-filtered queries require a pinned schema — the typed table is + the only filter path.""" + + async def test_unscoped_predicate_filter_raises_without_schema( + self, pg_engine: AsyncEngine, pg_session: AsyncSession + ): + """Filtering by a metadata field without schema_id must raise — + the JSONB fallback compile path was removed.""" + from osa.domain.discovery.model.refs import MetadataFieldRef + from osa.domain.discovery.model.value import FilterOperator, Predicate + from osa.domain.shared.error import ValidationError + + store = PostgresMetadataStore(pg_engine, pg_session) + await store.ensure_table(SCHEMA_V1, _fields()) + await _seed_schema_row(pg_session) + await _publish( + pg_engine, + pg_session, + store, + RecordSRN.parse("urn:osa:localhost:rec:rec-9x1w@1"), + "Homo sapiens", + 3.5, + "cryo-EM", + ) + await pg_session.commit() + + read_store = PostgresDiscoveryReadStore(pg_session) + with pytest.raises(ValidationError) as exc: + await read_store.search_records( + filter_expr=Predicate( + field=MetadataFieldRef(field="species"), + op=FilterOperator.EQ, + value="Homo sapiens", + ), + schema_id=None, + convention_srn=None, + text_fields=[], + q=None, + sort="published_at", + order=SortOrder.DESC, + cursor=None, + limit=10, + field_types={"species": FieldType.TEXT}, + ) + assert exc.value.code == "schema_required_for_metadata_query" + + async def test_unscoped_listing_returns_jsonb_metadata( + self, pg_engine: AsyncEngine, pg_session: AsyncSession + ): + store = PostgresMetadataStore(pg_engine, pg_session) + await store.ensure_table(SCHEMA_V1, _fields()) + await _seed_schema_row(pg_session) + + await _publish( + pg_engine, + pg_session, + store, + RecordSRN.parse("urn:osa:localhost:rec:rec-unscoped@1"), + "Homo sapiens", + 3.5, + "cryo-EM", + ) + await pg_session.commit() + + read_store = PostgresDiscoveryReadStore(pg_session) + results = await read_store.search_records( + filter_expr=None, + schema_id=None, # deliberately unscoped — exercises the JSONB 
path + convention_srn=None, + text_fields=[], + q=None, + sort="published_at", + order=SortOrder.DESC, + cursor=None, + limit=10, + field_types={}, + ) + assert len(results) == 1 + assert results[0].metadata == { + "species": "Homo sapiens", + "resolution": 3.5, + "method": "cryo-EM", + } + + +@pytest.mark.asyncio +class TestFieldDefinitionReader: + async def test_get_fields_for_schema(self, pg_engine: AsyncEngine, pg_session: AsyncSession): + await _seed_schema_row(pg_session) + await pg_session.commit() + + reader = PostgresFieldDefinitionReader(pg_session) + fields = await reader.get_fields_for_schema(SCHEMA_V1) + assert fields["species"] == FieldType.TEXT + assert fields["resolution"] == FieldType.NUMBER diff --git a/server/tests/integration/test_event_batch_processing.py b/server/tests/integration/test_event_batch_processing.py index 41b026f..e1f6754 100644 --- a/server/tests/integration/test_event_batch_processing.py +++ b/server/tests/integration/test_event_batch_processing.py @@ -64,6 +64,8 @@ def make_record_published( domain=Domain("test.example.com"), id=LocalId(str(uuid4())), ) + from osa.domain.shared.model.srn import SchemaId + return RecordPublished( id=EventId(uuid4()), record_srn=RecordSRN( @@ -73,6 +75,7 @@ def make_record_published( ), source=DepositionSource(id=str(dep_srn)), convention_srn=ConventionSRN.parse("urn:osa:localhost:conv:test@1.0.0"), + schema_id=SchemaId.parse("test@1.0.0"), metadata=metadata or {"title": "Test Record"}, ) diff --git a/server/tests/integration/test_metadata_additive_evolve_postgres.py b/server/tests/integration/test_metadata_additive_evolve_postgres.py new file mode 100644 index 0000000..9b5f59f --- /dev/null +++ b/server/tests/integration/test_metadata_additive_evolve_postgres.py @@ -0,0 +1,116 @@ +"""Integration tests for additive schema evolution end-to-end.""" + +import pytest +from sqlalchemy import text +from sqlalchemy.ext.asyncio import AsyncEngine, AsyncSession + +from osa.domain.metadata.service.metadata import MetadataService +from osa.domain.semantics.model.value import Cardinality, FieldDefinition, FieldType +from osa.domain.shared.model.srn import RecordSRN, SchemaId +from osa.infrastructure.persistence.metadata_store import PostgresMetadataStore +from osa.infrastructure.persistence.metadata_table import METADATA_SCHEMA + +from tests.integration.conftest import seed_record + +SCHEMA_ID = "bio-sample" +SCHEMA_V10 = SchemaId.parse(f"{SCHEMA_ID}@1.0.0") +SCHEMA_V11 = SchemaId.parse(f"{SCHEMA_ID}@1.1.0") + + +def _fields_v10() -> list[FieldDefinition]: + return [ + FieldDefinition( + name="species", + type=FieldType.TEXT, + required=True, + cardinality=Cardinality.EXACTLY_ONE, + ), + ] + + +def _fields_v11() -> list[FieldDefinition]: + return _fields_v10() + [ + FieldDefinition( + name="collection_site", + type=FieldType.TEXT, + required=False, + cardinality=Cardinality.EXACTLY_ONE, + ), + ] + + +@pytest.mark.asyncio +class TestAdditiveEvolvePipeline: + async def test_old_row_null_new_row_typed( + self, pg_engine: AsyncEngine, pg_session: AsyncSession + ): + service = MetadataService(metadata_store=PostgresMetadataStore(pg_engine, pg_session)) + + # Register v1.0.0 and publish a record. 
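+        # Rough DDL the two ensure_table calls are assumed to produce (a sketch,
+        # not asserted here): v1.0.0 creates metadata."bio_sample_v1" with a
+        # species column, and the additive v1.1.0 bump below only appends a
+        # nullable column, roughly
+        #     ALTER TABLE metadata."bio_sample_v1" ADD COLUMN collection_site text
+        # which is why the pre-existing row reads back NULL for it.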
+ await service.ensure_table(SCHEMA_V10, _fields_v10()) + r_old = RecordSRN.parse("urn:osa:localhost:rec:old@1") + await seed_record( + pg_engine, + srn=str(r_old), + schema_id=SCHEMA_V10.id.root, + schema_version=SCHEMA_V10.version.root, + ) + await service.insert(SCHEMA_V10, r_old, {"species": "Mus musculus"}) + await pg_session.commit() + + # Bump to v1.1.0 (additive) and publish another record carrying the new field. + await service.ensure_table(SCHEMA_V11, _fields_v11()) + r_new = RecordSRN.parse("urn:osa:localhost:rec:new@1") + await seed_record( + pg_engine, + srn=str(r_new), + schema_id=SCHEMA_V11.id.root, + schema_version=SCHEMA_V11.version.root, + ) + await service.insert( + SCHEMA_V11, r_new, {"species": "Homo sapiens", "collection_site": "Lab A"} + ) + await pg_session.commit() + + # Old row: NULL in new column. + async with pg_engine.begin() as conn: + old_site = ( + await conn.execute( + text( + f'SELECT collection_site FROM "{METADATA_SCHEMA}"."bio_sample_v1" ' + f"WHERE record_srn = :srn" + ), + {"srn": str(r_old)}, + ) + ).scalar() + new_site = ( + await conn.execute( + text( + f'SELECT collection_site FROM "{METADATA_SCHEMA}"."bio_sample_v1" ' + f"WHERE record_srn = :srn" + ), + {"srn": str(r_new)}, + ) + ).scalar() + assert old_site is None + assert new_site == "Lab A" + + async def test_catalog_lineage_has_both_srns( + self, pg_engine: AsyncEngine, pg_session: AsyncSession + ): + service = MetadataService(metadata_store=PostgresMetadataStore(pg_engine, pg_session)) + await service.ensure_table(SCHEMA_V10, _fields_v10()) + await service.ensure_table(SCHEMA_V11, _fields_v11()) + + async with pg_engine.begin() as conn: + versions = ( + await conn.execute( + text( + "SELECT schema_versions FROM metadata_tables " + "WHERE schema_id = :id AND schema_major = 1" + ), + {"id": SCHEMA_ID}, + ) + ).scalar() + assert str(SCHEMA_V10) in versions + assert str(SCHEMA_V11) in versions diff --git a/server/tests/integration/test_non_additive_rejected_postgres.py b/server/tests/integration/test_non_additive_rejected_postgres.py new file mode 100644 index 0000000..08eb4a6 --- /dev/null +++ b/server/tests/integration/test_non_additive_rejected_postgres.py @@ -0,0 +1,104 @@ +"""Integration tests for non-additive schema evolution rejection (FR-023).""" + +import pytest +from sqlalchemy.ext.asyncio import AsyncEngine, AsyncSession + +from osa.domain.metadata.service.metadata import MetadataService +from osa.domain.semantics.model.value import Cardinality, FieldDefinition, FieldType +from osa.domain.shared.error import ValidationError +from osa.domain.shared.model.srn import SchemaId +from osa.infrastructure.persistence.metadata_store import PostgresMetadataStore + +SCHEMA_ID = "bio-sample" +V1 = SchemaId.parse(f"{SCHEMA_ID}@1.0.0") +V11 = SchemaId.parse(f"{SCHEMA_ID}@1.1.0") + + +def _orig() -> list[FieldDefinition]: + return [ + FieldDefinition( + name="species", + type=FieldType.TEXT, + required=True, + cardinality=Cardinality.EXACTLY_ONE, + ), + FieldDefinition( + name="resolution", + type=FieldType.NUMBER, + required=False, + cardinality=Cardinality.EXACTLY_ONE, + ), + ] + + +@pytest.mark.asyncio +class TestNonAdditiveRejected: + async def test_rename_field_rejected(self, pg_engine: AsyncEngine, pg_session: AsyncSession): + svc = MetadataService(metadata_store=PostgresMetadataStore(pg_engine, pg_session)) + await svc.ensure_table(V1, _orig()) + + # New field "organism" is optional so the validator reaches the removal + # check and reports the dropped "species" field specifically. 
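+        # The rejection below presumably comes from a field-name diff along the
+        # lines of (a sketch, not the validator's actual code):
+        #     removed = {f.name for f in existing_fields} - {f.name for f in new_fields}
+        # so renaming "species" to "organism" surfaces as a removed "species" field.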
+ renamed = [ + FieldDefinition( + name="organism", + type=FieldType.TEXT, + required=False, + cardinality=Cardinality.EXACTLY_ONE, + ), + FieldDefinition( + name="resolution", + type=FieldType.NUMBER, + required=False, + cardinality=Cardinality.EXACTLY_ONE, + ), + ] + with pytest.raises(ValidationError) as exc: + await svc.ensure_table(V11, renamed) + message = str(exc.value) + assert "species" in message and "removed" in message + + async def test_type_change_rejected(self, pg_engine: AsyncEngine, pg_session: AsyncSession): + svc = MetadataService(metadata_store=PostgresMetadataStore(pg_engine, pg_session)) + await svc.ensure_table(V1, _orig()) + + retyped = [ + FieldDefinition( + name="species", + type=FieldType.TEXT, + required=True, + cardinality=Cardinality.EXACTLY_ONE, + ), + FieldDefinition( + name="resolution", + # Previously NUMBER, now TEXT — retype is non-additive. + type=FieldType.TEXT, + required=False, + cardinality=Cardinality.EXACTLY_ONE, + ), + ] + with pytest.raises(ValidationError, match="resolution"): + await svc.ensure_table(V11, retyped) + + async def test_tightening_required_rejected( + self, pg_engine: AsyncEngine, pg_session: AsyncSession + ): + svc = MetadataService(metadata_store=PostgresMetadataStore(pg_engine, pg_session)) + await svc.ensure_table(V1, _orig()) + + tightened = [ + FieldDefinition( + name="species", + type=FieldType.TEXT, + required=True, + cardinality=Cardinality.EXACTLY_ONE, + ), + FieldDefinition( + name="resolution", + type=FieldType.NUMBER, + required=True, # was False + cardinality=Cardinality.EXACTLY_ONE, + ), + ] + with pytest.raises(ValidationError, match="resolution"): + await svc.ensure_table(V11, tightened) diff --git a/server/tests/unit/domain/deposition/test_convention.py b/server/tests/unit/domain/deposition/test_convention.py index b45881d..29bb09b 100644 --- a/server/tests/unit/domain/deposition/test_convention.py +++ b/server/tests/unit/domain/deposition/test_convention.py @@ -4,15 +4,15 @@ from osa.domain.deposition.model.convention import Convention from osa.domain.deposition.model.value import FileRequirements -from osa.domain.shared.model.srn import ConventionSRN, SchemaSRN +from osa.domain.shared.model.srn import ConventionSRN, SchemaId def _make_conv_srn(id: str = "test-conv", version: str = "1.0.0") -> ConventionSRN: return ConventionSRN.parse(f"urn:osa:localhost:conv:{id}@{version}") -def _make_schema_srn(id: str = "test-schema", version: str = "1.0.0") -> SchemaSRN: - return SchemaSRN.parse(f"urn:osa:localhost:schema:{id}@{version}") +def _make_schema_id(id: str = "test-schema", version: str = "1.0.0") -> SchemaId: + return SchemaId.parse(f"{id}@{version}") def _make_file_reqs() -> FileRequirements: @@ -29,12 +29,12 @@ def test_create_with_required_fields(self): conv = Convention( srn=_make_conv_srn(), title="scRNA-seq Submission", - schema_srn=_make_schema_srn(), + schema_id=_make_schema_id(), file_requirements=_make_file_reqs(), created_at=datetime.now(UTC), ) assert conv.title == "scRNA-seq Submission" - assert conv.schema_srn == _make_schema_srn() + assert conv.schema_id == _make_schema_id() assert conv.file_requirements.max_count == 5 def test_create_with_description(self): @@ -42,7 +42,7 @@ def test_create_with_description(self): srn=_make_conv_srn(), title="Test", description="A test convention", - schema_srn=_make_schema_srn(), + schema_id=_make_schema_id(), file_requirements=_make_file_reqs(), created_at=datetime.now(UTC), ) @@ -52,7 +52,7 @@ def test_create_with_empty_hooks(self): conv = Convention( 
srn=_make_conv_srn(), title="Test", - schema_srn=_make_schema_srn(), + schema_id=_make_schema_id(), file_requirements=_make_file_reqs(), hooks=[], created_at=datetime.now(UTC), @@ -65,7 +65,7 @@ def test_srn_is_versioned(self): conv = Convention( srn=_make_conv_srn("my-conv", "2.0.0"), title="Test", - schema_srn=_make_schema_srn(), + schema_id=_make_schema_id(), file_requirements=_make_file_reqs(), created_at=datetime.now(UTC), ) diff --git a/server/tests/unit/domain/deposition/test_convention_registered.py b/server/tests/unit/domain/deposition/test_convention_registered.py index c97a4e9..71352e0 100644 --- a/server/tests/unit/domain/deposition/test_convention_registered.py +++ b/server/tests/unit/domain/deposition/test_convention_registered.py @@ -14,13 +14,17 @@ OciConfig, TableFeatureSpec, ) -from osa.domain.shared.model.srn import ConventionSRN +from osa.domain.shared.model.srn import ConventionSRN, SchemaId def _make_conv_srn() -> ConventionSRN: return ConventionSRN.parse("urn:osa:localhost:conv:test@1.0.0") +def _make_schema_id() -> SchemaId: + return SchemaId.parse("test@1.0.0") + + def _make_hook_definition(name: str = "pocket_detect") -> HookDefinition: return HookDefinition( name=name, @@ -42,6 +46,8 @@ def test_event_carries_hooks(self): event = ConventionRegistered( id=EventId(uuid4()), convention_srn=_make_conv_srn(), + schema_id=_make_schema_id(), + schema_fields=[], hooks=hooks, ) @@ -54,6 +60,7 @@ def test_event_defaults_to_empty_hooks(self): event = ConventionRegistered( id=EventId(uuid4()), convention_srn=_make_conv_srn(), + schema_id=_make_schema_id(), ) assert event.hooks == [] @@ -64,6 +71,8 @@ def test_serialization_with_hooks(self): event = ConventionRegistered( id=EventId(uuid4()), convention_srn=_make_conv_srn(), + schema_id=_make_schema_id(), + schema_fields=[], hooks=hooks, ) diff --git a/server/tests/unit/domain/deposition/test_convention_service.py b/server/tests/unit/domain/deposition/test_convention_service.py index 7a765d9..59ea3fc 100644 --- a/server/tests/unit/domain/deposition/test_convention_service.py +++ b/server/tests/unit/domain/deposition/test_convention_service.py @@ -16,15 +16,15 @@ OciConfig, TableFeatureSpec, ) -from osa.domain.shared.model.srn import ConventionSRN, Domain, SchemaSRN +from osa.domain.shared.model.srn import ConventionSRN, Domain, SchemaId, SchemaIdentifier def _make_conv_srn(id: str = "test-conv", version: str = "1.0.0") -> ConventionSRN: return ConventionSRN.parse(f"urn:osa:localhost:conv:{id}@{version}") -def _make_schema_srn(id: str = "test-schema", version: str = "1.0.0") -> SchemaSRN: - return SchemaSRN.parse(f"urn:osa:localhost:schema:{id}@{version}") +def _make_schema_id(id: str = "test-schema", version: str = "1.0.0") -> SchemaId: + return SchemaId.parse(f"{id}@{version}") def _make_field_defs() -> list[FieldDefinition]: @@ -71,12 +71,14 @@ def _make_service( mock_schema_service = schema_service or AsyncMock() if not schema_service: mock_schema = AsyncMock() - mock_schema.srn = _make_schema_srn() + mock_schema.id = _make_schema_id() + mock_schema.fields = [] mock_schema_service.create_schema.return_value = mock_schema return ConventionService( convention_repo=conv_repo or AsyncMock(), schema_service=mock_schema_service, + metadata_service=AsyncMock(), outbox=outbox or AsyncMock(), node_domain=Domain("localhost"), ) @@ -88,11 +90,13 @@ async def test_create_convention_creates_schema(self): conv_repo = AsyncMock() schema_service = AsyncMock() mock_schema = AsyncMock() - mock_schema.srn = _make_schema_srn() + mock_schema.id 
= _make_schema_id() + mock_schema.fields = [] schema_service.create_schema.return_value = mock_schema service = _make_service(conv_repo, schema_service) result = await service.create_convention( + id=SchemaIdentifier("test-schema"), title="Test Convention", version="1.0.0", schema=_make_field_defs(), @@ -106,6 +110,7 @@ async def test_create_convention_creates_schema(self): async def test_create_convention_generates_srn(self): service = _make_service() result = await service.create_convention( + id=SchemaIdentifier("test-schema"), title="Test", version="1.0.0", schema=_make_field_defs(), @@ -119,6 +124,7 @@ async def test_create_convention_with_hooks_emits_hooks_in_event(self): service = _make_service(outbox=outbox) hooks = [_make_hook_def()] result = await service.create_convention( + id=SchemaIdentifier("test-schema"), title="With Hooks", version="1.0.0", schema=_make_field_defs(), @@ -136,6 +142,7 @@ async def test_create_convention_without_hooks_emits_empty_hooks(self): outbox = AsyncMock() service = _make_service(outbox=outbox) await service.create_convention( + id=SchemaIdentifier("test-schema"), title="No Hooks", version="1.0.0", schema=_make_field_defs(), @@ -151,7 +158,7 @@ async def test_get_existing(self): conv = Convention( srn=_make_conv_srn(), title="Test", - schema_srn=_make_schema_srn(), + schema_id=_make_schema_id(), file_requirements=_make_file_reqs(), created_at=datetime.now(UTC), ) @@ -178,7 +185,7 @@ async def test_list_conventions(self): conv = Convention( srn=_make_conv_srn(), title="Test", - schema_srn=_make_schema_srn(), + schema_id=_make_schema_id(), file_requirements=_make_file_reqs(), created_at=datetime.now(UTC), ) diff --git a/server/tests/unit/domain/deposition/test_convention_service_v2.py b/server/tests/unit/domain/deposition/test_convention_service_v2.py index 9663e22..1e8557e 100644 --- a/server/tests/unit/domain/deposition/test_convention_service_v2.py +++ b/server/tests/unit/domain/deposition/test_convention_service_v2.py @@ -15,7 +15,7 @@ TableFeatureSpec, ) from osa.domain.shared.model.source import IngesterDefinition -from osa.domain.shared.model.srn import Domain, SchemaSRN +from osa.domain.shared.model.srn import Domain, SchemaId, SchemaIdentifier def _make_field_defs() -> list[FieldDefinition]: @@ -78,12 +78,13 @@ def _make_service( # Default: create_schema returns a Schema-like obj with .srn if not schema_service: mock_schema = AsyncMock() - mock_schema.srn = SchemaSRN.parse("urn:osa:localhost:schema:testschema12345678@1.0.0") + mock_schema.id = SchemaId.parse("testschema12345678@1.0.0") mock_schema_service.create_schema.return_value = mock_schema return ConventionService( convention_repo=conv_repo or AsyncMock(), schema_service=mock_schema_service, + metadata_service=AsyncMock(), outbox=outbox or AsyncMock(), node_domain=Domain("localhost"), ) @@ -94,11 +95,12 @@ class TestCreateConventionWithInlineSchema: async def test_creates_schema_from_field_definitions(self): schema_service = AsyncMock() mock_schema = AsyncMock() - mock_schema.srn = SchemaSRN.parse("urn:osa:localhost:schema:testschema12345678@1.0.0") + mock_schema.id = SchemaId.parse("testschema12345678@1.0.0") schema_service.create_schema.return_value = mock_schema service = _make_service(schema_service=schema_service) await service.create_convention( + id=SchemaIdentifier("test-schema"), title="PDB Structures", version="1.0.0", schema=_make_field_defs(), @@ -112,27 +114,29 @@ async def test_creates_schema_from_field_definitions(self): assert len(call_kwargs[1]["fields"]) == 2 
@pytest.mark.asyncio - async def test_convention_references_created_schema_srn(self): + async def test_convention_references_created_schema_id(self): schema_service = AsyncMock() - schema_srn = SchemaSRN.parse("urn:osa:localhost:schema:created123456789@1.0.0") + schema_id = SchemaId.parse("created123456789@1.0.0") mock_schema = AsyncMock() - mock_schema.srn = schema_srn + mock_schema.id = schema_id schema_service.create_schema.return_value = mock_schema service = _make_service(schema_service=schema_service) result = await service.create_convention( + id=SchemaIdentifier("test-schema"), title="Test", version="1.0.0", schema=_make_field_defs(), file_requirements=_make_file_reqs(), ) - assert result.schema_srn == schema_srn + assert result.schema_id == schema_id @pytest.mark.asyncio async def test_convention_saves_ingester_definition(self): service = _make_service() ingester = _make_ingester_def() result = await service.create_convention( + id=SchemaIdentifier("test-schema"), title="With Ingester", version="1.0.0", schema=_make_field_defs(), @@ -148,6 +152,7 @@ async def test_convention_saves_ingester_definition(self): async def test_convention_ingester_defaults_to_none(self): service = _make_service() result = await service.create_convention( + id=SchemaIdentifier("test-schema"), title="No Ingester", version="1.0.0", schema=_make_field_defs(), @@ -161,6 +166,7 @@ async def test_convention_with_hooks_emits_hooks_in_event(self): service = _make_service(outbox=outbox) hooks = [_make_hook_def()] await service.create_convention( + id=SchemaIdentifier("test-schema"), title="With Hooks", version="1.0.0", schema=_make_field_defs(), @@ -178,6 +184,7 @@ async def test_create_convention_emits_convention_registered(self): outbox = AsyncMock() service = _make_service(outbox=outbox) result = await service.create_convention( + id=SchemaIdentifier("test-schema"), title="With Source", version="1.0.0", schema=_make_field_defs(), @@ -194,6 +201,7 @@ async def test_create_convention_without_source_still_emits_event(self): outbox = AsyncMock() service = _make_service(outbox=outbox) result = await service.create_convention( + id=SchemaIdentifier("test-schema"), title="No Source", version="1.0.0", schema=_make_field_defs(), diff --git a/server/tests/unit/domain/deposition/test_deposition_service.py b/server/tests/unit/domain/deposition/test_deposition_service.py index 69a5494..53aeba3 100644 --- a/server/tests/unit/domain/deposition/test_deposition_service.py +++ b/server/tests/unit/domain/deposition/test_deposition_service.py @@ -21,7 +21,7 @@ from osa.domain.deposition.event.submitted import DepositionSubmittedEvent from osa.domain.deposition.service.deposition import DepositionService from osa.domain.shared.error import NotFoundError, ValidationError -from osa.domain.shared.model.srn import ConventionSRN, DepositionSRN, Domain, SchemaSRN +from osa.domain.shared.model.srn import ConventionSRN, DepositionSRN, Domain, SchemaId def _make_dep_srn(id: str = "test-dep") -> DepositionSRN: @@ -32,8 +32,8 @@ def _make_conv_srn(id: str = "test-conv", version: str = "1.0.0") -> ConventionS return ConventionSRN.parse(f"urn:osa:localhost:conv:{id}@{version}") -def _make_schema_srn(id: str = "test-schema", version: str = "1.0.0") -> SchemaSRN: - return SchemaSRN.parse(f"urn:osa:localhost:schema:{id}@{version}") +def _make_schema_id(id: str = "test-schema", version: str = "1.0.0") -> SchemaId: + return SchemaId.parse(f"{id}@{version}") def _make_file_reqs(**overrides) -> FileRequirements: @@ -51,7 +51,7 @@ def 
_make_convention(**overrides) -> Convention: defaults = dict( srn=_make_conv_srn(), title="Test Convention", - schema_srn=_make_schema_srn(), + schema_id=_make_schema_id(), file_requirements=_make_file_reqs(), created_at=datetime.now(UTC), ) diff --git a/server/tests/unit/domain/deposition/test_event_chain.py b/server/tests/unit/domain/deposition/test_event_chain.py index 3a6243a..31ac8a1 100644 --- a/server/tests/unit/domain/deposition/test_event_chain.py +++ b/server/tests/unit/domain/deposition/test_event_chain.py @@ -26,6 +26,7 @@ ConventionSRN, DepositionSRN, RecordSRN, + SchemaId, ValidationRunSRN, ) from osa.domain.validation.event.validation_completed import ValidationCompleted @@ -33,6 +34,10 @@ from osa.domain.validation.model import RunStatus +def _make_schema_id() -> SchemaId: + return SchemaId.parse("test@1.0.0") + + def _make_dep_srn() -> DepositionSRN: return DepositionSRN.parse("urn:osa:localhost:dep:test-dep") @@ -263,6 +268,7 @@ async def test_delegates_to_feature_service(self): source=DepositionSource(id=str(_make_dep_srn())), metadata={"title": "Test"}, convention_srn=_make_conv_srn(), + schema_id=_make_schema_id(), expected_features=["pocket_detect"], ) await handler.handle(event) diff --git a/server/tests/unit/domain/deposition/test_spreadsheet.py b/server/tests/unit/domain/deposition/test_spreadsheet.py index 5a077de..30ace17 100644 --- a/server/tests/unit/domain/deposition/test_spreadsheet.py +++ b/server/tests/unit/domain/deposition/test_spreadsheet.py @@ -12,12 +12,12 @@ FieldType, TermConstraints, ) -from osa.domain.shared.model.srn import OntologySRN, SchemaSRN +from osa.domain.shared.model.srn import OntologySRN, SchemaId from osa.infrastructure.persistence.adapter.spreadsheet import OpenpyxlSpreadsheetAdapter -def _make_schema_srn() -> SchemaSRN: - return SchemaSRN.parse("urn:osa:localhost:schema:test@1.0.0") +def _make_schema_id() -> SchemaId: + return SchemaId.parse("test@1.0.0") def _make_ontology_srn() -> OntologySRN: @@ -49,7 +49,7 @@ def _make_term_field( def _make_schema(fields: list[FieldDefinition] | None = None) -> Schema: return Schema( - srn=_make_schema_srn(), + id=_make_schema_id(), title="Test Schema", fields=fields or [_make_text_field()], created_at=datetime.now(UTC), diff --git a/server/tests/unit/domain/discovery/test_discovery_service.py b/server/tests/unit/domain/discovery/test_discovery_service.py index c38a79d..b68dbba 100644 --- a/server/tests/unit/domain/discovery/test_discovery_service.py +++ b/server/tests/unit/domain/discovery/test_discovery_service.py @@ -1,23 +1,40 @@ -"""Tests for DiscoveryService — filter validation, operator validation, delegation.""" +"""Tests for DiscoveryService — FilterExpr validation, operator validation, delegation.""" from datetime import UTC, datetime from unittest.mock import AsyncMock import pytest +from osa.config import Config +from osa.domain.discovery.model.refs import MetadataFieldRef from osa.domain.discovery.model.value import ( + And, ColumnInfo, FeatureCatalogEntry, FeatureRow, - Filter, FilterOperator, + Predicate, RecordSummary, SortOrder, + decode_cursor, + encode_cursor, ) from osa.domain.discovery.service.discovery import DiscoveryService from osa.domain.semantics.model.value import FieldType -from osa.domain.shared.error import ValidationError -from osa.domain.shared.model.srn import RecordSRN +from osa.domain.shared.error import NotFoundError, ValidationError +from osa.domain.shared.model.srn import RecordSRN, SchemaId + + +SCHEMA_SRN = SchemaId.parse("bio-sample@1.0.0") + + +def 
_config() -> Config: + # Build a Config with minimal auth — tests don't hit JWT paths + import os + + os.environ.setdefault("OSA_AUTH__JWT__SECRET", "a" * 64) # Test-only secret + os.environ.setdefault("OSA_BASE_URL", "http://localhost:8000") + return Config() # type: ignore[call-arg] @pytest.fixture @@ -38,19 +55,37 @@ def mock_field_reader() -> AsyncMock: "is_public": FieldType.BOOLEAN, "homepage": FieldType.URL, } + reader.get_fields_for_schema.return_value = { + "title": FieldType.TEXT, + "resolution": FieldType.NUMBER, + "method": FieldType.TERM, + "published_date": FieldType.DATE, + "is_public": FieldType.BOOLEAN, + "homepage": FieldType.URL, + } return reader @pytest.fixture def service(mock_read_store: AsyncMock, mock_field_reader: AsyncMock) -> DiscoveryService: - return DiscoveryService(read_store=mock_read_store, field_reader=mock_field_reader) + return DiscoveryService( + read_store=mock_read_store, + field_reader=mock_field_reader, + config=_config(), + ) + + +def _eq(field: str, value: object) -> Predicate: + return Predicate(field=MetadataFieldRef(field=field), op=FilterOperator.EQ, value=value) class TestSearchRecordsValidation: async def test_rejects_unknown_filter_field(self, service: DiscoveryService) -> None: - with pytest.raises(ValidationError, match="Unknown field 'bogus'"): + with pytest.raises(ValidationError, match="Unknown metadata field 'bogus'"): await service.search_records( - filters=[Filter(field="bogus", operator=FilterOperator.EQ, value="x")], + filter_expr=_eq("bogus", "x"), + schema_id=SCHEMA_SRN, + convention_srn=None, q=None, sort="published_at", order=SortOrder.DESC, @@ -59,9 +94,15 @@ async def test_rejects_unknown_filter_field(self, service: DiscoveryService) -> ) async def test_rejects_invalid_operator_for_type(self, service: DiscoveryService) -> None: - with pytest.raises(ValidationError, match="contains"): + with pytest.raises(ValidationError, match="not valid"): await service.search_records( - filters=[Filter(field="resolution", operator=FilterOperator.CONTAINS, value="x")], + filter_expr=Predicate( + field=MetadataFieldRef(field="resolution"), + op=FilterOperator.CONTAINS, + value="x", + ), + schema_id=SCHEMA_SRN, + convention_srn=None, q=None, sort="published_at", order=SortOrder.DESC, @@ -72,7 +113,9 @@ async def test_rejects_invalid_operator_for_type(self, service: DiscoveryService async def test_rejects_unknown_sort_field(self, service: DiscoveryService) -> None: with pytest.raises(ValidationError, match="Unknown sort field"): await service.search_records( - filters=[], + filter_expr=None, + schema_id=SCHEMA_SRN, + convention_srn=None, q=None, sort="nonexistent", order=SortOrder.DESC, @@ -82,7 +125,9 @@ async def test_rejects_unknown_sort_field(self, service: DiscoveryService) -> No async def test_accepts_published_at_sort(self, service: DiscoveryService) -> None: result = await service.search_records( - filters=[], + filter_expr=None, + schema_id=SCHEMA_SRN, + convention_srn=None, q=None, sort="published_at", order=SortOrder.DESC, @@ -93,7 +138,9 @@ async def test_accepts_published_at_sort(self, service: DiscoveryService) -> Non async def test_accepts_metadata_field_sort(self, service: DiscoveryService) -> None: result = await service.search_records( - filters=[], + filter_expr=None, + schema_id=SCHEMA_SRN, + convention_srn=None, q=None, sort="resolution", order=SortOrder.ASC, @@ -105,7 +152,9 @@ async def test_accepts_metadata_field_sort(self, service: DiscoveryService) -> N async def test_rejects_limit_too_low(self, service: 
DiscoveryService) -> None: with pytest.raises(ValidationError, match="limit"): await service.search_records( - filters=[], + filter_expr=None, + schema_id=SCHEMA_SRN, + convention_srn=None, q=None, sort="published_at", order=SortOrder.DESC, @@ -116,7 +165,9 @@ async def test_rejects_limit_too_low(self, service: DiscoveryService) -> None: async def test_rejects_limit_too_high(self, service: DiscoveryService) -> None: with pytest.raises(ValidationError, match="limit"): await service.search_records( - filters=[], + filter_expr=None, + schema_id=SCHEMA_SRN, + convention_srn=None, q=None, sort="published_at", order=SortOrder.DESC, @@ -124,17 +175,75 @@ async def test_rejects_limit_too_high(self, service: DiscoveryService) -> None: limit=101, ) + async def test_raises_not_found_for_unknown_schema(self, mock_read_store: AsyncMock) -> None: + """Pinning an unregistered schema must raise NotFoundError, not silently + fall through to an unscoped query that returns cross-schema records.""" + empty_reader = AsyncMock() + empty_reader.get_fields_for_schema.return_value = {} + svc = DiscoveryService( + read_store=mock_read_store, + field_reader=empty_reader, + config=_config(), + ) + + with pytest.raises(NotFoundError, match="Schema not found"): + await svc.search_records( + filter_expr=None, + schema_id=SCHEMA_SRN, + convention_srn=None, + q=None, + sort="published_at", + order=SortOrder.DESC, + cursor=None, + limit=20, + ) + mock_read_store.search_records.assert_not_called() + + async def test_search_features_raises_not_found_for_unknown_schema( + self, mock_read_store: AsyncMock + ) -> None: + """search_features must also guard against unknown schema pins.""" + mock_read_store.get_feature_table_schema.return_value = FeatureCatalogEntry( + hook_name="detect_pockets", + columns=[ColumnInfo(name="score", type="number", required=False)], + record_count=0, + ) + empty_reader = AsyncMock() + empty_reader.get_fields_for_schema.return_value = {} + svc = DiscoveryService( + read_store=mock_read_store, + field_reader=empty_reader, + config=_config(), + ) + + with pytest.raises(NotFoundError, match="Schema not found"): + await svc.search_features( + hook_name="detect_pockets", + filter_expr=None, + schema_id=SCHEMA_SRN, + record_srn=None, + sort="id", + order=SortOrder.DESC, + cursor=None, + limit=20, + ) + mock_read_store.search_features.assert_not_called() + async def test_rejects_q_when_no_text_fields(self, mock_read_store: AsyncMock) -> None: - """q should raise when no TEXT/URL fields exist to search against.""" no_text_reader = AsyncMock() - no_text_reader.get_all_field_types.return_value = { - "resolution": FieldType.NUMBER, - } - svc = DiscoveryService(read_store=mock_read_store, field_reader=no_text_reader) + no_text_reader.get_all_field_types.return_value = {"resolution": FieldType.NUMBER} + no_text_reader.get_fields_for_schema.return_value = {"resolution": FieldType.NUMBER} + svc = DiscoveryService( + read_store=mock_read_store, + field_reader=no_text_reader, + config=_config(), + ) with pytest.raises(ValidationError, match="Free-text search is unavailable"): await svc.search_records( - filters=[], + filter_expr=None, + schema_id=SCHEMA_SRN, + convention_srn=None, q="kinase", sort="published_at", order=SortOrder.DESC, @@ -148,7 +257,9 @@ async def test_delegates_to_read_store( self, service: DiscoveryService, mock_read_store: AsyncMock ) -> None: await service.search_records( - filters=[Filter(field="method", operator=FilterOperator.EQ, value="X-ray")], + filter_expr=_eq("method", "X-ray"), + 
schema_id=SCHEMA_SRN, + convention_srn=None, q=None, sort="published_at", order=SortOrder.DESC, @@ -158,38 +269,19 @@ async def test_delegates_to_read_store( mock_read_store.search_records.assert_called_once() call_kwargs = mock_read_store.search_records.call_args - assert len(call_kwargs.kwargs["filters"]) == 1 + assert call_kwargs.kwargs["filter_expr"] is not None assert call_kwargs.kwargs["q"] is None assert call_kwargs.kwargs["sort"] == "published_at" - assert call_kwargs.kwargs["limit"] == 21 # N+1 trick - - async def test_extracts_text_fields_for_q( - self, service: DiscoveryService, mock_read_store: AsyncMock - ) -> None: - await service.search_records( - filters=[], - q="kinase", - sort="published_at", - order=SortOrder.DESC, - cursor=None, - limit=20, - ) - - call_kwargs = mock_read_store.search_records.call_args - text_fields = call_kwargs.kwargs["text_fields"] - # title (TEXT) and homepage (URL) are text-searchable - assert "title" in text_fields - assert "homepage" in text_fields - assert "resolution" not in text_fields + assert call_kwargs.kwargs["limit"] == 21 # N+1 async def test_decodes_cursor( self, service: DiscoveryService, mock_read_store: AsyncMock ) -> None: - from osa.domain.discovery.model.value import encode_cursor - cursor = encode_cursor("2026-01-01", "urn:osa:localhost:rec:abc@1") await service.search_records( - filters=[], + filter_expr=None, + schema_id=SCHEMA_SRN, + convention_srn=None, q=None, sort="published_at", order=SortOrder.DESC, @@ -205,7 +297,9 @@ async def test_decodes_cursor( async def test_invalid_cursor_raises(self, service: DiscoveryService) -> None: with pytest.raises(ValidationError, match="cursor"): await service.search_records( - filters=[], + filter_expr=None, + schema_id=SCHEMA_SRN, + convention_srn=None, q=None, sort="published_at", order=SortOrder.DESC, @@ -218,13 +312,14 @@ async def test_encodes_next_cursor_from_results( ) -> None: srn = RecordSRN.parse("urn:osa:localhost:rec:abc@1") ts = datetime(2026, 1, 1, tzinfo=UTC) - # Return limit+1 rows so the service detects has_more=True mock_read_store.search_records.return_value = [ RecordSummary(srn=srn, published_at=ts, metadata={"title": f"r{i}"}) for i in range(2) ] result = await service.search_records( - filters=[], + filter_expr=None, + schema_id=SCHEMA_SRN, + convention_srn=None, q=None, sort="published_at", order=SortOrder.DESC, @@ -234,10 +329,7 @@ async def test_encodes_next_cursor_from_results( assert result.has_more is True assert result.cursor is not None - assert len(result.results) == 1 # trimmed back to limit - - from osa.domain.discovery.model.value import decode_cursor - + assert len(result.results) == 1 decoded = decode_cursor(result.cursor) assert decoded["id"] == str(srn) @@ -247,7 +339,9 @@ async def test_no_cursor_when_no_more_results( mock_read_store.search_records.return_value = [] result = await service.search_records( - filters=[], + filter_expr=None, + schema_id=SCHEMA_SRN, + convention_srn=None, q=None, sort="published_at", order=SortOrder.DESC, @@ -259,111 +353,114 @@ async def test_no_cursor_when_no_more_results( assert result.has_more is False -class TestSearchRecordsPagination: - async def test_has_more_false_when_exactly_limit_rows( - self, service: DiscoveryService, mock_read_store: AsyncMock - ) -> None: - """Exactly limit rows should NOT report has_more (no false positive).""" - srn = RecordSRN.parse("urn:osa:localhost:rec:abc@1") - ts = datetime(2026, 1, 1, tzinfo=UTC) - mock_read_store.search_records.return_value = [ - RecordSummary(srn=srn, 
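The cursor assertions in this area lean on the `encode_cursor`/`decode_cursor` pair. A small round-trip sketch; the `"s"` (sort value) and `"id"` keys and the argument order are taken from the assertions in these tests, the rest is inference:

    from osa.domain.discovery.model.value import decode_cursor, encode_cursor

    cursor = encode_cursor("2026-01-01", "urn:osa:localhost:rec:abc@1")  # (sort value, tie-breaking id)
    decoded = decode_cursor(cursor)
    assert decoded["s"] == "2026-01-01"
    assert decoded["id"] == "urn:osa:localhost:rec:abc@1"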
published_at=ts, metadata={"title": f"r{i}"}) for i in range(3) - ] - - result = await service.search_records( - filters=[], - q=None, - sort="published_at", - order=SortOrder.DESC, - cursor=None, - limit=3, - ) - - assert result.has_more is False - assert result.cursor is None - assert len(result.results) == 3 +class TestSchemaRequiredGuards: + """With the JSONB filter fallback removed, any query that resolves against + metadata fields must pin a schema.""" - async def test_has_more_true_when_more_than_limit_rows( - self, service: DiscoveryService, mock_read_store: AsyncMock + async def test_metadata_predicate_without_schema_raises( + self, service: DiscoveryService ) -> None: - """Adapter returning limit+1 rows signals more pages exist.""" - srn = RecordSRN.parse("urn:osa:localhost:rec:abc@1") - ts = datetime(2026, 1, 1, tzinfo=UTC) - mock_read_store.search_records.return_value = [ - RecordSummary(srn=srn, published_at=ts, metadata={"title": f"r{i}"}) for i in range(4) - ] + with pytest.raises(ValidationError) as exc: + await service.search_records( + filter_expr=_eq("title", "x"), + schema_id=None, + convention_srn=None, + q=None, + sort="published_at", + order=SortOrder.DESC, + cursor=None, + limit=20, + ) + assert exc.value.code == "schema_required_for_metadata_query" - result = await service.search_records( - filters=[], - q=None, - sort="published_at", - order=SortOrder.DESC, - cursor=None, - limit=3, - ) + async def test_non_default_sort_without_schema_raises(self, service: DiscoveryService) -> None: + with pytest.raises(ValidationError) as exc: + await service.search_records( + filter_expr=None, + schema_id=None, + convention_srn=None, + q=None, + sort="resolution", + order=SortOrder.DESC, + cursor=None, + limit=20, + ) + assert exc.value.code == "schema_required_for_metadata_sort" - assert result.has_more is True - assert result.cursor is not None - assert len(result.results) == 3 # trimmed back to limit + async def test_q_without_schema_raises(self, service: DiscoveryService) -> None: + with pytest.raises(ValidationError) as exc: + await service.search_records( + filter_expr=None, + schema_id=None, + convention_srn=None, + q="kinase", + sort="published_at", + order=SortOrder.DESC, + cursor=None, + limit=20, + ) + assert exc.value.code == "schema_required_for_free_text_search" - async def test_passes_limit_plus_one_to_read_store( - self, service: DiscoveryService, mock_read_store: AsyncMock - ) -> None: - """Service should fetch one extra row to detect more pages.""" - await service.search_records( - filters=[], + async def test_plain_listing_without_schema_succeeds(self, service: DiscoveryService) -> None: + """No filter, default sort, no q → unscoped listing is allowed.""" + result = await service.search_records( + filter_expr=None, + schema_id=None, + convention_srn=None, q=None, sort="published_at", order=SortOrder.DESC, cursor=None, limit=20, ) + assert result.results == [] - call_kwargs = mock_read_store.search_records.call_args - assert call_kwargs.kwargs["limit"] == 21 +class TestFilterBounds: + async def test_depth_exceeded_raises(self, service: DiscoveryService) -> None: + # Build a nest of AND that exceeds the default depth (10) + leaf = _eq("title", "r") + tree = leaf + for _ in range(11): + tree = And(operands=[tree, leaf]) -class TestSearchRecordsFieldTypes: - async def test_passes_field_types_to_read_store( - self, service: DiscoveryService, mock_read_store: AsyncMock - ) -> None: - await service.search_records( - filters=[], - q=None, - sort="published_at", - 
order=SortOrder.DESC, - cursor=None, - limit=20, - ) - - call_kwargs = mock_read_store.search_records.call_args - field_types = call_kwargs.kwargs["field_types"] - assert field_types["resolution"] == FieldType.NUMBER - assert field_types["title"] == FieldType.TEXT + with pytest.raises(ValidationError, match="filter_depth_exceeded|depth"): + await service.search_records( + filter_expr=tree, + schema_id=SCHEMA_SRN, + convention_srn=None, + q=None, + sort="published_at", + order=SortOrder.DESC, + cursor=None, + limit=20, + ) class TestFeatureCursorEncoding: async def test_cursor_encodes_row_id( self, mock_read_store: AsyncMock, mock_field_reader: AsyncMock ) -> None: - from osa.domain.discovery.model.value import decode_cursor - srn = RecordSRN.parse("urn:osa:localhost:rec:abc@1") mock_read_store.get_feature_table_schema.return_value = FeatureCatalogEntry( hook_name="detect_pockets", columns=[ColumnInfo(name="score", type="number", required=True)], record_count=0, ) - # Return limit+1 rows so the service detects has_more=True mock_read_store.search_features.return_value = [ FeatureRow(row_id=42, record_srn=srn, data={"score": 7.66}), FeatureRow(row_id=43, record_srn=srn, data={"score": 6.0}), ] - service = DiscoveryService(read_store=mock_read_store, field_reader=mock_field_reader) + service = DiscoveryService( + read_store=mock_read_store, + field_reader=mock_field_reader, + config=_config(), + ) result = await service.search_features( hook_name="detect_pockets", - filters=[], + filter_expr=None, + schema_id=None, record_srn=None, sort="score", order=SortOrder.DESC, @@ -377,39 +474,3 @@ async def test_cursor_encodes_row_id( decoded = decode_cursor(result.cursor) assert decoded["id"] == 42 assert decoded["s"] == 7.66 - - async def test_cursor_uses_row_id_for_id_sort( - self, mock_read_store: AsyncMock, mock_field_reader: AsyncMock - ) -> None: - from osa.domain.discovery.model.value import decode_cursor - - srn = RecordSRN.parse("urn:osa:localhost:rec:abc@1") - mock_read_store.get_feature_table_schema.return_value = FeatureCatalogEntry( - hook_name="detect_pockets", - columns=[ColumnInfo(name="score", type="number", required=True)], - record_count=0, - ) - # Return limit+1 rows so the service detects has_more=True - mock_read_store.search_features.return_value = [ - FeatureRow(row_id=99, record_srn=srn, data={"score": 5.0}), - FeatureRow(row_id=98, record_srn=srn, data={"score": 4.0}), - ] - - service = DiscoveryService(read_store=mock_read_store, field_reader=mock_field_reader) - result = await service.search_features( - hook_name="detect_pockets", - filters=[], - record_srn=None, - sort="id", - order=SortOrder.DESC, - cursor=None, - limit=1, - ) - - assert result.has_more is True - assert result.cursor is not None - assert len(result.rows) == 1 - decoded = decode_cursor(result.cursor) - # When sort is "id", sort_val should be the row_id itself - assert decoded["s"] == 99 - assert decoded["id"] == 99 diff --git a/server/tests/unit/domain/discovery/test_get_feature_catalog.py b/server/tests/unit/domain/discovery/test_get_feature_catalog.py index 6cfb1fe..f6197e3 100644 --- a/server/tests/unit/domain/discovery/test_get_feature_catalog.py +++ b/server/tests/unit/domain/discovery/test_get_feature_catalog.py @@ -10,9 +10,18 @@ GetFeatureCatalogHandler, GetFeatureCatalogResult, ) +from osa.config import Config from osa.domain.discovery.service.discovery import DiscoveryService +def _config() -> Config: + import os + + os.environ.setdefault("OSA_AUTH__JWT__SECRET", "a" * 64) + 
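+    # Test-only config: dummy secret above, base URL below; mirrors the
+    # _config() helpers in the sibling discovery test modules.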
os.environ.setdefault("OSA_BASE_URL", "http://localhost:8000") + return Config() # type: ignore[call-arg] + + @pytest.fixture def mock_read_store() -> AsyncMock: return AsyncMock() @@ -22,12 +31,17 @@ def mock_read_store() -> AsyncMock: def mock_field_reader() -> AsyncMock: reader = AsyncMock() reader.get_all_field_types.return_value = {} + reader.get_fields_for_schema.return_value = {} return reader @pytest.fixture def service(mock_read_store: AsyncMock, mock_field_reader: AsyncMock) -> DiscoveryService: - return DiscoveryService(read_store=mock_read_store, field_reader=mock_field_reader) + return DiscoveryService( + read_store=mock_read_store, + field_reader=mock_field_reader, + config=_config(), + ) class TestGetFeatureCatalogHandler: diff --git a/server/tests/unit/domain/discovery/test_search_features.py b/server/tests/unit/domain/discovery/test_search_features.py index 8c46b1f..b89494b 100644 --- a/server/tests/unit/domain/discovery/test_search_features.py +++ b/server/tests/unit/domain/discovery/test_search_features.py @@ -4,13 +4,15 @@ import pytest +from osa.config import Config +from osa.domain.discovery.model.refs import FeatureFieldRef from osa.domain.discovery.model.value import ( ColumnInfo, FeatureCatalogEntry, FeatureRow, FeatureSearchResult, - Filter, FilterOperator, + Predicate, SortOrder, ) from osa.domain.discovery.query.search_features import ( @@ -35,6 +37,18 @@ def _make_catalog_entry() -> FeatureCatalogEntry: ) +def _config() -> Config: + import os + + os.environ.setdefault("OSA_AUTH__JWT__SECRET", "a" * 64) + os.environ.setdefault("OSA_BASE_URL", "http://localhost:8000") + return Config() # type: ignore[call-arg] + + +def _predicate(hook: str, column: str, op: FilterOperator, value: object) -> Predicate: + return Predicate(field=FeatureFieldRef(hook=hook, column=column), op=op, value=value) + + @pytest.fixture def mock_read_store() -> AsyncMock: store = AsyncMock() @@ -47,12 +61,17 @@ def mock_read_store() -> AsyncMock: def mock_field_reader() -> AsyncMock: reader = AsyncMock() reader.get_all_field_types.return_value = {} + reader.get_fields_for_schema.return_value = {} return reader @pytest.fixture def service(mock_read_store: AsyncMock, mock_field_reader: AsyncMock) -> DiscoveryService: - return DiscoveryService(read_store=mock_read_store, field_reader=mock_field_reader) + return DiscoveryService( + read_store=mock_read_store, + field_reader=mock_field_reader, + config=_config(), + ) class TestSearchFeaturesHandler: @@ -103,10 +122,11 @@ async def test_raises_not_found_for_unknown_hook( ) -> None: mock_read_store.get_feature_table_schema.return_value = None - with pytest.raises(NotFoundError, match="unknown_hook"): + with pytest.raises(NotFoundError): await service.search_features( hook_name="unknown_hook", - filters=[], + filter_expr=None, + schema_id=None, record_srn=None, sort="id", order=SortOrder.DESC, @@ -118,7 +138,8 @@ async def test_rejects_unknown_column(self, service: DiscoveryService) -> None: with pytest.raises(ValidationError, match="bogus"): await service.search_features( hook_name="detect_pockets", - filters=[Filter(field="bogus", operator=FilterOperator.EQ, value=1)], + filter_expr=_predicate("detect_pockets", "bogus", FilterOperator.EQ, 1), + schema_id=None, record_srn=None, sort="id", order=SortOrder.DESC, @@ -130,19 +151,8 @@ async def test_validates_operator_for_number_column(self, service: DiscoveryServ with pytest.raises(ValidationError, match="contains"): await service.search_features( hook_name="detect_pockets", - 
filters=[Filter(field="score", operator=FilterOperator.CONTAINS, value="x")], - record_srn=None, - sort="id", - order=SortOrder.DESC, - cursor=None, - limit=50, - ) - - async def test_validates_operator_for_boolean_column(self, service: DiscoveryService) -> None: - with pytest.raises(ValidationError, match="gte"): - await service.search_features( - hook_name="detect_pockets", - filters=[Filter(field="is_active", operator=FilterOperator.GTE, value=True)], + filter_expr=_predicate("detect_pockets", "score", FilterOperator.CONTAINS, "x"), + schema_id=None, record_srn=None, sort="id", order=SortOrder.DESC, @@ -153,7 +163,8 @@ async def test_validates_operator_for_boolean_column(self, service: DiscoverySer async def test_accepts_string_contains_operator(self, service: DiscoveryService) -> None: await service.search_features( hook_name="detect_pockets", - filters=[Filter(field="label", operator=FilterOperator.CONTAINS, value="test")], + filter_expr=_predicate("detect_pockets", "label", FilterOperator.CONTAINS, "test"), + schema_id=None, record_srn=None, sort="id", order=SortOrder.DESC, @@ -167,7 +178,8 @@ async def test_passes_record_srn_filter( srn = RecordSRN.parse("urn:osa:localhost:rec:abc@1") await service.search_features( hook_name="detect_pockets", - filters=[], + filter_expr=None, + schema_id=None, record_srn=srn, sort="id", order=SortOrder.DESC, @@ -178,33 +190,13 @@ async def test_passes_record_srn_filter( call_kwargs = mock_read_store.search_features.call_args assert call_kwargs.kwargs["record_srn"] == srn - async def test_decodes_cursor( - self, service: DiscoveryService, mock_read_store: AsyncMock - ) -> None: - from osa.domain.discovery.model.value import encode_cursor - - cursor = encode_cursor(7.66, 42) - await service.search_features( - hook_name="detect_pockets", - filters=[], - record_srn=None, - sort="score", - order=SortOrder.DESC, - cursor=cursor, - limit=50, - ) - - call_kwargs = mock_read_store.search_features.call_args - decoded = call_kwargs.kwargs["cursor"] - assert decoded["s"] == 7.66 - assert decoded["id"] == 42 - async def test_delegates_to_read_store( self, service: DiscoveryService, mock_read_store: AsyncMock ) -> None: await service.search_features( hook_name="detect_pockets", - filters=[Filter(field="score", operator=FilterOperator.GTE, value=6.0)], + filter_expr=_predicate("detect_pockets", "score", FilterOperator.GTE, 6.0), + schema_id=None, record_srn=None, sort="score", order=SortOrder.DESC, @@ -215,14 +207,13 @@ async def test_delegates_to_read_store( mock_read_store.search_features.assert_called_once() call_kwargs = mock_read_store.search_features.call_args assert call_kwargs.kwargs["hook_name"] == "detect_pockets" - assert len(call_kwargs.kwargs["filters"]) == 1 + assert call_kwargs.kwargs["filter_expr"] is not None class TestSearchFeaturesPagination: async def test_has_more_false_when_exactly_limit_rows( self, service: DiscoveryService, mock_read_store: AsyncMock ) -> None: - """Exactly limit rows should NOT report has_more (no false positive).""" srn = RecordSRN.parse("urn:osa:localhost:rec:abc@1") mock_read_store.search_features.return_value = [ FeatureRow(row_id=i, record_srn=srn, data={"score": float(i)}) for i in range(3) @@ -230,7 +221,8 @@ async def test_has_more_false_when_exactly_limit_rows( result = await service.search_features( hook_name="detect_pockets", - filters=[], + filter_expr=None, + schema_id=None, record_srn=None, sort="score", order=SortOrder.DESC, @@ -245,7 +237,6 @@ async def test_has_more_false_when_exactly_limit_rows( async def 
test_has_more_true_when_more_than_limit_rows( self, service: DiscoveryService, mock_read_store: AsyncMock ) -> None: - """Adapter returning limit+1 rows signals more pages exist.""" srn = RecordSRN.parse("urn:osa:localhost:rec:abc@1") mock_read_store.search_features.return_value = [ FeatureRow(row_id=i, record_srn=srn, data={"score": float(i)}) for i in range(4) @@ -253,7 +244,8 @@ async def test_has_more_true_when_more_than_limit_rows( result = await service.search_features( hook_name="detect_pockets", - filters=[], + filter_expr=None, + schema_id=None, record_srn=None, sort="score", order=SortOrder.DESC, @@ -268,10 +260,10 @@ async def test_has_more_true_when_more_than_limit_rows( async def test_passes_limit_plus_one_to_read_store( self, service: DiscoveryService, mock_read_store: AsyncMock ) -> None: - """Service should fetch one extra row to detect more pages.""" await service.search_features( hook_name="detect_pockets", - filters=[], + filter_expr=None, + schema_id=None, record_srn=None, sort="score", order=SortOrder.DESC, diff --git a/server/tests/unit/domain/discovery/test_search_records.py b/server/tests/unit/domain/discovery/test_search_records.py index c713a5b..b9a7fbd 100644 --- a/server/tests/unit/domain/discovery/test_search_records.py +++ b/server/tests/unit/domain/discovery/test_search_records.py @@ -40,7 +40,9 @@ async def test_delegates_to_service( await handler.run(cmd) mock_service.search_records.assert_called_once_with( - filters=[], + filter_expr=None, + schema_id=None, + convention_srn=None, q=None, sort="published_at", order=SortOrder.DESC, diff --git a/server/tests/unit/domain/discovery/test_value.py b/server/tests/unit/domain/discovery/test_value.py index 0accb29..18518f5 100644 --- a/server/tests/unit/domain/discovery/test_value.py +++ b/server/tests/unit/domain/discovery/test_value.py @@ -69,31 +69,38 @@ def test_non_dict_payload(self) -> None: class TestValidOperators: - def test_text_operators(self) -> None: - assert VALID_OPERATORS[FieldType.TEXT] == {FilterOperator.EQ, FilterOperator.CONTAINS} - - def test_url_operators(self) -> None: - assert VALID_OPERATORS[FieldType.URL] == {FilterOperator.EQ, FilterOperator.CONTAINS} - - def test_number_operators(self) -> None: - assert VALID_OPERATORS[FieldType.NUMBER] == { - FilterOperator.EQ, - FilterOperator.GTE, - FilterOperator.LTE, - } - - def test_date_operators(self) -> None: - assert VALID_OPERATORS[FieldType.DATE] == { - FilterOperator.EQ, - FilterOperator.GTE, - FilterOperator.LTE, - } + def test_text_operators_include_basics(self) -> None: + ops = VALID_OPERATORS[FieldType.TEXT] + assert FilterOperator.EQ in ops + assert FilterOperator.CONTAINS in ops + assert FilterOperator.IN in ops + assert FilterOperator.NEQ in ops + + def test_url_operators_include_basics(self) -> None: + ops = VALID_OPERATORS[FieldType.URL] + assert FilterOperator.EQ in ops + assert FilterOperator.CONTAINS in ops + assert FilterOperator.IN in ops + + def test_number_operators_support_ordering(self) -> None: + ops = VALID_OPERATORS[FieldType.NUMBER] + assert FilterOperator.EQ in ops + assert FilterOperator.GT in ops + assert FilterOperator.GTE in ops + assert FilterOperator.LT in ops + assert FilterOperator.LTE in ops + + def test_date_operators_support_ordering(self) -> None: + ops = VALID_OPERATORS[FieldType.DATE] + assert FilterOperator.GTE in ops + assert FilterOperator.LTE in ops def test_boolean_operators(self) -> None: - assert VALID_OPERATORS[FieldType.BOOLEAN] == {FilterOperator.EQ} + assert FilterOperator.EQ in 
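The pagination tests above dropped their explanatory docstrings; for reference, the convention they pin down is the N+1 fetch: the service asks the read store for `limit + 1` rows, uses the extra row only as a has-more signal, and trims it before returning. A sketch of the idea (hypothetical helper, not the service's literal code):

    async def _page(read_store, *, limit: int, **query_kwargs):
        rows = await read_store.search_features(**query_kwargs, limit=limit + 1)  # ask for one extra row
        has_more = len(rows) > limit
        return rows[:limit], has_more  # trim the sentinel row back to the page size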
VALID_OPERATORS[FieldType.BOOLEAN] + assert FilterOperator.IS_NULL in VALID_OPERATORS[FieldType.BOOLEAN] def test_term_operators(self) -> None: - assert VALID_OPERATORS[FieldType.TERM] == {FilterOperator.EQ} + assert FilterOperator.EQ in VALID_OPERATORS[FieldType.TERM] def test_all_field_types_have_operators(self) -> None: for ft in FieldType: diff --git a/server/tests/unit/domain/feature/test_convention_ready.py b/server/tests/unit/domain/feature/test_convention_ready.py deleted file mode 100644 index 8f5ad1e..0000000 --- a/server/tests/unit/domain/feature/test_convention_ready.py +++ /dev/null @@ -1,42 +0,0 @@ -"""Unit tests for ConventionReady event. - -Tests for User Story 2: Convention Initialization Chain. -""" - -from uuid import uuid4 - -from osa.domain.feature.event.convention_ready import ConventionReady -from osa.domain.shared.event import EventId -from osa.domain.shared.model.srn import ConventionSRN - - -def _make_conv_srn() -> ConventionSRN: - return ConventionSRN.parse("urn:osa:localhost:conv:test-conv@1.0.0") - - -class TestConventionReady: - def test_creation_with_convention_srn(self): - """ConventionReady event carries convention_srn.""" - srn = _make_conv_srn() - event = ConventionReady(id=EventId(uuid4()), convention_srn=srn) - - assert event.convention_srn == srn - assert event.id is not None - - def test_serialization_roundtrip(self): - """ConventionReady serializes and deserializes correctly.""" - srn = _make_conv_srn() - event = ConventionReady(id=EventId(uuid4()), convention_srn=srn) - - data = event.model_dump() - restored = ConventionReady.model_validate(data) - - assert restored.convention_srn == event.convention_srn - assert restored.id == event.id - - def test_registered_in_event_registry(self): - """ConventionReady should be auto-registered in Event._registry.""" - from osa.domain.shared.event import Event - - assert "ConventionReady" in Event._registry - assert Event._registry["ConventionReady"] is ConventionReady diff --git a/server/tests/unit/domain/feature/test_create_feature_tables.py b/server/tests/unit/domain/feature/test_create_feature_tables.py index 712e68d..56466ea 100644 --- a/server/tests/unit/domain/feature/test_create_feature_tables.py +++ b/server/tests/unit/domain/feature/test_create_feature_tables.py @@ -1,7 +1,4 @@ -"""Unit tests for CreateFeatureTables event handler. - -Tests for User Story 2: Convention Initialization Chain. 
-""" +"""Unit tests for CreateFeatureTables event handler.""" from unittest.mock import AsyncMock from uuid import uuid4 @@ -9,7 +6,6 @@ import pytest from osa.domain.deposition.event.convention_registered import ConventionRegistered -from osa.domain.feature.event.convention_ready import ConventionReady from osa.domain.feature.handler.create_feature_tables import CreateFeatureTables from osa.domain.shared.error import ConflictError from osa.domain.shared.event import EventId @@ -19,13 +15,17 @@ OciConfig, TableFeatureSpec, ) -from osa.domain.shared.model.srn import ConventionSRN +from osa.domain.shared.model.srn import ConventionSRN, SchemaId def _make_conv_srn() -> ConventionSRN: return ConventionSRN.parse("urn:osa:localhost:conv:test@1.0.0") +def _make_schema_id() -> SchemaId: + return SchemaId.parse("test@1.0.0") + + def _make_hook_definition(name: str = "pocket_detect") -> HookDefinition: return HookDefinition( name=name, @@ -44,107 +44,65 @@ def _make_event(hooks: list[HookDefinition] | None = None) -> ConventionRegister return ConventionRegistered( id=EventId(uuid4()), convention_srn=_make_conv_srn(), + schema_id=_make_schema_id(), + schema_fields=[], hooks=hooks or [], ) class TestCreateFeatureTables: @pytest.mark.asyncio - async def test_creates_tables_and_emits_convention_ready(self): - """Given ConventionRegistered with hooks, creates feature tables and emits ConventionReady.""" + async def test_creates_tables_for_each_hook(self): hook = _make_hook_definition() event = _make_event(hooks=[hook]) feature_service = AsyncMock() - outbox = AsyncMock() - - handler = CreateFeatureTables( - feature_service=feature_service, - outbox=outbox, - ) + handler = CreateFeatureTables(feature_service=feature_service) await handler.handle(event) feature_service.create_table.assert_called_once_with(hook) - outbox.append.assert_called_once() - emitted = outbox.append.call_args[0][0] - assert isinstance(emitted, ConventionReady) - assert emitted.convention_srn == event.convention_srn @pytest.mark.asyncio async def test_creates_multiple_tables(self): - """Creates a feature table for each hook in the event.""" hooks = [_make_hook_definition("hook_a"), _make_hook_definition("hook_b")] event = _make_event(hooks=hooks) feature_service = AsyncMock() - outbox = AsyncMock() - - handler = CreateFeatureTables( - feature_service=feature_service, - outbox=outbox, - ) + handler = CreateFeatureTables(feature_service=feature_service) await handler.handle(event) assert feature_service.create_table.call_count == 2 - outbox.append.assert_called_once() @pytest.mark.asyncio - async def test_emits_convention_ready_with_empty_hooks(self): - """Given empty hooks, still emits ConventionReady.""" + async def test_no_hooks_is_noop(self): event = _make_event(hooks=[]) feature_service = AsyncMock() - outbox = AsyncMock() - - handler = CreateFeatureTables( - feature_service=feature_service, - outbox=outbox, - ) + handler = CreateFeatureTables(feature_service=feature_service) await handler.handle(event) feature_service.create_table.assert_not_called() - outbox.append.assert_called_once() - emitted = outbox.append.call_args[0][0] - assert isinstance(emitted, ConventionReady) @pytest.mark.asyncio - async def test_does_not_emit_convention_ready_on_failure(self): - """Feature table creation failure does not emit ConventionReady.""" + async def test_propagates_non_conflict_errors(self): hook = _make_hook_definition() event = _make_event(hooks=[hook]) feature_service = AsyncMock() feature_service.create_table.side_effect = 
RuntimeError("DDL failed") - outbox = AsyncMock() - - handler = CreateFeatureTables( - feature_service=feature_service, - outbox=outbox, - ) + handler = CreateFeatureTables(feature_service=feature_service) with pytest.raises(RuntimeError, match="DDL failed"): await handler.handle(event) - outbox.append.assert_not_called() - @pytest.mark.asyncio async def test_skips_existing_tables_on_redelivery(self): - """ConflictError (table already exists) is skipped; ConventionReady still emitted.""" hooks = [_make_hook_definition("hook_a"), _make_hook_definition("hook_b")] event = _make_event(hooks=hooks) feature_service = AsyncMock() feature_service.create_table.side_effect = ConflictError("table already exists") - outbox = AsyncMock() - - handler = CreateFeatureTables( - feature_service=feature_service, - outbox=outbox, - ) + handler = CreateFeatureTables(feature_service=feature_service) await handler.handle(event) assert feature_service.create_table.call_count == 2 - outbox.append.assert_called_once() - emitted = outbox.append.call_args[0][0] - assert isinstance(emitted, ConventionReady) - assert emitted.convention_srn == event.convention_srn diff --git a/server/tests/unit/domain/feature/test_insert_record_features.py b/server/tests/unit/domain/feature/test_insert_record_features.py index 9bb2a52..8de7db2 100644 --- a/server/tests/unit/domain/feature/test_insert_record_features.py +++ b/server/tests/unit/domain/feature/test_insert_record_features.py @@ -13,6 +13,7 @@ from osa.domain.shared.model.srn import ( ConventionSRN, RecordSRN, + SchemaId, ) @@ -24,6 +25,10 @@ def _make_conv_srn() -> ConventionSRN: return ConventionSRN.parse("urn:osa:localhost:conv:test@1.0.0") +def _make_schema_id() -> SchemaId: + return SchemaId.parse("test@1.0.0") + + def _make_event( expected_features: list[str] | None = None, ) -> RecordPublished: @@ -33,6 +38,7 @@ def _make_event( source=DepositionSource(id="urn:osa:localhost:dep:test-dep"), metadata={"title": "Test"}, convention_srn=_make_conv_srn(), + schema_id=_make_schema_id(), expected_features=expected_features or [], ) @@ -219,6 +225,7 @@ async def test_ingest_source_uses_source_fields(self): ), metadata={"title": "Ingested"}, convention_srn=_make_conv_srn(), + schema_id=_make_schema_id(), expected_features=["pocket_detect"], ) await handler.handle(event) diff --git a/server/tests/unit/domain/index/test_fanout_listener.py b/server/tests/unit/domain/index/test_fanout_listener.py index 329cf08..2aa0988 100644 --- a/server/tests/unit/domain/index/test_fanout_listener.py +++ b/server/tests/unit/domain/index/test_fanout_listener.py @@ -97,6 +97,9 @@ async def test_creates_index_record_per_backend( record_srn=sample_record_srn, source=DepositionSource(id=str(sample_deposition_srn)), convention_srn=ConventionSRN.parse("urn:osa:localhost:conv:test@1.0.0"), + schema_id=__import__( + "osa.domain.shared.model.srn", fromlist=["SchemaId"] + ).SchemaId.parse("test@1.0.0"), metadata=sample_metadata, ) @@ -137,6 +140,9 @@ async def test_creates_unique_event_ids( record_srn=sample_record_srn, source=DepositionSource(id=str(sample_deposition_srn)), convention_srn=ConventionSRN.parse("urn:osa:localhost:conv:test@1.0.0"), + schema_id=__import__( + "osa.domain.shared.model.srn", fromlist=["SchemaId"] + ).SchemaId.parse("test@1.0.0"), metadata=sample_metadata, ) @@ -166,6 +172,9 @@ async def test_empty_registry_creates_no_events( record_srn=sample_record_srn, source=DepositionSource(id=str(sample_deposition_srn)), 
convention_srn=ConventionSRN.parse("urn:osa:localhost:conv:test@1.0.0"), + schema_id=__import__( + "osa.domain.shared.model.srn", fromlist=["SchemaId"] + ).SchemaId.parse("test@1.0.0"), metadata=sample_metadata, ) diff --git a/server/tests/unit/domain/record/test_get_record_handler.py b/server/tests/unit/domain/record/test_get_record_handler.py index 8fea888..206618e 100644 --- a/server/tests/unit/domain/record/test_get_record_handler.py +++ b/server/tests/unit/domain/record/test_get_record_handler.py @@ -8,7 +8,7 @@ from osa.domain.record.model.aggregate import Record from osa.domain.shared.error import NotFoundError from osa.domain.shared.model.source import DepositionSource -from osa.domain.shared.model.srn import ConventionSRN, RecordSRN +from osa.domain.shared.model.srn import ConventionSRN, RecordSRN, SchemaId def _make_record_srn() -> RecordSRN: @@ -24,6 +24,7 @@ def _make_record() -> Record: srn=_make_record_srn(), source=DepositionSource(id="urn:osa:localhost:dep:test-dep"), convention_srn=_make_conv_srn(), + schema_id=SchemaId.parse("test@1.0.0"), metadata={"title": "Test Protein"}, published_at=datetime.now(UTC), ) diff --git a/server/tests/unit/domain/record/test_record_features.py b/server/tests/unit/domain/record/test_record_features.py index e1143f4..2fe6a1e 100644 --- a/server/tests/unit/domain/record/test_record_features.py +++ b/server/tests/unit/domain/record/test_record_features.py @@ -9,7 +9,7 @@ from osa.domain.record.query.get_record import GetRecord, GetRecordHandler, RecordDetail from osa.domain.record.service.record import RecordService from osa.domain.shared.model.source import DepositionSource -from osa.domain.shared.model.srn import ConventionSRN, Domain, RecordSRN +from osa.domain.shared.model.srn import ConventionSRN, Domain, RecordSRN, SchemaId from osa.infrastructure.persistence.adapter.feature_reader import PostgresFeatureReader @@ -177,6 +177,7 @@ def _make_record() -> Record: srn=RecordSRN.parse("urn:osa:localhost:rec:abc@1"), source=DepositionSource(id="urn:osa:localhost:dep:dep1"), convention_srn=ConventionSRN.parse("urn:osa:localhost:conv:test@1.0.0"), + schema_id=SchemaId.parse("test@1.0.0"), metadata={"title": "Test"}, published_at=datetime.now(UTC), ) @@ -191,6 +192,8 @@ async def test_get_features_delegates_to_reader(self) -> None: service = RecordService( record_repo=mock_repo, + convention_repo=AsyncMock(), + metadata_service=AsyncMock(), outbox=mock_outbox, node_domain=Domain("localhost"), feature_reader=mock_reader, diff --git a/server/tests/unit/domain/record/test_record_published_enriched.py b/server/tests/unit/domain/record/test_record_published_enriched.py index bee3f0e..670ed1d 100644 --- a/server/tests/unit/domain/record/test_record_published_enriched.py +++ b/server/tests/unit/domain/record/test_record_published_enriched.py @@ -8,7 +8,10 @@ from osa.domain.record.event.record_published import RecordPublished from osa.domain.shared.event import EventId from osa.domain.shared.model.source import DepositionSource -from osa.domain.shared.model.srn import ConventionSRN, RecordSRN +from osa.domain.shared.model.srn import ConventionSRN, RecordSRN, SchemaId + + +SCHEMA = SchemaId.parse("test@1.0.0") class TestRecordPublishedEnriched: @@ -20,6 +23,7 @@ def test_carries_source(self): source=source, metadata={"title": "Test"}, convention_srn=ConventionSRN.parse("urn:osa:localhost:conv:test@1.0.0"), + schema_id=SCHEMA, expected_features=["pocketeer"], ) assert event.source.type == "deposition" @@ -32,6 +36,7 @@ def 
test_carries_convention_srn(self): source=DepositionSource(id="urn:osa:localhost:dep:test"), metadata={"title": "Test"}, convention_srn=ConventionSRN.parse("urn:osa:localhost:conv:test@1.0.0"), + schema_id=SCHEMA, expected_features=[], ) assert event.convention_srn == ConventionSRN.parse("urn:osa:localhost:conv:test@1.0.0") @@ -43,6 +48,7 @@ def test_carries_expected_features(self): source=DepositionSource(id="urn:osa:localhost:dep:test"), metadata={"title": "Test"}, convention_srn=ConventionSRN.parse("urn:osa:localhost:conv:test@1.0.0"), + schema_id=SCHEMA, expected_features=["pocketeer", "qc_check"], ) assert event.expected_features == ["pocketeer", "qc_check"] diff --git a/server/tests/unit/domain/record/test_record_service.py b/server/tests/unit/domain/record/test_record_service.py index e62c2cd..dbeafc0 100644 --- a/server/tests/unit/domain/record/test_record_service.py +++ b/server/tests/unit/domain/record/test_record_service.py @@ -1,19 +1,26 @@ """Unit tests for RecordService.""" +from datetime import UTC, datetime from unittest.mock import AsyncMock, MagicMock from uuid import uuid4 import pytest +from osa.domain.deposition.model.convention import Convention +from osa.domain.deposition.model.value import FileRequirements +from osa.domain.deposition.port.convention_repository import ConventionRepository from osa.domain.record.event.record_published import RecordPublished from osa.domain.record.model.draft import RecordDraft from osa.domain.record.port.repository import RecordRepository from osa.domain.record.service.record import RecordService -from osa.domain.shared.model.source import ( - DepositionSource, - IngestSource, +from osa.domain.shared.model.source import DepositionSource, IngestSource +from osa.domain.shared.model.srn import ( + ConventionSRN, + DepositionSRN, + Domain, + LocalId, + SchemaId, ) -from osa.domain.shared.model.srn import ConventionSRN, DepositionSRN, Domain, LocalId from osa.domain.shared.outbox import Outbox @@ -21,6 +28,22 @@ def _make_conv_srn() -> ConventionSRN: return ConventionSRN.parse("urn:osa:localhost:conv:test@1.0.0") +def _make_schema_id() -> SchemaId: + return SchemaId.parse("test@1.0.0") + + +def _make_convention() -> Convention: + return Convention( + srn=_make_conv_srn(), + title="Test Convention", + description=None, + schema_id=_make_schema_id(), + file_requirements=FileRequirements(accepted_types=[], max_count=0, max_file_size=0), + hooks=[], + created_at=datetime.now(UTC), + ) + + @pytest.fixture def mock_record_repo() -> RecordRepository: repo = MagicMock(spec=RecordRepository) @@ -28,6 +51,13 @@ def mock_record_repo() -> RecordRepository: return repo +@pytest.fixture +def mock_convention_repo() -> ConventionRepository: + repo = MagicMock(spec=ConventionRepository) + repo.get = AsyncMock(return_value=_make_convention()) + return repo + + @pytest.fixture def mock_outbox() -> Outbox: outbox = MagicMock(spec=Outbox) @@ -51,28 +81,40 @@ def sample_draft(node_domain: Domain) -> RecordDraft: ) +def _make_service( + record_repo: RecordRepository, + convention_repo: ConventionRepository, + outbox: Outbox, + node_domain: Domain, +) -> RecordService: + return RecordService( + record_repo=record_repo, + convention_repo=convention_repo, + metadata_service=AsyncMock(), + outbox=outbox, + node_domain=node_domain, + feature_reader=AsyncMock(), + ) + + class TestRecordService: @pytest.mark.asyncio async def test_publish_record_creates_record( self, mock_record_repo: RecordRepository, + mock_convention_repo: ConventionRepository, mock_outbox: Outbox, 
node_domain: Domain, sample_draft: RecordDraft, ): - """Service should create and persist a Record from a draft.""" - service = RecordService( - record_repo=mock_record_repo, - outbox=mock_outbox, - node_domain=node_domain, - feature_reader=AsyncMock(), - ) + service = _make_service(mock_record_repo, mock_convention_repo, mock_outbox, node_domain) record = await service.publish_record(sample_draft) assert record is not None assert record.source == sample_draft.source assert record.convention_srn == sample_draft.convention_srn + assert record.schema_id == _make_schema_id() assert record.metadata == sample_draft.metadata mock_record_repo.save.assert_called_once() @@ -80,17 +122,12 @@ async def test_publish_record_creates_record( async def test_publish_record_emits_record_published_event( self, mock_record_repo: RecordRepository, + mock_convention_repo: ConventionRepository, mock_outbox: Outbox, node_domain: Domain, sample_draft: RecordDraft, ): - """Service should emit RecordPublished event with source-agnostic fields.""" - service = RecordService( - record_repo=mock_record_repo, - outbox=mock_outbox, - node_domain=node_domain, - feature_reader=AsyncMock(), - ) + service = _make_service(mock_record_repo, mock_convention_repo, mock_outbox, node_domain) record = await service.publish_record(sample_draft) @@ -100,6 +137,7 @@ async def test_publish_record_emits_record_published_event( assert event.record_srn == record.srn assert event.source == sample_draft.source assert event.convention_srn == sample_draft.convention_srn + assert event.schema_id == _make_schema_id() assert event.expected_features == sample_draft.expected_features assert event.metadata == sample_draft.metadata @@ -107,17 +145,12 @@ async def test_publish_record_emits_record_published_event( async def test_publish_record_creates_version_1( self, mock_record_repo: RecordRepository, + mock_convention_repo: ConventionRepository, mock_outbox: Outbox, node_domain: Domain, sample_draft: RecordDraft, ): - """New records should be version 1.""" - service = RecordService( - record_repo=mock_record_repo, - outbox=mock_outbox, - node_domain=node_domain, - feature_reader=AsyncMock(), - ) + service = _make_service(mock_record_repo, mock_convention_repo, mock_outbox, node_domain) record = await service.publish_record(sample_draft) @@ -125,16 +158,14 @@ async def test_publish_record_creates_version_1( class TestRecordServiceIngestSource: - """US2: Verify ingest-sourced records publish correctly.""" - @pytest.mark.asyncio async def test_publish_with_ingest_source( self, mock_record_repo: RecordRepository, + mock_convention_repo: ConventionRepository, mock_outbox: Outbox, node_domain: Domain, ): - """IngestSource draft produces correct Record + RecordPublished event.""" draft = RecordDraft( source=IngestSource( id="run-123-pdb-456", @@ -146,12 +177,7 @@ async def test_publish_with_ingest_source( expected_features=["pocket_detect"], ) - service = RecordService( - record_repo=mock_record_repo, - outbox=mock_outbox, - node_domain=node_domain, - feature_reader=AsyncMock(), - ) + service = _make_service(mock_record_repo, mock_convention_repo, mock_outbox, node_domain) record = await service.publish_record(draft) diff --git a/server/tests/unit/domain/semantics/test_schema.py b/server/tests/unit/domain/semantics/test_schema.py index 0a245ab..c853f3d 100644 --- a/server/tests/unit/domain/semantics/test_schema.py +++ b/server/tests/unit/domain/semantics/test_schema.py @@ -14,11 +14,11 @@ TextConstraints, ) from osa.domain.shared.error import 
ValidationError -from osa.domain.shared.model.srn import OntologySRN, SchemaSRN +from osa.domain.shared.model.srn import OntologySRN, SchemaId -def _make_srn(id: str = "test-schema", version: str = "1.0.0") -> SchemaSRN: - return SchemaSRN.parse(f"urn:osa:localhost:schema:{id}@{version}") +def _make_srn(id: str = "test-schema", version: str = "1.0.0") -> SchemaId: + return SchemaId.parse(f"{id}@{version}") def _make_text_field(name: str = "title", required: bool = True) -> FieldDefinition: @@ -33,7 +33,7 @@ def _make_text_field(name: str = "title", required: bool = True) -> FieldDefinit class TestSchemaCreation: def test_create_with_single_field(self): schema = Schema( - srn=_make_srn(), + id=_make_srn(), title="Test Schema", fields=[_make_text_field()], created_at=datetime.now(UTC), @@ -43,7 +43,7 @@ def test_create_with_single_field(self): def test_create_with_multiple_fields(self): schema = Schema( - srn=_make_srn(), + id=_make_srn(), title="scRNA-seq", fields=[ _make_text_field("title"), @@ -62,7 +62,7 @@ def test_create_with_multiple_fields(self): def test_create_with_ontology_reference(self): onto_srn = OntologySRN.parse("urn:osa:localhost:onto:sex@1.0.0") schema = Schema( - srn=_make_srn(), + id=_make_srn(), title="With Ontology", fields=[ FieldDefinition( @@ -79,7 +79,7 @@ def test_create_with_ontology_reference(self): def test_create_with_text_constraints(self): schema = Schema( - srn=_make_srn(), + id=_make_srn(), title="Constrained", fields=[ FieldDefinition( @@ -99,7 +99,7 @@ class TestSchemaInvariants: def test_rejects_empty_fields(self): with pytest.raises(ValidationError, match="at least one field"): Schema( - srn=_make_srn(), + id=_make_srn(), title="Empty", fields=[], created_at=datetime.now(UTC), @@ -108,7 +108,7 @@ def test_rejects_empty_fields(self): def test_rejects_duplicate_field_names(self): with pytest.raises(ValidationError, match="Duplicate field names"): Schema( - srn=_make_srn(), + id=_make_srn(), title="Duplicate", fields=[ _make_text_field("title"), diff --git a/server/tests/unit/domain/semantics/test_schema_service.py b/server/tests/unit/domain/semantics/test_schema_service.py index 71fbaf4..a87d163 100644 --- a/server/tests/unit/domain/semantics/test_schema_service.py +++ b/server/tests/unit/domain/semantics/test_schema_service.py @@ -13,12 +13,12 @@ TermConstraints, ) from osa.domain.semantics.service.schema import SchemaService -from osa.domain.shared.error import NotFoundError, ValidationError -from osa.domain.shared.model.srn import Domain, OntologySRN, SchemaSRN +from osa.domain.shared.error import ConflictError, NotFoundError, ValidationError +from osa.domain.shared.model.srn import Domain, OntologySRN, SchemaId, SchemaIdentifier -def _make_schema_srn(id: str = "test-schema", version: str = "1.0.0") -> SchemaSRN: - return SchemaSRN.parse(f"urn:osa:localhost:schema:{id}@{version}") +def _make_schema_id(id: str = "test-schema", version: str = "1.0.0") -> SchemaId: + return SchemaId.parse(f"{id}@{version}") def _make_ontology_srn(id: str = "sex", version: str = "1.0.0") -> OntologySRN: @@ -48,6 +48,7 @@ class TestSchemaServiceCreate: @pytest.mark.asyncio async def test_create_schema_without_ontology_refs(self): schema_repo = AsyncMock() + schema_repo.get.return_value = None ontology_repo = AsyncMock() service = SchemaService( @@ -56,16 +57,19 @@ async def test_create_schema_without_ontology_refs(self): node_domain=Domain("localhost"), ) result = await service.create_schema( + id=SchemaIdentifier("simple-schema"), title="Simple Schema", version="1.0.0", 
fields=[_make_text_field()], ) assert result.title == "Simple Schema" + assert result.id.id.root == "simple-schema" schema_repo.save.assert_called_once() @pytest.mark.asyncio async def test_create_schema_with_valid_ontology_ref(self): schema_repo = AsyncMock() + schema_repo.get.return_value = None ontology_repo = AsyncMock() ontology_repo.exists.return_value = True @@ -75,6 +79,7 @@ async def test_create_schema_with_valid_ontology_ref(self): node_domain=Domain("localhost"), ) result = await service.create_schema( + id=SchemaIdentifier("with-ontology"), title="With Ontology", version="1.0.0", fields=[_make_text_field(), _make_term_field()], @@ -85,6 +90,7 @@ async def test_create_schema_with_valid_ontology_ref(self): @pytest.mark.asyncio async def test_create_schema_rejects_invalid_ontology_ref(self): schema_repo = AsyncMock() + schema_repo.get.return_value = None ontology_repo = AsyncMock() ontology_repo.exists.return_value = False @@ -95,14 +101,16 @@ async def test_create_schema_rejects_invalid_ontology_ref(self): ) with pytest.raises(ValidationError, match="Ontology.*not found"): await service.create_schema( + id=SchemaIdentifier("bad-ref"), title="Bad Ref", version="1.0.0", fields=[_make_term_field()], ) @pytest.mark.asyncio - async def test_create_schema_generates_srn(self): + async def test_create_schema_uses_supplied_id(self): schema_repo = AsyncMock() + schema_repo.get.return_value = None ontology_repo = AsyncMock() service = SchemaService( @@ -111,19 +119,67 @@ async def test_create_schema_generates_srn(self): node_domain=Domain("localhost"), ) result = await service.create_schema( - title="Test", + id=SchemaIdentifier("pdb-structure"), + title="PDB Structures", version="1.0.0", fields=[_make_text_field()], ) - assert str(result.srn).startswith("urn:osa:localhost:schema:") - assert str(result.srn).endswith("@1.0.0") + assert str(result.id) == "pdb-structure@1.0.0" + + @pytest.mark.asyncio + async def test_duplicate_id_version_raises_conflict(self): + schema_repo = AsyncMock() + existing_schema = Schema( + id=SchemaId.parse("dup@1.0.0"), + title="Existing", + fields=[_make_text_field()], + created_at=datetime.now(UTC), + ) + schema_repo.get.return_value = existing_schema + ontology_repo = AsyncMock() + + service = SchemaService( + schema_repo=schema_repo, + ontology_repo=ontology_repo, + node_domain=Domain("localhost"), + ) + with pytest.raises(ConflictError) as exc: + await service.create_schema( + id=SchemaIdentifier("dup"), + title="Dup", + version="1.0.0", + fields=[_make_text_field()], + ) + assert exc.value.code == "schema_already_exists" + schema_repo.save.assert_not_called() + + +class TestSchemaIdentifierValidation: + def test_rejects_leading_digit(self): + with pytest.raises(ValueError, match="invalid schema id"): + SchemaIdentifier("3d-scan") + + def test_rejects_uppercase(self): + with pytest.raises(ValueError, match="invalid schema id"): + SchemaIdentifier("PDBStructure") + + def test_rejects_too_short(self): + with pytest.raises(ValueError, match="invalid schema id"): + SchemaIdentifier("ab") + + def test_rejects_underscore(self): + with pytest.raises(ValueError, match="invalid schema id"): + SchemaIdentifier("pdb_structure") + + def test_accepts_hyphens_and_digits(self): + assert SchemaIdentifier("pdb-v2").root == "pdb-v2" class TestSchemaServiceGet: @pytest.mark.asyncio async def test_get_existing(self): schema = Schema( - srn=_make_schema_srn(), + id=_make_schema_id(), title="Test", fields=[_make_text_field()], created_at=datetime.now(UTC), @@ -137,7 +193,7 @@ 
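The TestSchemaIdentifierValidation cases above describe the accepted shape without spelling out a pattern. As a sketch, a regex consistent with those cases would be the following (the real implementation's pattern is not shown in this diff, so treat it as an assumption):

    import re

    # Inferred from the tests: lowercase letter first, then lowercase letters,
    # digits, or hyphens; at least 3 characters overall. Any upper bound is unknown.
    SCHEMA_ID_PATTERN = re.compile(r"^[a-z][a-z0-9-]{2,}$")

    assert SCHEMA_ID_PATTERN.match("pdb-v2")
    assert not SCHEMA_ID_PATTERN.match("3d-scan")        # leading digit
    assert not SCHEMA_ID_PATTERN.match("PDBStructure")   # uppercase
    assert not SCHEMA_ID_PATTERN.match("ab")             # too short
    assert not SCHEMA_ID_PATTERN.match("pdb_structure")  # underscore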
async def test_get_existing(self): ontology_repo=ontology_repo, node_domain=Domain("localhost"), ) - result = await service.get_schema(schema.srn) + result = await service.get_schema(schema.id) assert result == schema @pytest.mark.asyncio @@ -152,14 +208,14 @@ async def test_get_nonexistent_raises(self): node_domain=Domain("localhost"), ) with pytest.raises(NotFoundError): - await service.get_schema(_make_schema_srn()) + await service.get_schema(_make_schema_id()) class TestSchemaServiceList: @pytest.mark.asyncio async def test_list_schemas(self): schema = Schema( - srn=_make_schema_srn(), + id=_make_schema_id(), title="Test", fields=[_make_text_field()], created_at=datetime.now(UTC), diff --git a/server/tests/unit/domain/shared/test_hook_models.py b/server/tests/unit/domain/shared/test_hook_models.py index cf669b0..e7fc144 100644 --- a/server/tests/unit/domain/shared/test_hook_models.py +++ b/server/tests/unit/domain/shared/test_hook_models.py @@ -359,3 +359,38 @@ def test_valid_names_accepted(self): col = ColumnDef(name=name, json_type="number", required=True) assert col.name == name + + def test_hook_name_accepts_40_chars(self): + """Hook names must fit in derived identifiers like + ``fk_features_{name}_record_srn`` — 23 chars overhead + up to 40-char + hook = 63-char max, which is PG's identifier limit.""" + from osa.domain.shared.model.hook import HookDefinition, OciConfig, TableFeatureSpec + + forty = "a" + "b" * 39 + hook = HookDefinition( + name=forty, + runtime=OciConfig(image="img:v1", digest="sha256:abc"), + feature=TableFeatureSpec(cardinality="one", columns=[]), + ) + assert hook.name == forty + + def test_hook_name_rejects_over_40_chars(self): + """41+ char names would produce an FK name exceeding PG's 63-char + identifier limit.""" + from osa.domain.shared.model.hook import HookDefinition, OciConfig, TableFeatureSpec + + with pytest.raises(ValidationError): + HookDefinition( + name="a" + "b" * 40, # 41 chars + runtime=OciConfig(image="img:v1", digest="sha256:abc"), + feature=TableFeatureSpec(cardinality="one", columns=[]), + ) + + def test_column_name_still_accepts_63_chars(self): + """ColumnDef uses plain PgIdentifier — columns don't compose into + longer derived identifiers, so the full 63-char PG limit is fine.""" + from osa.domain.shared.model.hook import ColumnDef + + sixty_three = "a" + "b" * 62 + col = ColumnDef(name=sixty_three, json_type="number", required=True) + assert col.name == sixty_three diff --git a/server/tests/unit/domain/shared/test_srn.py b/server/tests/unit/domain/shared/test_srn.py index 669c5ed..df47011 100644 --- a/server/tests/unit/domain/shared/test_srn.py +++ b/server/tests/unit/domain/shared/test_srn.py @@ -3,7 +3,7 @@ SRN, RecordSRN, DepositionSRN, - SchemaSRN, + SchemaId, ResourceType, ) @@ -17,12 +17,12 @@ def test_parse_record_srn(self): assert srn.version is not None assert srn.version.root == 1 - def test_parse_schema_srn(self): - raw = "urn:osa:node-1:schema:my-schema@1.0.0" - srn = SchemaSRN.parse(raw) - assert srn.type == ResourceType.schema - assert srn.id.root == "my-schema" - assert str(srn.version) == "1.0.0" + def test_parse_schema_id(self): + sid = SchemaId.parse("my-schema@1.0.0") + assert sid.id.root == "my-schema" + assert str(sid.version) == "1.0.0" + assert sid.major == 1 + assert sid.render() == "my-schema@1.0.0" def test_render_srn(self): srn = DepositionSRN.parse("urn:osa:node-1:dep:abc-123") diff --git a/server/tests/unit/infrastructure/persistence/test_metadata_store_coerce.py 
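A quick check of the length arithmetic quoted in the hook-name docstring above: `fk_features_` plus a 40-character hook name plus `_record_srn` exactly fills PostgreSQL's 63-character identifier limit.

    assert len("fk_features_") + 40 + len("_record_srn") == 63  # 12 + 40 + 11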
b/server/tests/unit/infrastructure/persistence/test_metadata_store_coerce.py new file mode 100644 index 0000000..833e636 --- /dev/null +++ b/server/tests/unit/infrastructure/persistence/test_metadata_store_coerce.py @@ -0,0 +1,73 @@ +"""Unit tests for ``_coerce_value`` — ensures bad JSONB values surface as +``ValidationError`` (→ 400) instead of propagating raw ``ValueError`` (→ 500). +""" + +from datetime import date, datetime + +import pytest + +from osa.domain.shared.error import ValidationError +from osa.domain.shared.model.hook import ColumnDef +from osa.infrastructure.persistence.metadata_store import _coerce_value + + +def _date_col(name: str = "collected_on") -> ColumnDef: + return ColumnDef(name=name, json_type="string", format="date", required=False) + + +def _datetime_col(name: str = "measured_at") -> ColumnDef: + return ColumnDef(name=name, json_type="string", format="date-time", required=False) + + +class TestCoerceValueDate: + def test_parses_iso_date_string(self): + assert _coerce_value(_date_col(), "2026-04-23") == date(2026, 4, 23) + + def test_passes_through_date(self): + d = date(2026, 4, 23) + assert _coerce_value(_date_col(), d) is d + + def test_none_passes_through(self): + assert _coerce_value(_date_col(), None) is None + + def test_malformed_iso_date_raises_validation_error(self): + with pytest.raises(ValidationError) as exc_info: + _coerce_value(_date_col("collected_on"), "2026-99-99") + assert exc_info.value.field == "collected_on" + assert "ISO-8601 date" in str(exc_info.value) + + def test_non_string_non_date_raises_validation_error(self): + with pytest.raises(ValidationError) as exc_info: + _coerce_value(_date_col(), 12345) + assert exc_info.value.field == "collected_on" + + def test_includes_record_srn_in_error_when_provided(self): + with pytest.raises(ValidationError, match="record urn:osa:localhost:rec:abc@1"): + _coerce_value(_date_col(), "not-a-date", record_srn="urn:osa:localhost:rec:abc@1") + + +class TestCoerceValueDatetime: + def test_parses_iso_datetime_string(self): + assert _coerce_value(_datetime_col(), "2026-04-23T10:30:00") == datetime( + 2026, 4, 23, 10, 30, 0 + ) + + def test_passes_through_datetime(self): + dt = datetime(2026, 4, 23, 10, 30, 0) + assert _coerce_value(_datetime_col(), dt) is dt + + def test_malformed_iso_datetime_raises_validation_error(self): + with pytest.raises(ValidationError) as exc_info: + _coerce_value(_datetime_col("measured_at"), "not-a-datetime") + assert exc_info.value.field == "measured_at" + assert "ISO-8601 date-time" in str(exc_info.value) + + +class TestCoerceValueOther: + def test_text_passthrough(self): + col = ColumnDef(name="species", json_type="string", format=None, required=False) + assert _coerce_value(col, "Homo sapiens") == "Homo sapiens" + + def test_number_passthrough(self): + col = ColumnDef(name="resolution", json_type="number", format=None, required=False) + assert _coerce_value(col, 1.5) == 1.5 diff --git a/server/tests/unit/test_field_ref_resolution.py b/server/tests/unit/test_field_ref_resolution.py new file mode 100644 index 0000000..d647c3a --- /dev/null +++ b/server/tests/unit/test_field_ref_resolution.py @@ -0,0 +1,42 @@ +"""US3 tests: typed field-reference parsing and tree validation.""" + +import pytest + +from osa.domain.discovery.model.refs import ( + FeatureFieldRef, + MetadataFieldRef, + parse_field_ref, +) + + +class TestParseFieldRef: + def test_parses_metadata_ref(self): + ref = parse_field_ref("metadata.species") + assert isinstance(ref, MetadataFieldRef) + assert ref.field == 
"species" + + def test_parses_feature_ref(self): + ref = parse_field_ref("features.cell_classifier.confidence") + assert isinstance(ref, FeatureFieldRef) + assert ref.hook == "cell_classifier" + assert ref.column == "confidence" + + def test_rejects_unknown_prefix(self): + with pytest.raises(ValueError, match="prefix"): + parse_field_ref("other.foo") + + def test_rejects_malformed_metadata(self): + with pytest.raises(ValueError): + parse_field_ref("metadata.a.b") + + def test_rejects_malformed_feature(self): + with pytest.raises(ValueError): + parse_field_ref("features.hook") + + def test_rejects_invalid_identifier(self): + with pytest.raises(ValueError): + parse_field_ref("metadata.Has-Dash") + + def test_dotted_round_trip(self): + assert parse_field_ref("metadata.species").dotted() == "metadata.species" + assert parse_field_ref("features.hook.col").dotted() == "features.hook.col" diff --git a/server/tests/unit/test_filter_expr_and_compile.py b/server/tests/unit/test_filter_expr_and_compile.py new file mode 100644 index 0000000..41a2ba9 --- /dev/null +++ b/server/tests/unit/test_filter_expr_and_compile.py @@ -0,0 +1,197 @@ +"""US1 tests: FilterExpr AND-tree validation via DiscoveryService bounds.""" + +from unittest.mock import AsyncMock + +import pytest + +from osa.config import Config +from osa.domain.discovery.model.refs import FeatureFieldRef, MetadataFieldRef +from osa.domain.discovery.model.value import ( + And, + FilterOperator, + Predicate, + SortOrder, +) +from osa.domain.discovery.service.discovery import DiscoveryService +from osa.domain.semantics.model.value import FieldType +from osa.domain.shared.error import ValidationError +from osa.domain.shared.model.srn import SchemaId + + +SCHEMA = SchemaId.parse("bio-sample@1.0.0") + + +def _config(overrides: dict | None = None) -> Config: + import os + + os.environ.setdefault("OSA_AUTH__JWT__SECRET", "a" * 64) + os.environ.setdefault("OSA_BASE_URL", "http://localhost:8000") + cfg = Config() # type: ignore[call-arg] + if overrides: + for k, v in overrides.items(): + setattr(cfg, k, v) + return cfg + + +def _svc( + *, + field_map: dict[str, FieldType] | None = None, + max_depth: int | None = None, + max_preds: int | None = None, + max_joins: int | None = None, +) -> DiscoveryService: + read_store = AsyncMock() + read_store.search_records.return_value = [] + read_store.get_feature_catalog.return_value = [] + + reader = AsyncMock() + fm = field_map or { + "title": FieldType.TEXT, + "resolution": FieldType.NUMBER, + } + reader.get_all_field_types.return_value = fm + reader.get_fields_for_schema.return_value = fm + + overrides = {} + if max_depth is not None: + overrides["discovery_max_filter_depth"] = max_depth + if max_preds is not None: + overrides["discovery_max_predicates"] = max_preds + if max_joins is not None: + overrides["discovery_max_cross_domain_joins"] = max_joins + + return DiscoveryService(read_store=read_store, field_reader=reader, config=_config(overrides)) + + +def _pred(field: str, op: FilterOperator, value: object) -> Predicate: + return Predicate(field=MetadataFieldRef(field=field), op=op, value=value) + + +class TestAndOnlyTrees: + async def test_accepts_and_of_predicates(self) -> None: + svc = _svc() + tree = And( + operands=[ + _pred("title", FilterOperator.EQ, "x"), + _pred("resolution", FilterOperator.GTE, 3.0), + ] + ) + await svc.search_records( + filter_expr=tree, + schema_id=SCHEMA, + convention_srn=None, + q=None, + sort="published_at", + order=SortOrder.DESC, + cursor=None, + limit=20, + ) + + +class 
TestBoundsEnforced: + async def test_depth_exceeded(self) -> None: + svc = _svc(max_depth=3) + leaf = _pred("title", FilterOperator.EQ, "x") + tree = leaf + for _ in range(4): + tree = And(operands=[tree, leaf]) + + with pytest.raises(ValidationError, match="depth"): + await svc.search_records( + filter_expr=tree, + schema_id=SCHEMA, + convention_srn=None, + q=None, + sort="published_at", + order=SortOrder.DESC, + cursor=None, + limit=20, + ) + + async def test_predicates_exceeded(self) -> None: + svc = _svc(max_preds=2) + tree = And( + operands=[ + _pred("title", FilterOperator.EQ, "x"), + _pred("title", FilterOperator.EQ, "y"), + _pred("resolution", FilterOperator.GTE, 3.0), + ] + ) + with pytest.raises(ValidationError, match="predicate leaves"): + await svc.search_records( + filter_expr=tree, + schema_id=SCHEMA, + convention_srn=None, + q=None, + sort="published_at", + order=SortOrder.DESC, + cursor=None, + limit=20, + ) + + async def test_joins_exceeded(self) -> None: + svc = _svc(max_joins=1) + # Simulate catalog advertising multiple hooks with a column named score + svc.read_store.get_feature_catalog.return_value = [ # type: ignore[attr-defined] + type( + "E", + (), + { + "hook_name": "hook_a", + "columns": [ + type("C", (), {"name": "score", "type": "number", "required": True}) + ], + }, + ), + type( + "E", + (), + { + "hook_name": "hook_b", + "columns": [ + type("C", (), {"name": "score", "type": "number", "required": True}) + ], + }, + ), + ] + tree = And( + operands=[ + Predicate( + field=FeatureFieldRef(hook="hook_a", column="score"), + op=FilterOperator.GT, + value=0.0, + ), + Predicate( + field=FeatureFieldRef(hook="hook_b", column="score"), + op=FilterOperator.GT, + value=0.0, + ), + ] + ) + with pytest.raises(ValidationError, match="distinct feature hooks"): + await svc.search_records( + filter_expr=tree, + schema_id=SCHEMA, + convention_srn=None, + q=None, + sort="published_at", + order=SortOrder.DESC, + cursor=None, + limit=20, + ) + + +class TestUnknownField: + async def test_unknown_metadata_field_rejected(self) -> None: + svc = _svc() + with pytest.raises(ValidationError, match="Unknown metadata field"): + await svc.search_records( + filter_expr=_pred("bogus", FilterOperator.EQ, "x"), + schema_id=SCHEMA, + convention_srn=None, + q=None, + sort="published_at", + order=SortOrder.DESC, + cursor=None, + limit=20, + ) diff --git a/server/tests/unit/test_filter_expr_or_not.py b/server/tests/unit/test_filter_expr_or_not.py new file mode 100644 index 0000000..191bc74 --- /dev/null +++ b/server/tests/unit/test_filter_expr_or_not.py @@ -0,0 +1,131 @@ +"""US2 tests: FilterExpr accepts OR/NOT trees and validation walks them correctly.""" + +from unittest.mock import AsyncMock + +import pytest + +from osa.config import Config +from osa.domain.discovery.model.refs import MetadataFieldRef +from osa.domain.discovery.model.value import ( + And, + FilterOperator, + Not, + Or, + Predicate, + SortOrder, +) +from osa.domain.discovery.service.discovery import DiscoveryService +from osa.domain.semantics.model.value import FieldType +from osa.domain.shared.error import ValidationError +from osa.domain.shared.model.srn import SchemaId + + +SCHEMA = SchemaId.parse("bio-sample@1.0.0") + + +def _config() -> Config: + import os + + os.environ.setdefault("OSA_AUTH__JWT__SECRET", "a" * 64) + os.environ.setdefault("OSA_BASE_URL", "http://localhost:8000") + return Config() # type: ignore[call-arg] + + +def _svc() -> DiscoveryService: + read_store = AsyncMock() + read_store.search_records.return_value = 
[] + reader = AsyncMock() + fm = { + "title": FieldType.TEXT, + "resolution": FieldType.NUMBER, + } + reader.get_all_field_types.return_value = fm + reader.get_fields_for_schema.return_value = fm + return DiscoveryService(read_store=read_store, field_reader=reader, config=_config()) + + +def _pred(field: str, op: FilterOperator, value: object) -> Predicate: + return Predicate(field=MetadataFieldRef(field=field), op=op, value=value) + + +class TestOrNot: + async def test_or_tree_accepted(self): + svc = _svc() + tree = Or( + operands=[ + _pred("title", FilterOperator.EQ, "A"), + _pred("title", FilterOperator.EQ, "B"), + ] + ) + await svc.search_records( + filter_expr=tree, + schema_id=SCHEMA, + convention_srn=None, + q=None, + sort="published_at", + order=SortOrder.DESC, + cursor=None, + limit=10, + ) + + async def test_not_tree_accepted(self): + svc = _svc() + tree = Not(operand=_pred("title", FilterOperator.EQ, "X")) + await svc.search_records( + filter_expr=tree, + schema_id=SCHEMA, + convention_srn=None, + q=None, + sort="published_at", + order=SortOrder.DESC, + cursor=None, + limit=10, + ) + + async def test_nested_mixed_tree(self): + svc = _svc() + tree = And( + operands=[ + _pred("title", FilterOperator.EQ, "X"), + Or( + operands=[ + _pred("resolution", FilterOperator.GTE, 3.0), + _pred("resolution", FilterOperator.LT, 1.0), + ] + ), + Not(operand=_pred("title", FilterOperator.EQ, "Bad")), + ] + ) + await svc.search_records( + filter_expr=tree, + schema_id=SCHEMA, + convention_srn=None, + q=None, + sort="published_at", + order=SortOrder.DESC, + cursor=None, + limit=10, + ) + + +class TestCompoundDisabledFlag: + async def test_or_rejected_when_compound_disabled(self): + svc = _svc() + tree = Or( + operands=[ + _pred("title", FilterOperator.EQ, "A"), + _pred("title", FilterOperator.EQ, "B"), + ] + ) + with pytest.raises(ValidationError, match="compound_disabled|Compound"): + await svc.search_records( + filter_expr=tree, + schema_id=SCHEMA, + convention_srn=None, + q=None, + sort="published_at", + order=SortOrder.DESC, + cursor=None, + limit=10, + allow_compound=False, + ) diff --git a/server/tests/unit/test_metadata_column_mapper.py b/server/tests/unit/test_metadata_column_mapper.py new file mode 100644 index 0000000..74e6ff6 --- /dev/null +++ b/server/tests/unit/test_metadata_column_mapper.py @@ -0,0 +1,35 @@ +"""Tests for metadata column mapping — reuses the shared column_mapper.""" + +import sqlalchemy as sa +from sqlalchemy.dialects.postgresql import JSONB + +from osa.domain.shared.model.hook import ColumnDef +from osa.infrastructure.persistence.column_mapper import map_column + + +class TestScalarTypes: + def test_text(self): + col = map_column(ColumnDef(name="title", json_type="string", required=True)) + assert isinstance(col.type, sa.Text) + assert col.nullable is False + + def test_number(self): + col = map_column(ColumnDef(name="resolution", json_type="number", required=True)) + assert isinstance(col.type, sa.Float) + + def test_integer(self): + col = map_column(ColumnDef(name="count", json_type="integer", required=True)) + assert isinstance(col.type, sa.BigInteger) + + def test_boolean(self): + col = map_column(ColumnDef(name="ok", json_type="boolean", required=False)) + assert isinstance(col.type, sa.Boolean) + assert col.nullable is True + + def test_date(self): + col = map_column(ColumnDef(name="d", json_type="string", format="date", required=False)) + assert isinstance(col.type, sa.Date) + + def test_array_jsonb(self): + col = map_column(ColumnDef(name="tags", 
json_type="array", required=False)) + assert isinstance(col.type, JSONB) diff --git a/server/tests/unit/test_metadata_service.py b/server/tests/unit/test_metadata_service.py new file mode 100644 index 0000000..ebc0ed7 --- /dev/null +++ b/server/tests/unit/test_metadata_service.py @@ -0,0 +1,39 @@ +"""MetadataService unit tests — thin delegator over MetadataStore.""" + +from unittest.mock import AsyncMock + +from osa.domain.metadata.service.metadata import MetadataService +from osa.domain.semantics.model.value import Cardinality, FieldDefinition, FieldType +from osa.domain.shared.model.srn import RecordSRN, SchemaId + +SCHEMA = SchemaId.parse("bio-sample@1.0.0") +RECORD = RecordSRN.parse("urn:osa:localhost:rec:abc@1") + + +def _fields() -> list[FieldDefinition]: + return [ + FieldDefinition( + name="species", + type=FieldType.TEXT, + required=True, + cardinality=Cardinality.EXACTLY_ONE, + ) + ] + + +class TestMetadataService: + async def test_ensure_table_delegates(self): + store = AsyncMock() + svc = MetadataService(metadata_store=store) + await svc.ensure_table(schema_id=SCHEMA, fields=_fields()) + store.ensure_table.assert_called_once() + + async def test_insert_delegates(self): + store = AsyncMock() + svc = MetadataService(metadata_store=store) + await svc.insert( + schema_id=SCHEMA, + record_srn=RECORD, + values={"species": "Homo sapiens"}, + ) + store.insert.assert_called_once() diff --git a/server/tests/unit/test_metadata_slug.py b/server/tests/unit/test_metadata_slug.py new file mode 100644 index 0000000..ea0abf1 --- /dev/null +++ b/server/tests/unit/test_metadata_slug.py @@ -0,0 +1,63 @@ +"""Tests for schema_slug() — pg-safe slug derivation from Schema title.""" + +import pytest + +from osa.infrastructure.persistence.metadata_table import ( + PG_IDENT_MAX_LEN, + check_pg_table_name, + schema_slug, +) + + +class TestSchemaSlug: + def test_accepts_simple_title(self): + assert schema_slug("bio_sample") == "bio_sample" + + def test_lowercases_camel_case(self): + assert schema_slug("BioSample") == "biosample" + + def test_replaces_spaces_with_underscore(self): + assert schema_slug("bio sample") == "bio_sample" + + def test_replaces_punctuation_with_underscore(self): + assert schema_slug("bio-sample.v2") == "bio_sample_v2" + + def test_collapses_repeated_non_alnum(self): + assert schema_slug("bio---sample") == "bio_sample" + + def test_strips_leading_and_trailing_underscores(self): + assert schema_slug("__bio_sample__") == "bio_sample" + + def test_is_stable_across_invocations(self): + assert schema_slug("BioSample v1") == schema_slug("BioSample v1") + + def test_rejects_empty_title(self): + with pytest.raises(ValueError): + schema_slug("") + + def test_rejects_title_with_only_punctuation(self): + with pytest.raises(ValueError): + schema_slug("!!!") + + def test_rejects_title_starting_with_digit(self): + with pytest.raises(ValueError): + schema_slug("1bio_sample") + + def test_accepts_max_length_schema_identifier(self): + """SchemaIdentifier allows 64-char ids; slug must not reject them.""" + long_id = "a" + "b" * 63 # 64 chars, matches SchemaIdentifier upper bound + assert schema_slug(long_id) == long_id + + def test_rejects_over_max_length(self): + with pytest.raises(ValueError): + schema_slug("a" + "b" * 64) # 65 chars + + +class TestCheckPgTableName: + def test_accepts_table_name_at_pg_limit(self): + name = "a" * PG_IDENT_MAX_LEN + check_pg_table_name(name) # no raise + + def test_rejects_table_name_over_pg_limit(self): + with pytest.raises(ValueError, match="exceeds PG's"): + 
check_pg_table_name("a" * (PG_IDENT_MAX_LEN + 1)) diff --git a/server/tests/unit/test_record_schema_srn_immutable.py b/server/tests/unit/test_record_schema_srn_immutable.py new file mode 100644 index 0000000..13e5c7b --- /dev/null +++ b/server/tests/unit/test_record_schema_srn_immutable.py @@ -0,0 +1,28 @@ +"""FR-008: Record.schema_id is immutable after construction.""" + +from datetime import UTC, datetime + +import pytest +from pydantic import ValidationError + +from osa.domain.record.model.aggregate import Record +from osa.domain.shared.model.source import DepositionSource +from osa.domain.shared.model.srn import ConventionSRN, RecordSRN, SchemaId + + +def _make_record() -> Record: + return Record( + srn=RecordSRN.parse("urn:osa:localhost:rec:abc@1"), + source=DepositionSource(id="urn:osa:localhost:dep:d1"), + convention_srn=ConventionSRN.parse("urn:osa:localhost:conv:test@1.0.0"), + schema_id=SchemaId.parse("test@1.0.0"), + metadata={"title": "T"}, + published_at=datetime.now(UTC), + ) + + +def test_schema_id_cannot_be_reassigned(): + record = _make_record() + other = SchemaId.parse("other@1.0.0") + with pytest.raises(ValidationError): + record.schema_id = other # type: ignore[misc] diff --git a/server/uv.lock b/server/uv.lock index 79388c5..0e39f48 100644 --- a/server/uv.lock +++ b/server/uv.lock @@ -277,6 +277,7 @@ dependencies = [ { name = "jmespath", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "s3transfer", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ] +sdist = { url = "https://files.pythonhosted.org/packages/74/ec/636ab2aa7ad9e6bf6e297240ac2d44dba63cc6611e2d5038db318436d449/boto3-1.42.74.tar.gz", hash = "sha256:dbacd808cf2a3dadbf35f3dbd8de97b94dc9f78b1ebd439f38f552e0f9753577", size = 112739, upload-time = "2026-03-23T19:34:09.815Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/ad/16/a264b4da2af99f4a12609b93fea941cce5ec41da14b33ed3fef77a910f0c/boto3-1.42.74-py3-none-any.whl", hash = "sha256:4bf89c044d618fe4435af854ab820f09dd43569c0df15d7beb0398f50b9aa970", size = 140557, upload-time = "2026-03-23T19:34:07.084Z" }, ]